<?php
/**
* @desc:多线程爬虫类
* @author [Lee] <[<[email protected]>]>
* @property
* 1、calltrigger 触发爬虫程序的回调函数
* 2、calltodo 处理业务逻辑的回调函数 如:把抓取到的内容处理后存到数据库
* 3、timeout 超时时间,默认5秒
* 4、depth 重定向深度,默认3
* 5、name 上传文件的名字,默认file
* 6、cookie 模拟登录时cookie存储在本地的文件,默认cookie_n.txt
* @method
* 1、ssl 是否设置https true:是 false:否
* 2、auth 启用验证 user:用户名 pass:密码
* 3、login 模拟登录,获取cookie
* 4、cookie 使用cookie登录
* 5、header 设置请求头 data:请求头数组
* 6、proxy 设置服务器代理 url:代理服务器url port:代理服务器端口
* 7、agent 设置浏览器代理 browse:代理浏览器 默认:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)
* 8、get 模拟get请求 data:传递的数据
* 9、post 模拟post请求 data:传递的数据
* 10、json 模拟json请求 data:传递的数据
* 11、upload 模拟表单上传 files:上传的文件 array|string
* 12、download 下载文件 dir:要下载的文件 格式:a/b
* 13、run 执行 depth:深度
*/
class crawl{
public $calltrigger = ‘trigger‘; # 触发爬虫程序的回调函数
public $calltodo = ‘todo‘; # 处理业务逻辑的回调函数
public $timeout = 5; # 超时时间,默认5秒
public $depth = 3; # 重定向深度,默认3
public $name = ‘file‘; # 上传文件的名字,默认file
public $cookie = ‘cookie.txt‘; # 模拟登录时cookie存储在本地的文件,默认cookie_n
private $schemes = array();
private $hosts = array();
private $paths = array();
private $querys = array();
private $options = array();
private $chs;
private $fps;
private $handle;
private $urls = array();
/*
@desc:内部方法,获取页面中的超链接
@param content 页面内容
@return urls 获取到的超链接
*/
private function geturl($content){
$preg = ‘/<[a|A].*?href=[\‘\"]{0,1}([^>\‘\"\ ]*).*?>/‘;
$bool = preg_match_all($preg,$content,$res);
$urls = array();
if($bool){
$urls = $res[1];
}
$urls = array_unique($urls);
return $urls;
}
/*
@desc:内部方法,修复不完整的url
@param url 原始url
@param url 修复好的url
*/
private function reviseurl($url){
$info = parse_url($url);
$scheme = $info["scheme"]?:‘http‘;
$user = $info["user"];
$pass = $info["pass"];
$host = $info["host"];
$port = $info["port"];
$path = $info["path"];
$url = $scheme . ‘://‘;
if ($user && $pass) {
$url .= $user . ":" . $pass . "@";
}
$url .= $host;
if ($port) {
$url .= ":" . $port;
}
$url .= $path;
return $url;
}
/*
@desc:内部方法,调用回调函数进行业务处理
@param content 传入到回调函数的参数
*/
private function todo($content){
$calltodo = $this->calltodo;
call_user_func($calltodo,$content);
}
/*
@desc:触发爬虫程序的回调函数
@param urls 待处理的url数组
@param depth 处理深度
*/
private function trigger($urls,$depth){
$calltrigger = $this->calltrigger;
call_user_func($calltrigger,$urls,$depth);
}
/*
@desc:内部方法 设置get请求参数
@param data 请求数据
*/
private function setget($data){
$schemes = $this->schemes;
$hosts = $this->hosts;
$paths = $this->paths;
$querys = $this->querys;
foreach($this->chs as $k=>$v){
$sep = ($querys[$k] || !empty($data))?"?":"";
$qurl = $schemes[$k].‘://‘.$hosts[$k].$paths[$k].$sep.$querys[$k].$data;
$this->options[$k][CURLOPT_URL] = $qurl;
}
return $this;
}
/*
@desc:内部方法 设置post请求参数
@param data 请求数据
*/
private function setpost($data){
$schemes = $this->schemes;
$hosts = $this->hosts;
$paths = $this->paths;
$querys = $this->querys;
foreach($this->chs as $k=>$v){
$sep = $query?"?":"";
$qurl = $schemes[$k].‘://‘.$hosts[$k].$paths[$k].$sep.$querys[$k];
$this->options[$k][CURLOPT_URL] = $qurl;
$this->options[$k][CURLOPT_POST] = 1;
$this->options[$k][CURLOPT_POSTFIELDS] = $data;
}
return $this;
}
/*
@desc:内部方法 设置最终请求参数
*/
private function setopt(){
$options = $this->options;
foreach($options as $k=>$v){
curl_setopt_array(
$this->chs[$k],
$v
);
}
return $this;
}
/*
@desc:构造方法 设置初始请求参数
@param urls 请求地址数组
*/
public function __construct($urls){
$this->urls = $urls;
$this->handle = curl_multi_init();
foreach($urls as $k=>$v){
$info = parse_url($v);
$this->schemes[$k] = $info[‘scheme‘]?:‘http‘;
$this->hosts[$k] = $info[‘host‘];
$this->paths[$k] = $info[‘path‘];
$this->querys[$k] = $info[‘query‘];
$this->chs[$k] = curl_init();
$this->options[$k][CURLOPT_CONNECTTIMEOUT] = $this->timeout;
$this->options[$k][CURLOPT_RETURNTRANSFER] = 1;
$this->options[$k][CURLOPT_FOLLOWLOCATION] = 1;
$this->options[$k][CURLINFO_HEADER_OUT] = true;
$this->options[$k][CURLOPT_ENCODING] = ‘gzip‘;
$this->options[$k][CURLOPT_MAXREDIRS] = $this->depth;
curl_multi_add_handle ($this->handle,$this->chs[$k]);
}
}
/*
@desc:是否设置https请求
@param bool true:https请求 false:http请求
*/
public function ssl($bool = false){
if($bool){
foreach($this->chs as $k=>$v){
$this->scheme[$k] = ‘https‘;
$this->options[$k][CURLOPT_SSL_VERIFYHOST] = 1;
$this->options[$k][CURLOPT_SSL_VERIFYPEER] = false;
}
}
return $this;
}
/*
@desc:设置验证用户名、密码
@param user 用户名
@param pass 密码
*/
public function auth($user,$pass){
foreach($this->chs as $k=>$v){
$this->options[$k][CURLOPT_USERPWD] = $user.‘:‘.$pass;
}
return $this;
}
/*
@desc:模拟登录
*/
public function login(){
$cookie = $this->cookie;
$arr = explode(‘.‘,$cookie);
$name = $arr[0];
$ext = $arr[1];
foreach($this->chs as $k=>$v){
$this->options[$k][CURLOPT_COOKIEJAR] = $name.‘_‘.$k.‘.‘.$ext;
$this->options[$k][CURLOPT_RETURNTRANSFER] = 0;
}
return $this;
}
/*
@desc:带cookie登录
*/
public function cookie(){
$cookie = $this->cookie;
$arr = explode(‘.‘,$cookie);
$name = $arr[0];
$ext = $arr[1];
foreach($this->chs as $k=>$v){
$this->options[$k][CURLOPT_COOKIEFILE] = $name.‘_‘.$k.‘.‘.$ext;
}
return $this;
}
/*
@desc:设置请求头信息
@param data 请求头
*/
public function header($data){
foreach($this->chs as $k=>$v){
$this->options[$k][CURLOPT_HTTPHEADER] = $this->options[$k][CURLOPT_HTTPHEADER]?:array();
$this->options[$k][CURLOPT_HTTPHEADER] = array_merge($this->options[$k][CURLOPT_HTTPHEADER],$data);
}
return $this;
}
/*
@desc:设置代理服务器
@param url 代理服务器url
@param port 代理服务器端口
*/
public function proxy($url,$port){
$info = parse_url($url);
$scheme = $info[‘scheme‘]?:‘http‘;
$host = $info[‘host‘];
$path = $info[‘path‘];
$purl = $scheme.‘://‘.$host.$path.‘:‘.$port;
foreach($this->chs as $k=>$v){
$this->options[$k][CURLOPT_PROXY] = $purl;
}
return $this;
}
/*
@desc:设置代理浏览器
@param browse 代理浏览器
*/
public function agent($browse = ‘Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)‘){
foreach($this->chs as $k=>$v){
$this->options[$k][CURLOPT_USERAGENT] = $browse;
}
return $this;
}
/*
@desc:模拟get请求
@param data 请求数据
*/
public function get($data = array()){
$data = http_build_query($data);
$this->setget($data);
return $this;
}
/*
@desc:模拟post请求
@param data 请求数据
*/
public function post($data = array()){
$this->setpost($data);
return $this;
}
/*
@desc:模拟json请求
@param data 请求数据
*/
public function json($data = array()){
$data = json_encode($data);
$header = array(
‘Content-Type: application/json‘,
‘Content-Length:‘ . strlen($data)
);
$this->header($header);
$this->setpost($data);
return $this;
}
/*
@desc:模拟表单上传
@param files 文件路径
*/
public function upload($files){
$data = array();
$name = $this->name;
if(is_array($files)){
foreach($files as $k=>$v){
$data["{$name}[{$k}]"]=new CURLFile($v);
}
}else{
$data["{$name}"]=new CURLFile($files);
}
$this->setpost($data);
return $this;
}
/*
@desc:下载文件
@param dir 存储文件目录
*/
public function download($dir = ‘‘){
$paths = $this->paths;
if($dir && !is_dir($dir)){
mkdir($dir,0755,true);
}
foreach($this->paths as $k=>$v){
$name = strrchr($v, ‘/‘);
$dsep = $dir?‘/‘:‘‘;
$this->fps[$k]=fopen(‘.‘.$dsep.$dir.$name, ‘w‘);
$this->options[$k][CURLOPT_FILE] = $this->fps[$k];
}
$this->setget(‘‘);
return $this;
}
/*
@desc:执行方法
@param depth 深度 默认2
*/
public function run($depth = 2){
$this->setopt();
$chs = $this->chs;
$handle = $this->handle;
$urls = $this->urls;
if($depth > 0){
$depth--;
$active = null;
$mrc = curl_multi_exec($handle, $active);
while ($mrc == CURLM_CALL_MULTI_PERFORM) {
$mrc = curl_multi_exec($handle, $active);
}
while ($active && $mrc == CURLM_OK) {
if (curl_multi_select($handle) != -1) {
usleep(100);
}
$mrc = curl_multi_exec($handle, $active);
while ($mrc == CURLM_CALL_MULTI_PERFORM) {
$mrc = curl_multi_exec($handle, $active);
}
}
foreach ($chs as $k => $v) {
if (curl_error($chs[$k]) == "") {
$content = curl_multi_getcontent($chs[$k]);
$this->todo($content);
$aurls = $this->geturl($content);
$urls[$k] = $this->reviseurl($urls[$k]);
if (is_array($aurls) && !empty($aurls)) {
foreach ($aurls as $k1=>$u) {
if (preg_match(‘/^http/‘, $u)) {
$returl[$k1] = $u;
} else {
$real = $urls[$k] . ‘/‘ . $u;
$returl[$k1] = $real;
}
}
$this->trigger($returl,$depth);
}
}
curl_multi_remove_handle($handle, $chs[$k]);
curl_close($chs[$k]);
}
curl_multi_close($handle);
}
}
}
function todo($content){
echo ‘ok‘.PHP_EOL;
}
$urls=array(
‘www.baidu.com‘,
‘www.taobao.com‘
);
function trigger($urls = array(),$depth = 2){
$crawl = new crawl($urls);
$crawl->get()->run($depth);
}
trigger($urls);
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
ok
原文地址:http://blog.51cto.com/12173069/2125666
时间: 2024-10-12 19:19:55