curl是利用URL语法在命令行方式下工作的开源文件传输工具
本文用PHP实现了一个curl批处理(curl_multi)的实例。
代码如下:
header('Content-Type:text/html;charset=utf8');

/**
 * Downloads several URLs concurrently through a single curl multi handle.
 *
 * All easy handles are local to this function, so they are released when it
 * returns; the multi handle is closed explicitly.
 *
 * @param array $urls    URLs to request; array keys are preserved in the result.
 * @param int   $timeout Connect limit and total per-transfer limit, in seconds.
 *
 * @return array Response bodies keyed like $urls; a failed transfer yields ''.
 */
function fetch_pages_concurrently(array $urls, $timeout = 10)
{
    $multi   = curl_multi_init();
    $handles = array();

    foreach ($urls as $key => $url) {
        $easy = curl_init();
        curl_setopt_array($easy, array(
            CURLOPT_URL            => $url,
            CURLOPT_HEADER         => 0,
            CURLOPT_RETURNTRANSFER => 1,
            // Without time limits one stalled host would block the whole batch forever.
            CURLOPT_CONNECTTIMEOUT => $timeout,
            CURLOPT_TIMEOUT        => $timeout,
        ));
        curl_multi_add_handle($multi, $easy);
        $handles[$key] = $easy;
    }

    // Drive every transfer to completion.
    $running = 0;
    do {
        // Old libcurl versions ask to be called again immediately.
        do {
            $status = curl_multi_exec($multi, $running);
        } while ($status === CURLM_CALL_MULTI_PERFORM);

        // Wait for socket activity instead of spinning. curl_multi_select() may
        // return -1 right away (the original author observed it always did), so
        // back off briefly in that case to avoid a 100% CPU busy loop.
        if ($running && $status === CURLM_OK && curl_multi_select($multi, 1.0) === -1) {
            usleep(100000);
        }
    } while ($running && $status === CURLM_OK);

    // Collect the bodies and detach each easy handle from the batch.
    $bodies = array();
    foreach ($handles as $key => $easy) {
        $bodies[$key] = (string) curl_multi_getcontent($easy);
        curl_multi_remove_handle($multi, $easy);
    }
    curl_multi_close($multi);

    return $bodies;
}

/**
 * Converts a text fragment to UTF-8 when it is detected as GB18030 or Big5.
 *
 * GB18030 is listed because it is a superset of GBK/GB2312. Detection is
 * strict and uses an explicit candidate list, so the global mb_detect_order()
 * setting is left untouched.
 *
 * @param string $text Raw bytes taken from a downloaded page.
 *
 * @return string UTF-8 text, or $text unchanged if it already is UTF-8 or the
 *                encoding could not be detected.
 */
function title_to_utf8($text)
{
    $encoding = mb_detect_encoding($text, array('UTF-8', 'GB18030', 'BIG-5'), true);
    if ($encoding === false || $encoding === 'UTF-8') {
        return $text;
    }

    return mb_convert_encoding($text, 'UTF-8', $encoding);
}

/* Step 1: fetch the two start pages and collect the href of every <a> tag. */
$startPages = fetch_pages_concurrently(array(
    'http://www.sina.com.cn',
    'http://www.baidu.com/',
));
$html = implode('', $startPages);

// "i": tag/attribute names may be upper case; "s": an anchor may span lines.
// "<a\s" (rather than "<a.*?") keeps <abbr>, <article>, ... from matching.
$anchors = array();
preg_match_all('/<a\s[^>]*?href\s*=\s*[\'"](.*?)[\'"][^>]*>(.*?)<\/a>/is', $html, $anchors);
$hrefs = isset($anchors[1]) ? $anchors[1] : array();

// Keep only absolute http(s) links, de-duplicated, so that "javascript:",
// "#fragment" and relative hrefs do not use up the 100-page budget.
$links = array();
foreach ($hrefs as $href) {
    $href = trim($href);
    if (preg_match('#^https?://#i', $href)) {
        $links[$href] = true;
    }
}
$links = array_slice(array_keys($links), 0, 100);

/* Step 2: fetch up to 100 linked pages and extract each page's <title>. */
$titles = array();
foreach (fetch_pages_concurrently($links) as $body) {
    $found = array();
    if (preg_match('/<title[^>]*>(.*?)<\/title>/is', $body, $found)) {
        // Pages come in mixed encodings; normalise to the UTF-8 declared in the header.
        $titles[] = title_to_utf8(trim($found[1]));
    }
}

// Print the collected titles.
var_dump($titles);
时间: 2024-10-13 09:54:18