// NOTE(review): this snippet carries no opening PHP tag of its own; it is
// assumed to run inside an existing PHP code section — confirm.

// Crawls Baidu result pages for a fixed "site:" query, resolves every Baidu
// redirect link found on them to the URL announced in its "Location:" header,
// and writes the resolved URLs to silian.txt next to this script.

set_time_limit(0); // the crawl performs many sequential HTTP requests
header("Content-type:text/html;charset=utf-8");

$updatePoint = date("Y-m-d");

// Matches a double-quoted Baidu redirect link followed by one whitespace
// character. "|" is the delimiter, "U" makes the quantifiers ungreedy.
$pat = '|\\"http://www.baidu.com/link\?url=?([^>]*)\\"\s|U';

$xmlDatas = '';
for ($i = 0; $i < 76; $i++) {
    // The "pn" query parameter advances in steps of 10
    // (presumably 10 results per page — confirm).
    $page = $i * 10;
    $conts = file_get_contents("http://www.baidu.com/s?wd=site%3Awww.xxxx.com%20%E4%B9%90%E5%A4%A9%E5%A0%82&pn={$page}&oq=site%3Awww.xxxx.com%20%E4%B9%90%E5%A4%A9%E5%A0%82&ie=utf-8&rsv_idx=1&rsv_pq=aff4775f00063733&rsv_t=ff065MbpZuOoe%2B%2BV4iOkvVuzeSXd1n2FRBQwnnwPHtpsy%2F7pPFaTfcrWm4M&f=8&rsv_bp=1&tn=baidu");
    if ($conts === false) {
        // This page could not be fetched; keep what was collected so far.
        continue;
    }
    $xmlDatas .= getLists($pat, $conts, $updatePoint);
}

// file_put_contents() creates the file when it is missing and truncates it
// otherwise, so no separate file_exists()/fopen() branch is needed.
$outFile = dirname(__FILE__) . "/silian.txt";
if (file_put_contents($outFile, $xmlDatas) === false) {
    trigger_error("Unable to write {$outFile}", E_USER_WARNING);
}

/**
 * Extracts the redirect links matched by $pattern from a result page and
 * resolves each distinct link to the target named in its "Location:" header.
 *
 * Returns an empty string when the page has no matches instead of terminating
 * the script, so the caller can still write the URLs gathered from other pages.
 *
 * @param string $pattern     PCRE pattern; its full match must be the quoted
 *                            link followed by exactly one trailing character.
 * @param string $contents    Markup of the fetched result page.
 * @param string $updatePoint Not used by this function; kept so the signature
 *                            stays compatible with existing callers.
 *
 * @return string Resolved URLs, each terminated by "\r\n"; links that yield
 *                no "Location:" header contribute nothing.
 */
function getLists($pattern, $contents, $updatePoint)
{
    // preg_match_all() yields 0 for "no match" and false on a pattern error.
    if (!preg_match_all($pattern, $contents, $matches)) {
        return '';
    }

    $xmlData = '';
    foreach (array_unique($matches[0]) as $value) { // drop duplicate links
        // A full match looks like `"URL"` plus one whitespace character:
        // strip the leading quote, the trailing character, then the closing quote.
        $value = trim($value, '"');
        $value = substr($value, 0, -1);
        $value = trim($value, '"');

        $target = resolveBaiduRedirect($value);
        if ($target !== '') {
            $xmlData .= $target . "\r\n";
        }
    }

    return $xmlData;
}

/**
 * Requests $url over plain HTTP/1.0 on port 80 and returns the value of the
 * last "Location:" response header, without following the redirect.
 *
 * @param string $url Absolute URL to probe.
 *
 * @return string The redirect target, or '' when the URL is unusable, the
 *                connection fails, or no "Location:" header is sent.
 */
function resolveBaiduRedirect($url)
{
    $info = parse_url($url);
    if ($info === false || empty($info['host'])) {
        return '';
    }
    $path = isset($info['path']) ? $info['path'] : '/';
    $query = isset($info['query']) ? '?' . $info['query'] : '';

    $fp = fsockopen($info['host'], 80, $errno, $errstr, 30);
    if ($fp === false) {
        return '';
    }
    stream_set_timeout($fp, 30); // bound reads as well as the connect

    fputs($fp, "GET {$path}{$query} HTTP/1.0" . "\r\n");
    fputs($fp, "Host: {$info['host']}" . "\r\n");
    fputs($fp, "Connection: close" . "\r\n");
    fputs($fp, "\r\n");

    $rewrite = '';
    while (!feof($fp)) {
        // No length limit: a capped read could cut a long Location header short.
        $line = fgets($fp);
        if ($line === false || trim($line) === '') {
            // Read failure/timeout, or the blank line that ends the headers;
            // the response body is never scanned.
            break;
        }
        // Header names are case-insensitive and start at column 0.
        if (stripos($line, 'Location:') === 0) {
            $rewrite = trim(substr($line, strlen('Location:')));
        }
    }
    fclose($fp);

    return $rewrite;
}
此抓取脚本主要用于查询百度已收录的网址,并没有直接按关键词来查询收录情况。
时间: 2024-11-09 02:53:10