前提:需要下载:phpQuery/phpQuery.php
链接:http://www.cnblogs.com/wuheng1991/p/5145398.html
1.对于规则的部分
<?php
/*
 * Step 1: collect product URLs from one page of the search listing.
 *
 * Request parameter:
 *   id - listing page number to fetch (default 1). Pages above 46 end the run.
 *
 * For every ".sear_list" entry on the page, the href of the link inside the
 * entry's second <li> is read, its third "/"-separated segment is appended to
 * the site root, and the result is appended as one line to url.txt.
 * The page then emits a script that immediately loads ?id=<id + 1>, so the
 * browser walks through all listing pages on its own.
 */
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;
if ($id > 46) {
    echo "finish!";
    exit;
}

echo "当前 id=" . $id;
echo "<br/>";
$url = "http://www.genetex.com/web/Search/SearchList.aspx?&Category=494&Page=" . $id . "&SpecialGroupID=240";
echo "当前的URL:";
echo $url . "<br/>";

phpQuery::newDocumentFile($url);
$artList = pq(".sear_list");

$baseUrl = "http://www.genetex.com/";
foreach ($artList as $li) {
    // Cast so a missing href becomes '' instead of null.
    $href = trim((string) pq($li)->eq(0)->find("li")->eq(1)->find("a")->attr('href'));

    // Only the third path segment is kept; skip entries whose href does not
    // have one rather than writing a bare site-root URL into url.txt.
    $segments = explode("/", $href);
    if (!isset($segments[2]) || trim($segments[2]) === '') {
        continue;
    }

    // One product URL per line; step 2 reads this file line by line.
    $productUrl = $baseUrl . trim($segments[2]) . "\r\n";
    if (file_put_contents('url.txt', $productUrl, FILE_APPEND) === false) {
        // Stop before the auto-redirect so a write failure is not silently skipped.
        exit('Cannot append to url.txt');
    }
    echo '<br/>';
}
?>
<script>
    // Move on to the next listing page as soon as this one has been processed.
    function JumpUrl() {
        location.href = '?id=<?php echo ($id + 1); ?>';
    }
    setTimeout(JumpUrl, 0);
</script>
2.对于不规则的部分。(通过产品链接,得到产品相关信息)
<?php
/*
 * Step 2: scrape one product detail page per request.
 *
 * Request parameter:
 *   id - zero-based line index into ./url.txt (default 0). Indexes above 453,
 *        or beyond the end of the file, end the run.
 *
 * The product page is loaded with phpQuery, the fields below are extracted,
 * printed for inspection, and appended as one row to file.csv. The page then
 * emits a script that immediately loads ?id=<id + 1>.
 *
 * CSV column order (insertion order of $arrAll):
 *   xing_num, reference, Application_Information, catalog_number,
 *   product_name, applications, position_controls, full_name,
 *   product_description, synonyms, host, clonality, isotype,
 *   species_reactivity, conjugation, researchArea, url
 */
header('Content-Type:text/html;charset=UTF-8');
include './phpQuery/phpQuery.php';
set_time_limit(10000);

$id = isset($_GET['id']) ? intval($_GET['id']) : 0;

$arr = file('./url.txt');
if ($arr === false) {
    exit('Cannot read ./url.txt');
}
// 453 is the original hard-coded last index; the isset() guard additionally
// covers negative ids and files shorter than that.
if ($id > 453 || !isset($arr[$id])) {
    echo "finish!";
    exit;
}
$curr_url = trim($arr[$id]);
unset($arr);

var_dump($curr_url);
echo '<br/>';
echo '<hr/>';

phpQuery::newDocumentFile($curr_url);

// ---------------------------------------------------------------------------
// Small helpers. Each one builds a fresh phpQuery chain from the raw node.
// ---------------------------------------------------------------------------

// HTML of the <td> at ($row, $col) inside $table; '' when the cell is absent.
$cellHtml = function ($table, $row, $col) {
    return (string) pq($table)->eq(0)->find("tr")->eq($row)->find("td")->eq($col)->html();
};

// The first ".recommend" element inside row 0 / column 1 of $table.
$recommendBox = function ($table) {
    return pq($table)->eq(0)->find("tr")->eq(0)->find("td")->eq(1)->find(".recommend")->eq(0);
};

// Remove CRLF line breaks, then trim.
$stripCrlf = function ($text) {
    return trim(str_replace("\r\n", "", $text));
};

// Reduce a ".recommend" sub-block to bare text: no tags, CRLFs, parentheses or spaces.
// NOTE(review): the published listing shows a plain space as the last search
// string; confirm it was not "&nbsp;" before the code was rendered as HTML.
$bareText = function ($html) {
    return str_replace(array("\r\n", "(", ")", " "), "", strip_tags((string) $html));
};

// Scan rows [$firstRow, $firstRow + $rowCount) of $table for a first-column
// cell equal to $label and return the second-column cell of that row.
// When several rows match, the last one wins (same as the original loops).
$findLabelled = function ($table, $label, $firstRow, $rowCount) use ($cellHtml, $stripCrlf) {
    $found = '';
    for ($offset = 0; $offset < $rowCount; $offset++) {
        $row = $firstRow + $offset;
        if ($stripCrlf($cellHtml($table, $row, 0)) === $label) {
            $found = $stripCrlf($cellHtml($table, $row, 1));
        }
    }
    return $found;
};

// Collect the ".table_style_1" nodes so each table can be addressed by
// position and a missing table can be detected up front.
$tables = array();
foreach (pq(".table_style_1") as $node) {
    $tables[] = $node;
}

$arrAll = array();

$app = trim((string) pq(".sear_list_2")->eq(0)->find("span")->eq(1)->html());
echo "Application Information:" . $app;
echo '<hr/>';

// ---------------------------------------------------------------------------
// Table 0: catalog number, product name, applications, rating, references.
// ---------------------------------------------------------------------------
$arrAll['xing_num'] = '';   // star-rating text
$arrAll['reference'] = '';  // reference-count text
$catalogNumber = '';
$productName = '';
$applications = '';

if (isset($tables[0])) {
    $table = $tables[0];

    // Second and third <div> of the ".recommend" box.
    $arrAll['xing_num'] = $bareText($recommendBox($table)->find("div")->eq(1)->html());
    $arrAll['reference'] = $bareText($recommendBox($table)->find("div")->eq(2)->html());

    // The catalog-number cell also contains the ".recommend" box; cut that
    // markup out before stripping the remaining tags.
    $recommendHtml = trim((string) $recommendBox($table)->html());
    $catalogNumber = trim($cellHtml($table, 0, 1));
    $catalogNumber = str_replace($recommendHtml, "", $catalogNumber);
    $catalogNumber = strip_tags($catalogNumber);
    $catalogNumber = str_replace(" ", "", $catalogNumber);
    $catalogNumber = $stripCrlf($catalogNumber);

    // The product name is taken from the first <strong> inside row 1 / column 1.
    $productName = $stripCrlf(
        (string) pq($table)->eq(0)->find("tr")->eq(1)->find("td")->eq(1)->find('strong')->eq(0)->html()
    );

    $applications = $cellHtml($table, 2, 1);
    $applications = str_replace(" ", "", $applications);
    $applications = $stripCrlf($applications);
}

$arrAll['Application_Information'] = $app;
$arrAll['catalog_number'] = $catalogNumber;
$arrAll['product_name'] = $productName;
$arrAll['applications'] = $applications;

// ---------------------------------------------------------------------------
// "#tab1 table" (first one): "Positive Controls" in rows 7..11.
// ---------------------------------------------------------------------------
$arrAll['position_controls'] = "";
foreach (pq("#tab1 table") as $k => $v) {
    if ($k != 0) {
        continue;
    }
    for ($ii = 0; $ii < 5; $ii++) {
        // Label and value are only trimmed here (CRLFs inside are kept).
        if (trim($cellHtml($v, 7 + $ii, 0)) == "Positive Controls") {
            $arrAll['position_controls'] = trim($cellHtml($v, 7 + $ii, 1));
        }
    }
}

// ---------------------------------------------------------------------------
// Table 1: full name, description and the labelled rows. Row positions vary
// between products, so each label is searched for in a small row window.
// ---------------------------------------------------------------------------
$arrAll['full_name'] = '';
$arrAll['product_description'] = '';
$arrAll['synonyms'] = '';
$arrAll['host'] = '';
$arrAll['clonality'] = '';
$arrAll['isotype'] = '';
$arrAll['species_reactivity'] = '';
$arrAll['conjugation'] = '';

if (isset($tables[1])) {
    $table = $tables[1];

    $recommendHtml = trim((string) $recommendBox($table)->html());
    $fullName = trim($cellHtml($table, 0, 1));
    $fullName = str_replace($recommendHtml, "", $fullName);
    $fullName = $stripCrlf(strip_tags($fullName));  // spaces inside the name are kept

    $arrAll['full_name'] = $fullName;
    $arrAll['product_description'] = $stripCrlf($cellHtml($table, 1, 1));

    //                                             label                 first row, rows scanned
    $arrAll['synonyms']           = $findLabelled($table, "Synonyms",           2, 3);
    $arrAll['host']               = $findLabelled($table, "Host",               3, 4);
    $arrAll['clonality']          = $findLabelled($table, "Clonality",          4, 4);
    $arrAll['isotype']            = $findLabelled($table, "Isotype",            5, 4);
    $arrAll['species_reactivity'] = $findLabelled($table, "Species Reactivity", 6, 6);
    $arrAll['conjugation']        = $findLabelled($table, "Conjugation",        7, 6);
}

// ---------------------------------------------------------------------------
// Table 2: research area text from row 4 / column 0.
// ---------------------------------------------------------------------------
$arrAll['researchArea'] = "";
if (isset($tables[2])) {
    $researchArea = strip_tags($cellHtml($tables[2], 4, 0));
    $researchArea = str_replace("More Hide", "", $researchArea);
    $arrAll['researchArea'] = $stripCrlf($researchArea);
}

$arrAll['url'] = $curr_url;

var_dump($arrAll);

// Tab-separated preview of the record (the url column is not part of it).
$previewKeys = array(
    'Application_Information', 'catalog_number', 'product_name',
    'applications', 'position_controls', 'full_name',
    'product_description', 'synonyms', 'host',
    'clonality', 'isotype', 'species_reactivity',
    'conjugation', 'researchArea', 'xing_num',
    'reference',
);
$previewValues = array();
foreach ($previewKeys as $previewKey) {
    $previewValues[] = $arrAll[$previewKey];
}
$html = implode("\t", $previewValues) . "\r\n";
echo $html;
echo '<br/>';

// Append the record to the CSV and release the handle.
$fp = fopen('file.csv', 'a');
if ($fp === false) {
    // Stop before the auto-redirect so the record is not silently lost.
    exit('Cannot open file.csv for appending');
}
fputcsv($fp, $arrAll);
fclose($fp);
?>
<script>
    // Move on to the next product URL as soon as this one has been processed.
    function JumpUrl() {
        location.href = '?id=<?php echo ($id + 1); ?>';
    }
    setTimeout(JumpUrl, 0);
</script>
3.对知识点的补充
a.使用phpQuery采集数据,实际上是通过链接地址获取页面的HTML源码,再将源码解析为DOM文档,然后用类似jQuery的选择器语法从中提取数据。
b.对于规则的部分,采集数据较容易。
c.对于不规则的部分,需要进行判断,较为复杂些,不过如果jQuery学得好,这也不是难事。
d.对函数:fputcsv() 的学习总结。如:
// Append $arrAll (a one-dimensional array) as one CSV row, then close the handle.
$fp = fopen('file.csv', 'a');
fputcsv($fp, $arrAll);
fclose($fp);
其中,$arrAll是一维数组。fputcsv()会将数组的值格式化为一行CSV写入到csv文件中,一次写一行,如果数组为空,则不会写入,一旦数组中的元素不为空,就可以写入。
参看:http://php.net/manual/zh/function.fputcsv.php
时间: 2024-10-10 01:26:07