针对淘宝的数据采集,这里写了一个简单的示例。不过最好还是通过淘宝 API 获取信息,这才是比较正规的做法。
<?php
/**
 * Scrapes the title, prices and main image of a single Taobao item page.
 *
 * This is a demonstration that depends on the page's current markup; the
 * patterns below stop matching as soon as that markup changes.
 *
 * @Author Bieanju
 **/
header("Content-Type:text/html; charset=utf-8");

$url = "https://item.taobao.com/item.htm?spm=a217h.7274645.1998424065.10.5kIFLZ&id=43823358756";

$goods = array();
$str = file_get_contents($url);

// Only parse when the fetch actually produced a document; otherwise $goods
// stays empty and the failure branch below is taken.
if ($str !== false && $str !== '') {
    // The fetched bytes are treated as GBK and converted so they match the
    // UTF-8 charset announced in the header above.
    $str = mb_convert_encoding($str, 'UTF-8', 'GBK');

    $goods['title'] = trim(preg_substr('/<h3 class="tb-main-title"[^>]*>/', '/<\/h3>/', $str));
    $goods['market_price'] = trim(preg_substr('/<em class="tb-rmb-num">/', '/<\/em>/', $str));

    preg_match('/<([a-z]+)[^i]*id=\"J_StrPrice\"[^>]*>([^<]*)<\/\\1>/is', $str, $price);
    preg_match('/<img[^>]*id="J_ImgBooth"[^r]*rc="([^"]*)"[^>]*>/', $str, $img);

    // Group 1 of the price pattern is the tag name; the element text is group 2.
    $goods['price'] = !empty($price[2]) ? trim($price[2]) : 0;

    $goods['url'] = 0;
    if (!empty($img[1])) {
        // Add a scheme only when the src is protocol-relative ("//host/...").
        $imageSrc = strpos($img[1], '//') === 0 ? 'http:' . $img[1] : $img[1];
        $goods['url'] = getImage($imageSrc, '', 'upload', array('jpg', 'gif', 'png'), 1);
    }
}

if (!empty($goods['title'])) {
    // Everything printed here came from a remote page, so escape it for HTML.
    // double_encode=false keeps entities already present in the markup intact.
    $safeTitle = htmlspecialchars($goods['title'], ENT_QUOTES, 'UTF-8', false);
    $safePrice = htmlspecialchars($goods['market_price'], ENT_QUOTES, 'UTF-8', false);

    echo "采集成功!<br />商品名称【" . $safeTitle . "】<br />商品价格【" . $safePrice . "】";

    // Claim that the image was saved only when getImage() returned a path.
    if (!empty($goods['url'])) {
        $safePath = htmlspecialchars($goods['url'], ENT_QUOTES, 'UTF-8', false);
        echo "<br />商品图片已成功保存到本地【" . $safePath . "】<br /><img src='" . $safePath . "' />";
    }
} else {
    exit("采集失败,请重试!");
}

/**
 * Returns the text found between two regex-delimited markers.
 *
 * The subject is split on $start; the piece following the first match is
 * then cut at the first match of $end.
 *
 * @param string $start PCRE pattern (with delimiters) marking the beginning.
 * @param string $end   PCRE pattern (with delimiters) marking the end.
 * @param string $str   Text to search.
 * @return string The enclosed text, or '' when $start does not match or a
 *                pattern fails to execute.
 * @Author Bieanju
 **/
function preg_substr($start, $end, $str)
{
    $temp = preg_split($start, $str);
    if ($temp === false || !isset($temp[1])) {
        return '';
    }

    // Only the part before the first end marker is needed.
    $content = preg_split($end, $temp[1], 2);

    return $content === false ? '' : $content[0];
}

/**
 * Downloads a remote image into a dated sub-directory of $dirName.
 *
 * $filename no longer carries a default value: it sits before required
 * parameters, so callers always had to pass it anyway.
 *
 * @param string $url      Absolute URL of the image.
 * @param string $filename Target file name; pass '' to have one generated
 *                         from the current time plus a random digit.
 * @param string $dirName  Base directory; "/Y/m/d/" is appended to it.
 * @param array  $fileType Allowed extensions without the dot, e.g. array('jpg', 'png').
 * @param int    $type     Truthy to download with cURL, falsy to use the
 *                         stream wrapper.
 * @return string|false Path of the written file, or false when the URL is
 *                      empty, the extension is not allowed, the download
 *                      fails or the file cannot be written.
 * @Author Bieanju
 **/
function getImage($url, $filename, $dirName, $fileType, $type = 0)
{
    if ($url == '') {
        return false;
    }

    // Read the extension from the URL path alone so that a query string or
    // fragment cannot be mistaken for it; compare case-insensitively.
    $path = parse_url($url, PHP_URL_PATH);
    if (!is_string($path) || $path === '') {
        return false;
    }
    $suffix = strtolower(pathinfo($path, PATHINFO_EXTENSION));
    if (!in_array($suffix, array_map('strtolower', $fileType), true)) {
        return false;
    }

    if ($filename == '') {
        $filename = time() . rand(0, 9) . '.' . $suffix;
    } else {
        // Honour the caller's name, but drop any directory components so the
        // file cannot land outside the dated directory.
        $filename = basename($filename);
    }

    if ($type) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
        $file = curl_exec($ch);
        curl_close($ch);
    } else {
        // Returns false on failure instead of letting warning text end up in
        // the captured bytes.
        $file = file_get_contents($url);
    }
    if ($file === false || $file === '') {
        return false;
    }

    // A single date() call keeps year, month and day consistent at midnight.
    $dirName = $dirName . '/' . date('Y/m/d') . '/';
    if (!is_dir($dirName) && !mkdir($dirName, 0777, true) && !is_dir($dirName)) {
        return false;
    }

    // Overwrite rather than append: appending to an existing file would
    // produce a corrupt image.
    if (file_put_contents($dirName . $filename, $file) === false) {
        return false;
    }

    return $dirName . $filename;
}
?>
时间: 2024-10-11 17:22:19