纯属初学...有很多需要改进的地方,请多多指点...
目标是抓取58同城 这个大分类下的列表数据: http://cd.58.com/caishui/?PGTID=14397169455980.9244072034489363&ClickID=1
简单分析:
1. 按照以下二级分类来获取每个列表的数据,
2. 注意分页: 可以看出,其分页是由URL中的pn5设置的,那么这个5就是页码了.
http://cd.58.com/dailijizh/pn5/?PGTID=117742907188706554997826849&ClickID=1
3. 电话号码: 是在隐藏的div里面,点击“联系商家”即可看到.但是对于程序来说,是可以直接取得的.
代码如下:
// Crawl the 58.com (Chengdu) listing pages of one sub-category and store the
// raw HTML of every page in MongoDB for later parsing.
var http = require("http");
var cheerio = require("cheerio");
var mongoose = require('mongoose');

// Declared with `var` so it is not leaked as an implicit global.
var db = mongoose.createConnection('mongodb://127.0.0.1:27017/crawl58');
db.on('error', function (error) {
    console.log('mongodb连接错误: ' + error);
});

// Storage schema: one document per fetched listing page.
var mongooseSchema = new mongoose.Schema({
    url: {type: String},                         // URL the page was fetched from
    type: {type: String},                        // category label
    content: {type: String},                     // raw HTML of the fetched page
    updateTime: {type: Date, default: Date.now}, // time the page was fetched
    flag: {type: String, default: '0'}           // '0' = detail pages not crawled yet
});

// Model backing the stored listing pages.
var mongooseModel = db.model('pageList', mongooseSchema);

// Candidate HTTP proxies. They are rotated after every saved page but are not
// used for the actual request yet (see the note inside crawl()).
var proxy = [
    {ip: '120.203.159.14', port: '8118'},
    {ip: '111.161.246.233', port: '8118'},
    {ip: '58.30.233.196', port: '8118'},
    {ip: '113.215.0.130', port: '80'},
    {ip: '183.218.63.179', port: '8181'},
    {ip: '120.198.245.36', port: '8080'},
    {ip: '120.203.158.149', port: '8118'},
    {ip: '124.240.187.89', port: '80'},
    {ip: '218.204.140.105', port: '8118'},
    {ip: '175.1.79.63', port: '80'}
];
var proxyIndex = 5;
var flag = false; // becomes true once the last page has been fetched
var pageNo = 1;   // page number to fetch next

/**
 * Fetch listing page `pageNo`, save its HTML to MongoDB, then schedule the
 * next page. The crawl stops when a fetched page has no "next" link in its
 * pager, or when a request or save error occurs (the error is logged).
 *
 * @returns {boolean|undefined} false when the crawl is already complete,
 *     otherwise undefined (the work continues asynchronously).
 */
function crawl() {
    if (flag) {
        // pageNo was already advanced past the last saved page, so the number
        // of pages actually crawled is pageNo - 1.
        console.log('抓取完毕.总页数为:' + (pageNo - 1));
        // Release the connection so the process can exit.
        db.close();
        return false;
    }
    console.log('正在抓取 页码: ' + pageNo);

    // The URL has to be switched by hand to the next category once one
    // category is finished, e.g.
    // 'http://cd.58.com/dailijizh/pn' + pageNo + '/?PGTID=1007041601886955933022299' + pageNo + '&ClickID=1'
    var url = 'http://cd.58.com/nashuishenbao/pn' + pageNo + '/?PGTID=1007041601886955933022299' + pageNo + '&ClickID=1';
    var type = '纳税申报'; // must be set by hand to match the category in the URL

    // Request options for going through a proxy. NOTE: currently unused, see
    // below. The key must be `headers` (not `header`) for Node to send them.
    // NOTE(review): with 'Accept-Encoding: gzip' the body would arrive
    // compressed and need decompressing before parsing - confirm before
    // enabling the proxy path.
    var option = {
        host: proxy[proxyIndex].ip,
        port: proxy[proxyIndex].port,
        method: 'GET',
        path: url,
        headers: {
            'Host': 'cd.58.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
            'Referer': url,
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cookie': 'userid360_xml=6B337B22E8098342C5F725D4F58495C6; time_create=1442050990222; id58=05dzXVSaeAcJgzn9Crp9Ag==; bdshare_firstime=1419409592050; bangbangguoqi=true; ppqp=1; tj_ershoubiz=true; tj_ershounobiz=true; CNZZDATA30017898=cnzz_eid%3D443859762-1419406677-%26ntime%3D1431055823; ag_fid=WeYSRnDPQwUjsUJF; myfeet_tooltip=end; quanmyy=forfirst; __ag_cm_=1439442804516; bangbigtip2=1; nearby=NOTSHOW; ipcity=cd%7C%u6210%u90FD; sessionid=4019a46c-3b78-45f9-8af1-d5d576171b60; 58home=cd; bangbangid=1080863912864997567; cookieuid1=05dvUVXOs3ZTEwlzHrnMAg==; __autma=253535702.1952421463.1439442813.1439598477.1439610035.5; __autmc=253535702; __autmz=253535702.1439610035.5.2.autmcsr=cd.58.com|autmccn=(referral)|autmcmd=referral|autmcct=/caishui/; final_history=19947936375429%2C20303113064713%2C16884696076038%2C18742095746434%2C22669284355361; ag_fid=WeYSRnDPQwUjsUJF; __utmt_pageTracker=1; city=cd; Hm_lvt_3bb04d7a4ca3846dcc66a99c3e861511=1439452109,1439458833,1439516942,1439598477; Hm_lpvt_3bb04d7a4ca3846dcc66a99c3e861511=1439627751; __utma=253535702.1249887847.1419409519.1439618478.1439625451.38; __utmb=253535702.20.10.1439625451; __utmc=253535702; __utmz=253535702.1439625451.38.15.utmcsr=cd.58.com|utmccn=(referral)|utmcmd=referral|utmcct=/dailijizh/pn2/; new_session=0; init_refer=http%253A%252F%252Fcd.58.com%252Fdailijizh%252Fpn2%252F%253FPGTID%253D198304873188692623092226919; new_uv=41'
        }
    };

    // NOTE: fetching through a proxy with http.request(option, ...) still has
    // an unresolved bug, so the page is requested directly for now.
    http.get(url, function (res) {
        // Decode as UTF-8 in the stream so a multi-byte character split across
        // two chunks is not corrupted by the string concatenation below.
        res.setEncoding('utf8');
        var data = "";
        res.on('data', function (chunk) {
            data += chunk;
        });
        res.on("end", function () {
            // No "next" link inside the pager means this was the last page.
            var $ = cheerio.load(data);
            if ($('a.next', 'div.pager').length < 1) {
                flag = true;
            }
            var item = {
                url: url,
                type: type,
                content: data
            };
            // Persist the raw listing page.
            mongooseModel.create(item, function (error) {
                if (error) {
                    console.log(error);
                } else {
                    console.log('保存成功 页码: ' + pageNo + ' ' + url);
                    // Rotate to the next proxy, wrapping at the end of the list.
                    proxyIndex = (proxyIndex + 1) % proxy.length;
                    pageNo = pageNo + 1;
                    // Waiting a little over 5 seconds between requests avoids
                    // being redirected to the verification page; using proxies
                    // would be the better solution.
                    setTimeout(crawl, 5020);
                }
            });
        });
    }).on("error", function (error) {
        console.log('抓取错误: ' + error.message);
    });
}

// Start crawling.
crawl();
时间: 2024-10-10 17:02:06