一个很简单的爬虫，爬取大众点评网上中山大学（中大）周边地点的点评信息。
# -*- coding: utf-8 -*-
"""Crawl Dianping reviews for places around Sun Yat-sen University.

The first five search-result pages for the (URL-encoded) keyword are
fetched; every shop listed on them is visited and its reviews are written
to ``D:\\<shop name>.txt``.

The code avoids Python-2-only syntax so it runs on both Python 2.7 and
Python 3.
"""
import io
import re
import time

import requests

BASE_URL = 'http://www.dianping.com'
# Search-result URL without the trailing page number.  It contains literal
# '%' escapes, so it is extended by concatenation, never by %-formatting.
SEARCH_URL = BASE_URL + '/search/keyword/206/0_%E4%B8%AD%E5%B1%B1%E5%A4%A7%E5%AD%A6/p'

REQUEST_DELAY = 5     # seconds slept around each shop request
REQUEST_TIMEOUT = 10  # seconds before an unresponsive request is abandoned

# Full-length review bodies on a shop page.
LONG_REVIEW_RE = re.compile(r'<p class="desc J-desc">(.*?)</p>', re.DOTALL)
# (star rating, possibly truncated review text) pairs on a shop page.
REVIEW_RE = re.compile(
    r'sml-rank-stars sml-str(.*?)".*?<p class="desc">(.*?)</p>', re.DOTALL)
# (shop name, relative url, star rating) triples on a search-result page.
SHOP_RE = re.compile(
    r'data-hippo-type="shop" title="(.*?)" target="_blank" href="(.*?)"'
    r'.*?sml-rank-stars sml-str(.*?)"',
    re.DOTALL)

# Characters that are not allowed in Windows file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _expand_and_clean(review, long_reviews):
    """Return *review* expanded to its full text and stripped of markup.

    A review ending in ``......`` is a truncated preview: the dots are
    removed and, if one of *long_reviews* contains the remaining text, that
    long review replaces it.  ``<br/>``, ``<br>`` and spaces are then removed.
    """
    if review.endswith(u'......'):
        review = review[:-6]
        for full_text in long_reviews:
            if review in full_text:
                review = full_text
                break
    # NOTE(review): the last literal may have been '&nbsp;' before this code
    # was copied from a web page -- confirm against the original script.
    for junk in (u'<br/>', u'<br>', u' '):
        review = review.replace(junk, u'')
    return review


def placeSplider(name, star, url):
    """Fetch one shop page and save its reviews to ``D:\\<name>.txt``.

    Args:
        name: Shop name; used for the output file name, with characters
            that Windows forbids in file names replaced by ``_``.
        star: Overall star rating of the shop, written as the file header.
        url: Shop URL relative to ``BASE_URL``.

    Returns:
        None.  If the page has no reviews, a notice is printed and no file
        is created.
    """
    time.sleep(REQUEST_DELAY)
    text = requests.get(BASE_URL + url, timeout=REQUEST_TIMEOUT).text
    long_reviews = LONG_REVIEW_RE.findall(text)
    reviews = REVIEW_RE.findall(text)

    # Presumably a first match containing u'人点评' is the shop's own
    # summary block rather than a review -- treated as "no reviews".
    if not reviews or u'人点评' in reviews[0][1]:
        print(u'没有点评\n')
        return

    path = u'D:\\%s.txt' % _UNSAFE_FILENAME_CHARS.sub(u'_', name)
    # io.open encodes on write in both Python 2 and 3, and the context
    # manager closes the file even if a write or print fails.
    with io.open(path, 'w', encoding='utf-8') as out:
        out.write(u'place star %s\n' % star)
        for rating, review in reviews:
            # Skip ads: the first ad entry ends processing of this page.
            if u'<span' in review or u'仅售' in review:
                print('')
                break
            review = _expand_and_clean(review, long_reviews)
            print(u'%s %s' % (rating, review))
            out.write(u'%s\n' % rating)
            out.write(u'%s\n' % review)


for page in range(1, 6):
    listing = requests.get(SEARCH_URL + str(page), timeout=REQUEST_TIMEOUT).text
    for name, url, star in SHOP_RE.findall(listing):
        print(u'%s %s %s' % (name, star, url))
        placeSplider(name, star, url)
        time.sleep(REQUEST_DELAY)
时间: 2024-10-28 15:45:13