1.首先创建爬虫项目
2.进入爬虫
class TaobaoSpider(scrapy.Spider):
    """Spider that searches Taobao for laptops; the actual page download is
    delegated to a Selenium-driven browser via the download middleware below."""

    name = 'taobao'
    allowed_domains = ['taobao.com']
    # 拿一个笔记本键盘做示例 (a laptop search is used as the example)
    start_urls = ['https://s.taobao.com/search?initiative_id=tbindexz_20170306&ie=utf8&spm=a21bo.2017.201856-taobao-item.2&sourceId=tb.index&search_type=item&ssid=s5-e&commend=all&imgfile=&q=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7%94%B5%E8%84%91&suggest=0_1&_input_charset=utf-8&wq=%E7%AC%94%E8%AE%B0%E6%9C%AC&suggest_query=%E7%AC%94%E8%AE%B0%E6%9C%AC&source=suggest']

    def __init__(self):
        """Start the headless browser the middleware reuses for every request."""
        super(TaobaoSpider, self).__init__()
        # PhantomJS is headless; Firefox() or Chrome() would also work.
        # NOTE(review): PhantomJS support was removed from recent Selenium
        # releases — prefer headless Chrome/Firefox going forward.
        self.driver = webdriver.PhantomJS()

    def parse(self, response):
        """Extract the title and price of each item on the rendered page."""
        div_info = response.xpath('//div[@class="info-cont"]')
        for div in div_info:
            title = div.xpath('div[@class="title-row"]/a/text()').extract_first('')
            price = div.xpath('div[contains(@class, "sale-row")]/div/span[contains(@class, "price")]/strong/text()').extract_first('')
            print('名称:', title, '价格:', price)

    def closed(self, reason):
        """Called by Scrapy when the spider closes; also quit the browser."""
        print('爬虫关闭了, 原因:', reason)
        self.driver.quit()

写到这,爬虫类函数写完了,然后需要去设置middlewares中间件
import time
from selenium import webdriver
from scrapy.http.response.html import HtmlResponse
from scrapy.http.response import Response

需要这几个模块,然后重写 DownloadMiddleware 这个类
class SeleniumRequestDownloadMiddleWare(object):
    """Download middleware that renders Taobao pages with a real browser,
    so JavaScript-loaded content is present in the response body."""

    def __init__(self):
        super(SeleniumRequestDownloadMiddleWare, self).__init__()
        # Own browser instance; note process_request below actually reuses
        # the spider's driver, matching the original article's behavior.
        self.driver = webdriver.PhantomJS()

    def process_request(self, request, spider):
        """Fetch the URL with Selenium for the taobao spider only.

        Returning a Response short-circuits Scrapy's default downloader;
        returning None lets every other spider download normally.
        """
        if spider.name == 'taobao':
            spider.driver.get(request.url)
            # 设置滚动条,往下拉页面获取源码 — scroll down in steps so that
            # lazily loaded items are rendered before we grab the source.
            for step in range(1, 11, 2):
                fraction = float(step) / 10
                js = "document.body.scrollTop=document.body.scrollHeight * %f" % fraction
                spider.driver.execute_script(js)
                time.sleep(1)  # 需要设置等待时间1秒,不然加载缓慢的话,不出数据
            # HtmlResponse handles text encoding; passing the page source
            # through bytes() without an encoding breaks on Python 3.
            return HtmlResponse(url=request.url,
                                body=spider.driver.page_source,
                                encoding='utf-8',
                                request=request)
        # Any other spider falls through to the default downloader.
        return None
原文地址:https://www.cnblogs.com/star-god/p/8379790.html
时间: 2024-10-28 11:38:15