"""CloseSpider is an extension that forces spiders to be closed after certain
conditions are met.

See documentation in docs/topics/extensions.rst
"""

# NOTE(review): this excerpt does not show the module's imports. The code
# below relies on the names `defaultdict`, `reactor`, `signals` and
# `NotConfigured` being in scope -- presumably collections.defaultdict,
# twisted.internet.reactor, scrapy.signals and
# scrapy.exceptions.NotConfigured; confirm against the full module.


class CloseSpider(object):
    """Close a spider once one of several configured limits is reached.

    Four limits are read from the crawler settings:

    * ``CLOSESPIDER_TIMEOUT``    -- delay after which the spider is closed
    * ``CLOSESPIDER_ITEMCOUNT``  -- number of scraped items
    * ``CLOSESPIDER_PAGECOUNT``  -- number of received responses
    * ``CLOSESPIDER_ERRORCOUNT`` -- number of spider errors

    A limit whose setting is falsy (e.g. ``0``) is ignored: no signal handler
    is connected for it.
    """

    def __init__(self, crawler):
        """Read the limits from ``crawler.settings`` and connect the handlers.

        Raises:
            NotConfigured: if every one of the four limits is falsy.
        """
        self.crawler = crawler

        # Limit name -> configured threshold.
        self.close_on = {
            'timeout': crawler.settings.getfloat('CLOSESPIDER_TIMEOUT'),
            'itemcount': crawler.settings.getint('CLOSESPIDER_ITEMCOUNT'),
            'pagecount': crawler.settings.getint('CLOSESPIDER_PAGECOUNT'),
            'errorcount': crawler.settings.getint('CLOSESPIDER_ERRORCOUNT'),
        }

        if not any(self.close_on.values()):
            raise NotConfigured

        # Limit name -> number of events seen so far (missing keys start at 0).
        self.counter = defaultdict(int)

        # Only subscribe to the signals whose limit is actually set.
        if self.close_on.get('errorcount'):
            crawler.signals.connect(self.error_count, signal=signals.spider_error)
        if self.close_on.get('pagecount'):
            crawler.signals.connect(self.page_count, signal=signals.response_received)
        if self.close_on.get('timeout'):
            crawler.signals.connect(self.spider_opened, signal=signals.spider_opened)
        if self.close_on.get('itemcount'):
            crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)
        # Always connected, so a pending timeout call can be cancelled.
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        """Alternate constructor: build the extension from ``crawler``."""
        return cls(crawler)

    def error_count(self, failure, response, spider):
        """Count one spider error; close the spider when the limit is hit."""
        self.counter['errorcount'] += 1
        # Equality (not >=) means close_spider is requested once, on the
        # event that reaches the limit.
        if self.counter['errorcount'] == self.close_on['errorcount']:
            self.crawler.engine.close_spider(spider, 'closespider_errorcount')

    def page_count(self, response, request, spider):
        """Count one received response; close the spider at the limit."""
        self.counter['pagecount'] += 1
        if self.counter['pagecount'] == self.close_on['pagecount']:
            self.crawler.engine.close_spider(spider, 'closespider_pagecount')

    def spider_opened(self, spider):
        """Schedule ``engine.close_spider`` after the configured timeout.

        The delayed call returned by ``reactor.callLater`` is kept in
        ``self.task`` so that ``spider_closed`` can cancel it.
        """
        # NOTE(review): the delay is presumably in seconds (Twisted's
        # callLater convention) -- confirm.
        self.task = reactor.callLater(
            self.close_on['timeout'],
            self.crawler.engine.close_spider,
            spider,
            reason='closespider_timeout',
        )

    def item_scraped(self, item, spider):
        """Count one scraped item; close the spider at the limit."""
        self.counter['itemcount'] += 1
        if self.counter['itemcount'] == self.close_on['itemcount']:
            self.crawler.engine.close_spider(spider, 'closespider_itemcount')

    def spider_closed(self, spider):
        """Cancel the pending timeout call, if one exists and is still active."""
        # `task` only exists when spider_opened ran (i.e. a timeout is set).
        task = getattr(self, 'task', False)
        if task and task.active():
            task.cancel()
1. 上述代码是 Scrapy 中用于关闭爬虫的一个扩展类。从代码中可以看出,它主要实现了 timeout、itemcount、pagecount、errorcount 四种关闭方式,因此可以在 settings 中配置这四种条件,当条件被触发时会自动停止爬虫。
在 settings 中设置:
CLOSESPIDER_TIMEOUT      # 运行指定时间(秒)后退出
CLOSESPIDER_ITEMCOUNT    # 生成了指定数量的 item 后退出
CLOSESPIDER_PAGECOUNT    # 抓取了指定数量的响应后退出
CLOSESPIDER_ERRORCOUNT   # 发生了指定数量的错误后退出
2. 从 CloseSpider 类中可以了解到,停止爬虫使用的是 self.crawler.engine.close_spider() 方法,因此当满足一定条件时,我们也可以自己调用这个方法来停止 Scrapy。
# 在 Spider 文件中
self.crawler.engine.close_spider(self, 错误信息)
# 在 pipelines 或者是 middlewares 文件中
spider.crawler.engine.close_spider(spider, 错误信息)
原文地址:https://www.cnblogs.com/kentlin/p/10819998.html
时间: 2024-11-09 10:48:52