# File: extensions.py
# -*- coding: utf-8 -*-
"""Scrapy extension that logs crawl lifecycle and health events.

A log record is emitted when:

* a spider is opened;
* a spider is closed;
* a configurable number of items has been scraped (or dropped);
* the share of error responses received in the current minute exceeds a
  threshold.
"""
import logging
from collections import defaultdict
from datetime import datetime

from scrapy import signals
from scrapy.exceptions import NotConfigured

logger = logging.getLogger(__name__)


class SpiderOpenCloseLogging(object):
    """Log spider open/close, item progress and per-minute response health.

    The extension is only instantiated when the ``MYEXT_ENABLED`` setting is
    true. The progress interval is read from ``MYEXT_ITEMCOUNT`` (default
    1000).
    """

    # HTTP status codes that are counted as error responses.
    ERROR_STATUSES = frozenset({401, 403, 404, 500, 501, 502})
    # A warning is logged while errors exceed this fraction of the responses
    # received within the current minute.
    ERROR_RATIO_THRESHOLD = 0.2

    def __init__(self, item_count):
        """Initialise the counters.

        Args:
            item_count: Log progress every ``item_count`` scraped (or
                dropped) items. ``0`` disables the progress logs.
        """
        self.item_count = item_count
        self.items_scraped = 0
        self.items_dropped = 0
        # Per-minute counters keyed by a "%Y%m%d%H%M" timestamp; missing
        # keys default to 0.
        self.stats = defaultdict(int)      # all responses
        self.err_stats = defaultdict(int)  # error responses only
        logger.info("%s %s", "==" * 20, "Extension object created 扩展对象被创建")

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from the crawler settings and hook up signals.

        Args:
            crawler: The crawler providing ``settings`` and ``signals``.

        Returns:
            The configured extension instance.

        Raises:
            NotConfigured: If ``MYEXT_ENABLED`` is not true in the settings,
                in which case no extension object is created.
        """
        # Key point: bail out before creating the object unless the project
        # settings explicitly set MYEXT_ENABLED = True.
        if not crawler.settings.getbool("MYEXT_ENABLED"):
            raise NotConfigured

        # Progress interval; defaults to one log record per 1000 items and
        # can be overridden with MYEXT_ITEMCOUNT in the settings.
        item_count = crawler.settings.getint("MYEXT_ITEMCOUNT", 1000)

        ext = cls(item_count)

        # Bind each handler to the signal of the same name.
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
        # item_scraped is the main signal: it fires after an item has passed
        # through every pipeline without being dropped.
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        # item_dropped fires when a pipeline drops an item.
        crawler.signals.connect(ext.item_dropped, signal=signals.item_dropped)
        crawler.signals.connect(
            ext.response_received, signal=signals.response_received
        )

        return ext

    def spider_opened(self, spider):
        """Log that ``spider`` has been opened."""
        logger.info("opened spider %s", spider.name)

    def spider_closed(self, spider):
        """Log that ``spider`` has been closed."""
        logger.info("closed spider %s", spider.name)

    def item_scraped(self, item, spider):
        """Count a scraped item and log every ``item_count`` items."""
        self.items_scraped += 1
        # The truthiness guard avoids ZeroDivisionError when item_count is 0.
        if self.item_count and self.items_scraped % self.item_count == 0:
            logger.info("scraped %d items", self.items_scraped)

    def item_dropped(self, item, spider, response, exception):
        """Count a dropped item and log every ``item_count`` drops."""
        self.items_dropped += 1
        # The truthiness guard avoids ZeroDivisionError when item_count is 0.
        if self.item_count and self.items_dropped % self.item_count == 0:
            logger.info("dropped %d items", self.items_dropped)

    def response_received(self, response, request, spider):
        """Track crawl health: per-minute totals of responses and errors.

        Logs a warning whenever the error responses recorded for the current
        minute exceed ``ERROR_RATIO_THRESHOLD`` of that minute's responses.
        """
        minute = datetime.now().strftime("%Y%m%d%H%M")
        self.stats[minute] += 1
        if response.status in self.ERROR_STATUSES:
            self.err_stats[minute] += 1

        total = self.stats[minute]  # always >= 1 at this point
        errors = self.err_stats[minute]
        if errors / total > self.ERROR_RATIO_THRESHOLD:
            # In a typical production deployment warnings trigger an e-mail
            # and errors an SMS; WARNING ranks below ERROR but above INFO.
            logger.warning(
                "received %s response and %s of them is not 200,%s",
                total,
                errors,
                minute,
            )
# Configuration in settings.py
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
MYEXT_ENABLED = True  # turn the custom extension on
MYEXT_ITEMCOUNT = 10  # print / log once every 10 scraped items
EXTENSIONS = {
    # 'scrapy.extensions.telnet.TelnetConsole': None,
    'qianmu.extensions.SpiderOpenCloseLogging': 1,
}
原文地址:https://www.cnblogs.com/kenD/p/12248037.html
时间: 2024-10-29 23:18:36