问题:在运行scrapy的过程中,如果想主动退出该怎么做?
背景:比如说我只要爬取当日的新闻,那么在遍历的时候,如果出现了超过1条不是当日的新闻,那么就不爬取了,就主动退出爬虫,这个时候该怎么做呢?
IDE:pycharm
版本:python3
框架:scrapy
系统:windows10
代码如下:
# -*- coding: utf-8 -*-
"""Scrapy spider that follows listing pages and keeps entries updated today."""
import logging
import os
import random
import time

import scrapy
from torrentSpider.items.NavigationItem import NavigationItem
from torrentSpider.items.TorrentItem import TorrentItem


class XxxSpider(scrapy.Spider):
    """Crawl listing pages, follow each entry and yield entries updated today.

    Every detail page whose update date differs from the current local date
    increments ``count``. Once ``count`` exceeds ``max_stale_items`` the
    spider asks the engine to close it.
    """

    name = "xxx_spider"
    # NOTE(review): allowed_domains uses the "www." host while web_pre_url
    # does not -- confirm the real hosts agree, otherwise follow-up requests
    # may be treated as off-site.
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/1.html']
    # Site prefix prepended to the relative links extracted from pages.
    web_pre_url = 'http://xxx.com'
    # Number of detail pages seen so far whose update date is not today.
    count = 0
    # Number of stale detail pages tolerated before the spider is closed.
    # The original code hard-coded 1 (its comments mentioned 50 and 10).
    max_stale_items = 1

    def parse(self, response):
        """Parse a listing page.

        Yields one request per entry row (handled by ``parse_links``) and,
        when a pagination link is present, one request for the next listing
        page (handled by ``parse``).
        """
        # Random delay between pages.
        # NOTE(review): time.sleep() is a blocking call inside a callback;
        # Scrapy's DOWNLOAD_DELAY / RANDOMIZE_DOWNLOAD_DELAY settings are the
        # non-blocking way to throttle -- kept as-is to preserve behaviour.
        time.sleep(random.randint(0, 5))

        # Main navigation bar. The first <li> is skipped: indices run from 2.
        # NOTE(review): the items built in the two navigation loops are never
        # yielded; they are kept because the lookups can still raise.
        navigation_type_number = response.xpath(
            '//*[@id="hypoNav"]/div/ul/li/em/a/text()').extract()
        for n_k in range(1, len(navigation_type_number)):
            nav_li = '//*[@id="hypoNav"]/div/ul/li[' + str(n_k + 1) + ']/em/a'
            navigation_item = NavigationItem()
            # Site title.
            navigation_item['navigation_title'] = response.xpath(
                '//*[@id="logoSea"]/div[1]/a/img/@alt').extract()[0]
            # Navigation category name.
            navigation_item['navigation_type'] = response.xpath(
                nav_li + '/text()').extract()[0]
            # Navigation link.
            navigation_item['navigation_url'] = response.xpath(
                nav_li + '/@href').extract()[0]

        # Sub navigation bar.
        sub_navigation_type_number = response.xpath(
            '//*[@id="nodeNav"]/div/ul/li/em/a/span/text()').extract()
        for sub_k in range(1, len(sub_navigation_type_number)):
            sub_li = '//*[@id="nodeNav"]/div/ul/li[' + str(sub_k) + ']/em/a'
            sub_navigation_item = NavigationItem()
            # Site title.
            sub_navigation_item['navigation_title'] = response.xpath(
                '//*[@id="logoSea"]/div[1]/a/img/@alt').extract()[0]
            # Sub navigation category name.
            sub_navigation_item['sub_navigation_type'] = response.xpath(
                sub_li + '/span/text()').extract()[0]
            # Sub navigation link.
            sub_navigation_item['sub_navigation_url'] = response.xpath(
                sub_li + '/@href').extract()[0]

        # Entry rows of the current listing page.
        movie_name_tr_array = response.xpath(
            '/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr').extract()
        for i_k in range(1, len(movie_name_tr_array)):
            # Relative link of the entry's detail page.
            str_sub_url = ('/html/body/div[2]/table[1]/tr/td[1]/table[2]/tbody/tr['
                           + str(i_k) + ']/td[1]/a/@href')
            m_link = self.web_pre_url + response.xpath(str_sub_url).extract()[0]
            yield scrapy.Request(url=m_link, callback=self.parse_links,
                                 dont_filter=True)

        # Next listing page: with several matches the second one is used.
        next_link = response.xpath('//*[@class="pagegbk"]/@href').extract()
        if next_link:
            if len(next_link) == 1:
                next_link = next_link[0]
            else:
                next_link = next_link[1]
            yield scrapy.Request(self.web_pre_url + next_link,
                                 callback=self.parse)

    def parse_links(self, response):
        """Parse a detail page and yield its item when it was updated today.

        Pages with any other update date are counted instead; once the count
        exceeds ``max_stale_items`` the engine is asked to close the spider.
        """
        info = '/html/body/div[2]/table[2]/tbody/tr/td/div[1]'
        torrent_item = TorrentItem()
        # Title.
        torrent_item['torrent_title'] = self.check_xpath_value(
            response, '/html/body/div[2]/table[1]/tbody/tr/td/font/text()')
        # Film name.
        torrent_item['torrent_name'] = self.check_xpath_value(
            response, info + '/font[1]/text()')
        # Director.
        torrent_item['torrent_director'] = self.check_xpath_value(
            response, info + '/font[2]/text()')
        # Cast.
        torrent_item['torrent_actor'] = self.check_xpath_value(
            response, info + '/span/font[2]/text()')
        # Language.
        torrent_item['torrent_language'] = self.check_xpath_value(
            response, info + '/font[3]/text()')
        # Genre.
        torrent_item['torrent_type'] = self.check_xpath_value(
            response, info + '/font[4]/text()')
        # Region.
        torrent_item['torrent_region'] = self.check_xpath_value(
            response, info + '/font[5]/text()')
        # Update time.
        torrent_item['torrent_update_time'] = self.check_xpath_value(
            response, info + '/font[6]/text()')
        # Status.
        torrent_item['torrent_status'] = self.check_xpath_value(
            response, info + '/font[7]/text()')
        # Release date.
        torrent_item['torrent_show_time'] = self.check_xpath_value(
            response, info + '/font[8]/text()')
        # Plot summary.
        torrent_item['torrent_introduction'] = self.check_xpath_value(
            response, '/html/body/div[2]/table[2]/tbody/tr/td/div[2]/text()')
        # Download address.
        torrent_item['torrent_url'] = self.check_xpath_value(
            response,
            '//*[@id="plist"]/table[2]/tbody/tr[2]/td/ul/li/input/@value')

        # Current local date, formatted like the page's update time.
        current_date = time.strftime('%Y-%m-%d', time.localtime())
        print('current_date = %s' % str(current_date))
        print('torrent_update_time = %s' % torrent_item['torrent_update_time'])

        # Only entries updated today are kept; everything else is counted.
        if torrent_item['torrent_update_time'] == str(current_date):
            yield torrent_item
        else:
            self.count = self.count + 1
            if self.count > self.max_stale_items:
                # Requests already handed to the downloader still finish, so
                # the stop is not immediate. The reason text is kept verbatim
                # from the original even though it mentions 10.
                self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')

    @staticmethod
    def check_xpath_value(response, xpath_url):
        """Return the first match of ``xpath_url`` or the string ``"null"``.

        ``"null"`` is returned when nothing matches or when the first match
        is empty after stripping whitespace. The match itself is returned
        unstripped.
        """
        xpath_value = response.xpath(xpath_url).extract()
        if xpath_value and xpath_value[0].strip() != '':
            return xpath_value[0]
        return "null"
注意以上代码中标红的地方:
self.crawler.engine.close_spider(self, '计数超过10,停止爬虫!')
1,此行代码是写在spider文件中的
2,虽然这一行代码会停止爬虫,但是这一行代码的停止并不是立即停止
原因是:当我们不更改爬虫的settings.py文件的时候,默认配置是:
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32
含义就是:Scrapy downloader 并发请求(concurrent requests)的最大值,默认: 16
那么这个时候的问题来了,按照以上的写法,在队列里就已经有十几个请求了,你停止之后,这十几个请求依旧会执行下去,所以并不是立即停止,如果想改变的话,就必须改变此项配置,设为:
CONCURRENT_REQUESTS = 1
具体scrapy爬虫原理请自行百度,并请自行调试,谢谢~
原文地址:https://www.cnblogs.com/huangtao1927/p/10278501.html
时间: 2024-10-29 18:15:15