items.py
import scrapy


class CoserItem(scrapy.Item):
    """Fields collected for a single post page.

    The spider fills ``url``, ``name``, ``info`` and ``image_urls``;
    the download pipeline fills ``images``.
    """

    # Address of the post page the item was scraped from.
    url = scrapy.Field()
    # Text of the post's title heading.
    name = scrapy.Field()
    # Text fragments taken from the post's info block.
    info = scrapy.Field()
    # Source URLs of the images found on the post page.
    image_urls = scrapy.Field()
    # Local file paths of the downloaded images.
    images = scrapy.Field()
spiders/coser.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector

try:
    from scrapy.loader import ItemLoader
except ImportError:
    # Older Scrapy releases expose the loader under scrapy.contrib.
    from scrapy.contrib.loader import ItemLoader

from Cosplay.items import CoserItem


class CoserSpider(scrapy.Spider):
    """Crawl the listed bcy.net pages and emit one CoserItem per post."""

    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173',
    )

    def parse(self, response):
        """Yield a request for every post linked from a start page.

        The hrefs found in the works list are site-relative, so the host
        is prepended before the request is built. Each request is handled
        by :meth:`parse_item`.
        """
        sel = Selector(response)
        hrefs = sel.xpath(
            "//ul[@class='js-articles l-works']"
            "/li[@class='l-work--big']"
            "/article[@class='work work--second-created']"
            "/h2[@class='work__title']/a/@href"
        ).extract()
        for href in hrefs:
            yield scrapy.Request('http://bcy.net%s' % href,
                                 callback=self.parse_item)

    def parse_item(self, response):
        """Build and return a CoserItem from a single post page."""
        loader = ItemLoader(item=CoserItem(), response=response)
        loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
        loader.add_xpath(
            'info',
            "//div[@class='post__info']"
            "/div[@class='post__type post__info-group']/span/text()",
        )

        urls = loader.get_xpath(
            '//img[@class="detail_std detail_clickable"]/@src')
        # Drop the '/w650' segment from each image URL
        # (presumably a resized-variant suffix; verify against the site).
        urls = [url.replace('/w650', '') for url in urls]

        loader.add_value('image_urls', urls)
        loader.add_value('url', response.url)
        return loader.load_item()
pipelines.py
import logging
import os

import requests

from Cosplay import settings

logger = logging.getLogger(__name__)

# Seconds to wait for the image server before giving up on a download.
DOWNLOAD_TIMEOUT = 30


class ImageDownloadPipeline(object):
    """Save every image referenced by an item under IMAGES_STORE/<spider>."""

    def process_item(self, item, spider):
        """Download the item's images and record their local paths.

        Items without an ``image_urls`` field pass through untouched.
        Files already present on disk are not fetched again. A download
        that fails is logged and left out of ``item['images']`` so that a
        later run can retry it.

        Returns the (possibly updated) item.
        """
        if 'image_urls' not in item:
            return item

        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        images = []
        for image_url in item['image_urls']:
            # File name = URL path segments after the host, joined by '_'.
            image_file_name = '_'.join(image_url.split('/')[3:])
            file_path = '%s/%s' % (dir_path, image_file_name)

            if not os.path.exists(file_path):
                try:
                    self._download(image_url, file_path)
                except requests.RequestException as exc:
                    logger.warning('Failed to download %s: %s',
                                   image_url, exc)
                    continue
            images.append(file_path)

        item['images'] = images
        return item

    @staticmethod
    def _download(image_url, file_path):
        """Stream ``image_url`` into ``file_path``.

        The data is written to a temporary sibling file and renamed only
        after the whole body has arrived, so an interrupted or rejected
        download never leaves a file that the exists-check would later
        mistake for a finished one.

        Raises requests.RequestException on connection or HTTP errors.
        """
        tmp_path = file_path + '.part'
        response = requests.get(image_url, stream=True,
                                timeout=DOWNLOAD_TIMEOUT)
        try:
            # Do not store an error page as if it were an image.
            response.raise_for_status()
            with open(tmp_path, 'wb') as handle:
                for block in response.iter_content(1024):
                    if not block:
                        break
                    handle.write(block)
            os.rename(tmp_path, file_path)
        finally:
            response.close()
            # Only present here if the download did not complete.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
settings.py
# Send every scraped item through the image download pipeline.
ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}

# Base directory for downloaded images; the pipeline adds a per-spider
# subdirectory below it.
IMAGES_STORE = '../Images'

DOWNLOAD_DELAY = 0.25  # 250 ms of delay
在项目根目录下新建main.py文件,用于调试
from scrapy import cmdline

# Debug entry point: equivalent to running `scrapy crawl coser` from the
# project root. Guarded so that importing this module does not start a crawl.
if __name__ == '__main__':
    cmdline.execute('scrapy crawl coser'.split())
执行程序
py2 main.py
原文地址:https://www.cnblogs.com/loaderman/p/11890355.html
时间: 2024-11-03 11:52:44