```python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import pymysql
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class Images360Pipeline(object):
    def process_item(self, item, spider):
        return item


# MongoDB pipeline
class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()


# MySQL pipeline
class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT')
        )

    def open_spider(self, spider):
        # Keyword arguments are required by pymysql >= 1.0
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password,
                                  database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        # Build the INSERT statement from the item's fields; the table
        # name comes from the item's `table` attribute.
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item


# Download the images
class ImagePipeline(ImagesPipeline):
    # *args/**kwargs keep this compatible with newer Scrapy versions,
    # which also pass an `item` keyword argument.
    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # Save each image under its original file name
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    # If the image download failed, drop the item so it is not saved to
    # the databases. IMAGES_STORE sets the save directory, e.g. './images'.
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        yield Request(item['url'])
```
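The pipelines above read `item.collection`, `item.table`, and `item['url']`, so the item class must define them. Below is a minimal sketch of a matching items.py; the field names (`id`, `url`, `title`, `thumb`) are assumptions based on how the pipelines use the item and should match whatever the spider actually yields:

```python
import scrapy


class ImageItem(scrapy.Item):
    # MongoDB collection name and MySQL table name used by the pipelines
    collection = table = 'images'

    # Assumed fields; they must match the spider's output and the
    # columns of the MySQL table.
    id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    thumb = scrapy.Field()
```

Because MysqlPipeline builds its INSERT statement from `dict(item)`, every field declared here needs a matching column in the `images` table.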
settings.py configuration
```python
# Only part of settings.py is shown. ImagePipeline runs first
# (the lowest priority number executes earliest).
ITEM_PIPELINES = {
    'images360.pipelines.ImagePipeline': 300,
    'images360.pipelines.MongoPipeline': 301,
    'images360.pipelines.MysqlPipeline': 302,
}

MAX_PAGE = 50

MONGO_URL = 'localhost'
MONGO_DB = 'images360'

BOT_NAME = 'images360'

MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
# pymysql expects an integer port, not the string '3306'
MYSQL_PORT = 3306

# Directory where downloaded images are saved
# (the setting name is IMAGES_STORE, not IMAGE_STORE)
IMAGES_STORE = './images'

SPIDER_MODULES = ['images360.spiders']
NEWSPIDER_MODULE = 'images360.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'images360 (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
```
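Note that MysqlPipeline only inserts rows; it does not create the database or table. A one-off setup sketch is shown below, assuming the item fields from the ImageItem sketch above (adjust the column types to your data):

```python
import pymysql

# One-time setup: create the database and table the pipeline writes to.
db = pymysql.connect(host='localhost', user='root',
                     password='123456', port=3306)
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS images360 '
               'DEFAULT CHARACTER SET utf8')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images360.images ('
    'id VARCHAR(255) NOT NULL PRIMARY KEY, '
    'url VARCHAR(255), title VARCHAR(255), thumb VARCHAR(255))'
)
db.close()
```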
Original article: https://www.cnblogs.com/412013cl/p/9098739.html