爬虫py文件
# -*- coding: utf-8 -*-
import scrapy
from ..items import RtysItem
class RtSpider(scrapy.Spider):
    """Spider that scrapes the entries listed on the woyaogexing.com avatar page.

    For every entry block found on the start page it yields one ``RtysItem``
    holding the link text, the image URL and the link target.
    """

    # Spider name; used when starting the crawl.
    name = 'rt'
    # allowed_domains = ['www.baidu.com']  # crawl scope; may stay commented out
    # Scrapy requests these URLs automatically when the spider starts.
    start_urls = ['https://www.woyaogexing.com/touxiang/']

    def parse(self, response):
        """Turn the downloaded listing page into items.

        Args:
            response: The response for a URL from ``start_urls``.

        Yields:
            RtysItem: One item per entry block. Each field holds the first
            match of its XPath, or ``None`` when nothing matches.
        """
        entries = response.xpath('//div[@class="list-left z"]/div[2]/div')
        for entry in entries:
            # Keyword names must match the fields declared on RtysItem.
            yield RtysItem(
                name=entry.xpath('./a/text()').extract_first(),
                img_url=entry.xpath('./a/img/@src').extract_first(),
                lianjie_url=entry.xpath('./a/@href').extract_first(),
            )
pipelines.py 文件
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
class RtysPipeline(object):
    """Item pipeline that stores every scraped item as a MongoDB document.

    Documents go to the ``rt`` collection of the ``rtys`` database on
    ``localhost:27017``. MongoDB creates the database and collection on the
    first insert if they do not exist yet.

    A single client is opened when the spider starts and closed when it
    finishes, instead of opening a new, never-closed connection per item.
    """

    MONGO_HOST = 'localhost'
    MONGO_PORT = 27017
    MONGO_DB = 'rtys'
    MONGO_COLLECTION = 'rt'

    def __init__(self):
        self.client = None
        self.collection = None

    def _connect(self):
        """Open the MongoDB client and cache the target collection."""
        self.client = pymongo.MongoClient(self.MONGO_HOST, self.MONGO_PORT)
        self.collection = self.client[self.MONGO_DB][self.MONGO_COLLECTION]

    def open_spider(self, spider):
        """Connect once when the spider is opened."""
        self._connect()

    def close_spider(self, spider):
        """Release the MongoDB connection when the spider is closed."""
        if self.client is not None:
            self.client.close()
            self.client = None
            self.collection = None

    def process_item(self, item, spider):
        """Insert ``item`` into MongoDB and pass it on unchanged.

        Args:
            item: The scraped item; it is converted to a plain ``dict``
                before insertion.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        # Compare with None explicitly: pymongo collections do not support
        # truth-value testing. The lazy connect keeps direct calls working
        # even when open_spider() was never invoked.
        if self.collection is None:
            self._connect()
        self.collection.insert_one(dict(item))
        return item
存入 MongoDB 时要注意 settings.py 的配置:需要取消 ITEM_PIPELINES 的注释以启用管道。
settings.py文件
# -*- coding: utf-8 -*-
# Scrapy settings for rtys project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'rtys'

# Module where new spiders are created, and the list of modules that hold spiders.
NEWSPIDER_MODULE = 'rtys.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/77.0.3865.90 Safari/537.36'
)

# Obey robots.txt rules
# False: the crawl is not restricted by robots.txt; True: it is restricted.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'rtys.middlewares.RtysSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
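# NOTE: to change the outgoing IP (e.g. route requests through a proxy), uncomment DOWNLOADER_MIDDLEWARES below to enable the downloader middleware.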
#DOWNLOADER_MIDDLEWARES = {
# 'rtys.middlewares.RtysDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# This setting must be uncommented (enabled) for the pipeline to run.
ITEM_PIPELINES = {'rtys.pipelines.RtysPipeline': 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class RtysItem(scrapy.Item):
    """Item holding the values scraped for one entry.

    Declare one ``scrapy.Field`` per value to scrape; the field names are
    the keys used when the item is populated.
    """

    # Text of the entry's link.
    name = scrapy.Field()
    # ``src`` of the entry's image.
    img_url = scrapy.Field()
    # ``href`` of the entry's link.
    lianjie_url = scrapy.Field()
原文地址:https://www.cnblogs.com/pp8080/p/12191213.html
时间: 2024-10-09 21:12:27