Scrapy Crawler Framework (5): CrawlSpider
This post uses CrawlSpider's link extractors to crawl articles from the WeChat Mini Program community (wxapp-union.com). Unlike a basic Spider, a CrawlSpider declares a set of Rule objects that automatically extract and follow links, so you only write parsing code for the pages you actually care about.
Creating the spider file
First cd into the Scrapy project directory, then generate a spider from the crawl template:
scrapy genspider -t crawl <spider_name> <domain>
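For the spider in this post, the concrete command would be the following (the project name wxapp is assumed from the wxapp.items import in the code below):

scrapy genspider -t crawl wxappSpider wxapp-union.com

This produces a CrawlSpider subclass prewired with a rules tuple and the LinkExtractor import, which is then filled in like this: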
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from wxapp.items import WxappItem


class WxappspiderSpider(CrawlSpider):
    name = 'wxappSpider'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # List pages: follow pagination links, but no callback -- they are
        # only used to discover article links.
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # Article pages: parse each one with parse_detail and keep
        # following links found on it.
        Rule(LinkExtractor(allow=r'.+article-.+\.html'),
             callback="parse_detail", follow=True),
    )

    def parse_detail(self, response):
        # Article title from <h1 class="ph">
        title = response.xpath("//h1[@class='ph']/text()").get()
        # Author and publish time both live inside <p class="authors">
        links = response.xpath("//p[@class='authors']")
        author = links.xpath(".//a/text()").get()
        time = links.xpath(".//span[@class='time']//text()").getall()
        # Join all text nodes of the article body into one string
        article = response.xpath("//td[@id='article_content']//text()").getall()
        article = "".join(article).strip()
        item = WxappItem(title=title, author=author, time=time, article=article)
        yield item
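The WxappItem imported above comes from the project's items.py, which the original post does not show. A minimal sketch consistent with the four fields the spider sets might look like this (the field names are taken from the spider; everything else is an assumption):

# wxapp/items.py -- minimal sketch; not shown in the original post.
import scrapy

class WxappItem(scrapy.Item):
    title = scrapy.Field()    # article title
    author = scrapy.Field()   # author name
    time = scrapy.Field()     # publish time (list of text nodes, as extracted)
    article = scrapy.Field()  # joined article body text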
Run results
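To run the spider from the project root and save the scraped items, Scrapy's built-in feed export can be used (the output filename here is just an example):

# -o appends items to the given feed file; the format is inferred from the extension.
scrapy crawl wxappSpider -o articles.json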
Original article: https://www.cnblogs.com/senup/p/12321418.html