1. spider文件
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
item = DomzItem()
image_urls = hxs.select('//img/@src').extract()
item['image_urls'] = ["http:" + x for x in image_urls]
return item
from scrapy.selector import HtmlXPathSelector
hxs = HtmlXPathSelector(response)
name = "wikipedia"
allowed_domains = ["wikipedia.org"]
start_urls = ["http://en.wikipedia.org/wiki/Pune"]
2. settings 文件
ITEM_PIPELINES = ['scrapy.contrib.pipeline.images.ImagesPipeline']
IMAGES_STORE = '...'
3. item 文件
image_urls = Field()
images = Field()
时间: 2024-10-19 11:06:25