定义爬取数据(items.py)
import scrapy
class LianjianItem(scrapy.Item):
    """Container for the listing data scraped from one Lianjia result page.

    The spider in this project assigns the result of ``.extract()`` to
    every field, so each field holds a list of strings (one entry per
    matched node on the page) rather than a single value.
    """

    # Text of the links inside ``div.resblock-name``.
    name = scrapy.Field()
    # Text of the links inside ``div.resblock-location``.
    address = scrapy.Field()
    # Text of the spans inside ``a.resblock-room``.
    type = scrapy.Field()
    # Text of the spans inside ``div.resblock-area``.
    size = scrapy.Field()
    # Text of ``div.second`` inside ``div.resblock-price``.
    price = scrapy.Field()
编写爬虫文件
# -*- coding: utf-8 -*-
import scrapy
from lianjian.items import LianjianItem
from scrapy.http import Request
class LianjiaspiderSpider(scrapy.Spider):
    """Spider for the ``yuhang-xiaoshan-binjiang`` listing pages on
    hz.fang.lianjia.com.

    Result pages ``pg1`` .. ``pg51`` are crawled and one ``LianjianItem``
    is produced per page. Each field of that item is the list of strings
    extracted from the page, so the fields are parallel lists.
    """

    name = 'lianjiaSpider'
    allowed_domains = ['lianjia.com']
    # All result pages are listed up front. Previously parse() re-queued
    # pages 1-51 from every response and relied on the duplicate filter
    # to discard the repeats.
    # NOTE(review): the page count (51) is hard-coded; confirm it still
    # matches the number of result pages on the site.
    start_urls = [
        'https://hz.fang.lianjia.com/loupan/yuhang-xiaoshan-binjiang/pg{}/'.format(page)
        for page in range(1, 52)
    ]

    def parse(self, response):
        """Extract the listing fields from one result page.

        Args:
            response: the downloaded result page.

        Yields:
            A single ``LianjianItem`` whose fields each hold the list of
            text nodes matched by the corresponding XPath.
        """
        item = LianjianItem()
        item['name'] = response.xpath('//div[@class="resblock-name"]/a/text()').extract()
        item['address'] = response.xpath('//div[@class="resblock-location"]/a/text()').extract()
        item['type'] = response.xpath('//a[@class="resblock-room"]/span/text()').extract()
        item['size'] = response.xpath('//div[@class="resblock-area"]/span/text()').extract()
        item['price'] = response.xpath('//div[@class="resblock-price"]/div[@class="second"]/text()').extract()
        yield item
定义管道
编写管道文件
# -*- coding: utf-8 -*-
import xlwt
import xlrd
class LianjianPipeline(object):
    """Item pipeline that prints every listing carried by an item to stdout.

    Each item field is expected to be a list; entry ``i`` of every field
    is treated as belonging to the same listing. Writing the rows to an
    ``.xls`` workbook was only sketched in commented-out code and is not
    implemented here.
    """

    # Fields printed for each listing, in output order.
    _FIELDS = ('name', 'address', 'type', 'size', 'price')

    def process_item(self, item, spider):
        """Print the listings contained in ``item`` and pass the item on.

        For every listing the five field values are printed one per line,
        followed by a separator line.

        Args:
            item: mapping with the keys in ``_FIELDS``, each holding a list.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item`` object, unchanged.

        Raises:
            KeyError: if one of the expected fields is missing from ``item``.
        """
        columns = [item[field] for field in self._FIELDS]
        # zip() stops at the shortest list, so a page where one field
        # matched fewer nodes no longer raises IndexError part-way through.
        for row in zip(*columns):
            for value in row:
                print(value)
            print("-----------------------")
        return item
在 settings.py 中开启管道
# Enable the project's item pipeline; 300 is the value assigned to it in
# this mapping.
ITEM_PIPELINES = {
    'lianjian.pipelines.LianjianPipeline': 300,
}
启动爬虫文件:在项目根目录下执行 scrapy crawl lianjiaSpider
原文地址:https://www.cnblogs.com/yiweiblog/p/12652493.html
时间: 2024-11-02 11:49:54