import scrapy
from collections import Counter
from lianjia.items import LianjiaItem


class LianjiaSpiderSpider(scrapy.Spider):
    name = 'lianjia_spider'
    allowed_domains = ['wh.lianjia.com']
    start_urls = ['https://wh.lianjia.com/ershoufang/baibuting/']

    def parse(self, response):
        info_list = response.xpath("//div//ul//li[@class='clear LOGCLICKDATA']")
        for i in info_list:
            # Create a fresh item for every listing instead of reusing a single object.
            item = LianjiaItem()
            # Community (小区) name
            item["xiaoqu_name"] = i.xpath('.//div[@class="houseInfo"]//a[@target="_blank"]/text()').extract()[0]
            # Listing title
            item["name"] = i.xpath('.//div[@class="info clear"]//a/text()').extract()[0]
            # Sub-district the listing belongs to
            item["area"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]//a/text()').extract()[0]
            # Detail-page URL
            item["link"] = i.xpath(".//div[@class='title']//@href").extract()[0]
            # Summary text: layout, orientation, decoration, elevator, etc.
            item["summary"] = i.xpath('.//div[@class="houseInfo"]/text()').extract()[0]
            # Floor information
            item["floor"] = i.xpath('.//div[@class="info clear"]//div[@class="positionInfo"]/text()').extract()[0]
            # Total price; append "万" here if the unit should be attached
            item["zongjia"] = i.xpath('.//div[@class="info clear"]//div[@class="totalPrice"]//span/text()').extract()[0]
            # Unit price
            item["danjia"] = i.xpath('.//div[@class="info clear"]//div[@class="unitPrice"]//span/text()').extract()[0]
            yield item

        # Analysis shows that searching directly under a large district such as Wuchang or
        # Hankou returns at most 30 pages of results, so to crawl the site completely every
        # sub-area link has to be visited one by one.
        area_list = ["baibuting", "dazhilu", "dijiao", "erqi2", "houhu", "huangpuyongqing",
                     "qianjinjianghan", "sanyanglu", "tazihu", "yucaihuaqiao",
                     "changqinglu", "changfengchangmatou", "changganglu", "taibeixiangganglu",
                     "tangjiadun", "wuguangwansongyuan", "xinhualuwanda", "yangchahu",
                     "baofengchongren", "changfengchangmatou", "cbdxibeihu", "gutian",
                     "hanzhengjie", "jixian2", "wujiashan", "zongguan",
                     "changqinghuayuan", "dongxihuqita", "jinyinhu", "jiangjunlu",
                     "baishazhou", "chuhehanjie", "donghudongting", "jiedaokou", "jiyuqiao",
                     "shuiguohu", "shouyi", "shahu",
                     "tuanjiedadao", "wuchanghuochezhan", "xudong", "yangyuan", "zhongbeilu",
                     "zhongnandingziqiao", "zhuodaoquan", "hongshanqita", "qingshan1",
                     "huquanyangjiawan", "luoshinanlu",
                     "laonanhu", "nanhuwoerma", "xinnanhu", "qilimiao", "sixin", "wangjiawan",
                     "zhongjiacun", "guanxichangzhi", "guangguguangchang", "guanshandadao",
                     "guanggunan", "guanggudong",
                     "huakeda", "jinronggang", "minzudadao", "sanhuannan", "canglongdao",
                     "jiangxiaqita", "miaoshan", "wenhuadadao", "caidianqita", "dunkou",
                     "hankoubei", "huangbeiqita", "panlongcheng", "qianchuan", "xinzhouqita", "yangluo"]
        # counter = Counter(area_list)  # check the list for duplicates
        # print(counter)

        # After covering every area, walk through pages 1-30 of each; only then is all the
        # data on the site crawled, otherwise a large amount of information is missed.
        # Scrapy's built-in duplicate filter drops requests for URLs already seen, so
        # re-yielding this block from every parsed page is wasteful but harmless.
        for area in area_list:
            for num in range(1, 31):
                yield scrapy.Request("https://wh.lianjia.com/ershoufang/" + area + "/pg" + str(num),
                                     callback=self.parse)

There is nothing special about the items and pipelines; writing them in the conventional way is all that is needed.
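For reference, below is a minimal sketch of what such a conventional items.py and a pass-through pipelines.py could look like. The field names are taken from the spider above and LianjiaItem matches the spider's import; everything else (the LianjiaPipeline class and the pass-through behavior) is an assumption, not the original author's code.

# items.py -- minimal sketch; field names mirror what the spider fills in
import scrapy


class LianjiaItem(scrapy.Item):
    xiaoqu_name = scrapy.Field()   # community (小区) name
    name = scrapy.Field()          # listing title
    area = scrapy.Field()          # sub-district
    link = scrapy.Field()          # detail-page URL
    summary = scrapy.Field()       # layout / orientation / decoration / elevator summary
    floor = scrapy.Field()         # floor information
    zongjia = scrapy.Field()       # total price (万)
    danjia = scrapy.Field()        # unit price (元/平米)


# pipelines.py -- hypothetical pass-through pipeline; replace process_item with
# CSV or database writes as needed, and enable it in settings.py (ITEM_PIPELINES).
class LianjiaPipeline:
    def process_item(self, item, spider):
        return item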
Original article: https://www.cnblogs.com/cwkcwk/p/9710827.html