# Lianjia housing spider
#
# Goal: scrape the most recent listings' regions and their total prices.
# NOTE(review): the original title says "new homes", but the URL used below
# points at the "ershoufang" listing pages -- confirm which one is intended.
import csv
import re


class LianjiaSpider:
    """Scrape (region, total price) pairs from Lianjia listing pages into a CSV.

    The pipeline is ``main`` -> ``get_page`` -> ``parse_page`` -> ``write_page``;
    each stage calls the next one directly and returns ``None``.

    Args:
        output_path: CSV file that rows are appended to. Defaults to
            ``'lianjia.csv'`` in the current working directory.
    """

    # Seconds to wait on the HTTP request before giving up, so that one
    # stalled connection cannot hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # Group 1: text following the ``data-el="region"`` tag up to ``</a>``.
    # Group 2: the number inside the ``totalPrice`` span.
    # Compiled once here instead of once per page. ``re.S`` lets ``.*?`` cross
    # line breaks, because one listing spans several lines of HTML.
    _LISTING_RE = re.compile(
        '<div class="houseInfo"><span.*?data-el="region">(.*?)</a>'
        '.*?<div class="totalPrice"><span>(.*?)</span>',
        re.S,
    )

    def __init__(self, output_path='lianjia.csv'):
        # ``{}`` is filled with the 1-based page number in ``main``.
        self.url = 'https://cq.lianjia.com/ershoufang/pg{}/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.output_path = output_path

    def get_page(self, url):
        """Download ``url`` and hand the decoded HTML to ``parse_page``.

        Raises:
            requests.exceptions.RequestException: on connection failure or
                when the request exceeds ``REQUEST_TIMEOUT``.
        """
        # Imported here so the parsing and CSV-writing paths stay usable
        # (and testable) without the third-party HTTP dependency installed.
        import requests

        res = requests.get(url, headers=self.headers,
                           timeout=self.REQUEST_TIMEOUT)
        html = res.content.decode()
        self.parse_page(html)

    def parse_page(self, html):
        """Extract every (region, price) pair from ``html`` and save them."""
        r_list = self._LISTING_RE.findall(html)
        self.write_page(r_list)

    def write_page(self, r_list):
        """Append the parsed pairs to the CSV file.

        Args:
            r_list: iterable of ``(region, price)`` string pairs. The region is
                stripped of surrounding whitespace and the price gets the
                '万' (ten-thousand) unit suffix appended.
        """
        rows = [(region.strip(), price + '万') for region, price in r_list]
        # newline='' is required by the csv module (otherwise blank rows
        # appear on Windows). An explicit UTF-8 encoding keeps the non-ASCII
        # unit suffix writable regardless of the platform's default locale.
        with open(self.output_path, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerows(rows)

    def main(self):
        """Crawl listing pages 1 through 10, reporting each finished page."""
        for page in range(1, 11):
            url = self.url.format(page)
            self.get_page(url)
            print('正在打印{}页'.format(page))


if __name__ == '__main__':
    spider = LianjiaSpider()
    spider.main()
# Source (原文地址): https://www.cnblogs.com/cxiaolong/p/11234872.html
# Timestamp (时间): 2024-10-31 07:30:14