...
import requests
from requests import ConnectionError
from bs4 import BeautifulSoup
import pymongo

# MongoDB configuration
MONGO_URL = 'localhost'
MONGO_DB = 'Fangtianxia'
MONGO_TABLE = 'HouseInfo'

# Set up the MongoDB client (MongoClient connects lazily; no I/O here).
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]

# Listing pages for Shenzhen communities; the page number is spliced
# into the middle of the path by main().
base_url = 'http://esf.sz.fang.com/housing/__1_0_0_0_'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
}


def get_html(url):
    """Fetch ``url`` and return the response body as text.

    Returns None when the status code is not 200 or on connection
    failure, so callers must check before parsing.
    """
    print('正在爬取', url)
    try:
        response = requests.get(url=url, headers=headers)
        if response.status_code == 200:
            return response.text
        print(response.status_code)
        return None
    except ConnectionError:
        print('出现错误!')
        return None


def parser_html(html):
    """Parse one listing page and persist every community on it.

    Gracefully does nothing when ``html`` is None (failed download).
    """
    if html is None:  # get_html() returns None on failure — don't crash
        return
    soup = BeautifulSoup(html, 'lxml')
    # Each listing card has three <p> tags; slice to pick the fields apart.
    names = soup.select('.houseList .list .plotListwrap dd p')[::3]       # name + link
    addresses = soup.select('.houseList .list .plotListwrap dd p')[1::3]  # address
    princes = soup.select('.houseList .list .listRiconwrap .priceAverage')  # average price
    ratioes = soup.select('.houseList .list .listRiconwrap .ratio')         # price change
    # zip() stops at the shortest list, so a page with mismatched
    # selector hits cannot raise IndexError.
    for name, address, prince, ratio in zip(names, addresses, princes, ratioes):
        house = {
            'name': name.a.text.strip(),
            'url': name.find(name='a').attrs['href'].strip(),
            'address': address.text.strip(),
            # [:-4] drops the last 4 chars — presumably the price unit
            # suffix; TODO confirm against a live page.
            'prince': prince.text.strip()[:-4],
            'ratio': ratio.text.strip(),
        }
        save_to_mongo(house)


def save_to_mongo(data):
    """Insert one house record into MongoDB; log success or failure."""
    try:
        # insert_one() replaces the deprecated Collection.insert().
        if db[MONGO_TABLE].insert_one(data):
            print('成功保存到数据库', data)
    except Exception:
        print('保存失败!', data)


def main():
    """Crawl the first 100 result pages and store every community found."""
    for page in range(1, 101):
        url = base_url + str(page) + '_0_0_0/'  # per-page URL for Shenzhen listings
        html = get_html(url)
        parser_html(html)  # no-op when the download failed


if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/mysterious-killer/p/10156929.html
时间: 2024-11-05 16:27:05