1.代码
#coding=utf-8
"""Scrape a city house-price / salary ranking article and export it to Excel.

The article at ``news.house.qq.com`` lists, for each ranked city, a rank
number followed by four paragraphs: the city name in square brackets, then
the average house price, the average salary and the price/salary ratio.
"""
import re

import bs4
import openpyxl
import requests

# Compiled once at import time instead of on every row.
_CITY_RE = re.compile(r'\[(.+)\]')
_NUMBER_RE = re.compile(r'\d.*')

# Number of paragraphs that follow each rank-number paragraph.
_FIELDS_PER_RECORD = 4


def open_url(url):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            non-2xx HTTP status.
    """
    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}
    # Without a timeout requests can block forever on a stalled connection.
    res = requests.get(url, headers=headers, timeout=30)
    # Fail here with a clear HTTP error rather than later while parsing an
    # error page.
    res.raise_for_status()
    return res


def find_data(res):
    """Extract ``[city, price, salary, ratio]`` rows from the article page.

    Args:
        res: response object whose ``text`` attribute holds the page HTML.

    Returns:
        A list of four-element lists of strings, one per ranked city.

    Raises:
        ValueError: if the article container element is not in the page.
    """
    data = []
    soup = bs4.BeautifulSoup(res.text, "html.parser")
    content = soup.find(id="Cnt-Main-Article-QQ")
    if content is None:
        raise ValueError('element with id "Cnt-Main-Article-QQ" not found in page')

    # A single shared iterator lets the loop body consume the paragraphs
    # that belong to the current record.
    target = iter(content.find_all("p", style="TEXT-INDENT: 2em"))
    for each in target:
        # A purely numeric paragraph is the rank that starts a record.
        if not each.text.isnumeric():
            continue

        fields = [next(target, None) for _ in range(_FIELDS_PER_RECORD)]
        if fields[-1] is None:
            # The article ended in the middle of a record.
            break

        matches = [_CITY_RE.search(fields[0].text)]
        matches.extend(_NUMBER_RE.search(field.text) for field in fields[1:])
        if any(match is None for match in matches):
            # Malformed record: skip it; the next rank paragraph resyncs us.
            continue

        data.append([matches[0].group(1)] + [match.group() for match in matches[1:]])

    return data


def to_excel(data):
    """Write *data* rows under a header row to an ``.xlsx`` file in the cwd."""
    wb = openpyxl.Workbook()
    # The original misspelled this as ``gusee_types``, which only created an
    # unused attribute. NOTE(review): ``guess_types`` appears to be honored
    # only by older openpyxl releases; confirm against the installed version.
    wb.guess_types = True
    ws = wb.active
    ws.append(['城市', "平均房价", "平均工资", "房价工资比"])
    for each in data:
        ws.append(each)
    wb.save("2017全国城市房价 工资排行榜.xlsx")


def main():
    """Download the article, export the table, and dump the raw HTML."""
    url = "https://news.house.qq.com/a/20170702/003985.htm"
    res = open_url(url)
    data = find_data(res)
    to_excel(data)
    # Keep a copy of the fetched HTML for inspection.
    with open("test.txt", "w", encoding="utf-8") as file:
        file.write(res.text)


if __name__ == "__main__":
    main()
原文地址:https://www.cnblogs.com/utopia666/p/10706025.html
时间: 2024-11-09 11:57:14