pyquery 允许对 XML 文档进行 jQuery 式查询,其 API 尽可能与 jQuery 保持一致。pyquery 使用 lxml 进行快速的 XML 和 HTML 操作,能够以 jQuery 的语法来解析和操作 HTML 文档。
实例:爬取疫情报告 https://voice.baidu.com/act/newpneumonia/newpneumonia
(今天报错还未调试成功,明天继续)
import requests
from pyquery import PyQuery as pq

# Default output file. A raw string keeps the backslash literal instead of
# relying on the invalid escape sequence "\y".
OUTPUT_PATH = r'D:\yiqing.csv'

# Rows to scrape from the report table.
# NOTE(review): this selects rows under <thead> but the cells below are looked
# up as <td>; the disabled draft that used to follow parse() iterated
# 'table.table tbody tr' instead. Verify against the live page markup. If the
# selector matches nothing, no rows are written.
ROW_SELECTOR = 'table.table thead tr.VirusTable_1-1-156_26gN5Z'


def get_page(url, timeout=10):
    """Send a GET request and return the page source.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body as text, decoded as UTF-8.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx HTTP status.
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an error status rather than parsing an error page.
    r.raise_for_status()
    r.encoding = 'utf8'
    return r.text


def parse(text, path=OUTPUT_PATH):
    """Parse the page source and append one line per table row to a file.

    Each output line holds the area, confirmed, death and cured fields, each
    followed by two tab characters, and ends with a newline.

    Args:
        text: HTML source of the report page.
        path: File to append to; it is created if it does not exist.
    """
    doc = pq(text)
    rows = doc(ROW_SELECTOR).items()
    # Open the file once for the whole run instead of once per row.
    with open(path, 'a', encoding='utf8') as f:
        for row in rows:
            fields = (
                row.find('span').text(),             # area
                row.find('td:nth-child(2)').text(),  # confirmed
                row.find('td:nth-child(3)').text(),  # deaths
                row.find('td:nth-child(4)').text(),  # cured
            )
            f.write('\t\t'.join(fields) + '\t\t\n')
    print("写入完成")


if __name__ == "__main__":
    url = "https://voice.baidu.com/act/newpneumonia/newpneumonia"
    text = get_page(url)
    parse(text)
原文地址:https://www.cnblogs.com/sengzhao666/p/12304392.html
时间: 2025-01-15 03:36:45