直接上代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Fetch listing pages from qiushibaike.com and print each post's author and text.
import re
import urllib.request

# Browser-like User-Agent header sent with every request.
_USER_AGENT = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
)

# re.S lets "." match newlines, so a capture may span several lines of HTML.
_USER_RE = re.compile(r'<h2>(.*?)</h2>', re.S)
_TEXT_RE = re.compile(r'<div class="content">(.*?)</div>', re.S)


def _fetch(url: str) -> str:
    """Download *url* and return its body decoded as UTF-8."""
    opener = urllib.request.build_opener()
    opener.addheaders = [_USER_AGENT]
    # Installed globally, as before, so later urlopen() calls in the process
    # keep sending the same header.
    urllib.request.install_opener(opener)
    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")


def _clean(text: str) -> str:
    """Drop raw newlines and <span> tags, then turn <br/> into real newlines."""
    text = text.replace("\n", "")
    text = text.replace("<span>", "")
    text = text.replace("</span>", "")
    # Must come last: the newlines inserted here are the ones we want to keep.
    return text.replace("<br/>", "\n")


def gettext(url: str, page: int) -> None:
    """Print every (author, content) pair found on the page at *url*.

    Args:
        url: Address of the listing page to download.
        page: Page number; only used to label the printed output.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    data = _fetch(url)
    userlist = _USER_RE.findall(data)
    textlist = _TEXT_RE.findall(data)
    # Pair authors with posts positionally. Building a dict keyed by author
    # would silently drop all but one post whenever an author name repeats
    # on the same page. zip() stops at the shorter of the two lists.
    for x, (user, text) in enumerate(zip(userlist, textlist), start=1):
        # The author name is printed exactly as captured from the <h2> element.
        print("第" + str(page) + "页" + str(x) + "用户" + user)
        print("内容:" + _clean(text))
        print('\n')
        print("-----------------------------")


def main() -> None:
    """Scrape and print pages 1 and 2 of the listing."""
    for i in range(1, 3):
        url = "https://www.qiushibaike.com/8hr/page/" + str(i)
        gettext(url, i)


# Guarded so that importing this module does not trigger network requests.
if __name__ == "__main__":
    main()
执行结果
原文地址:http://blog.51cto.com/superleedo/2123509
时间: 2024-10-11 02:13:41