抓取糗事百科的内容及评论,不包含图片信息。请求头中的 User-Agent 填入自己浏览器的用户代理字符串即可。以360极速浏览器为例:在地址栏输入 about:version 并回车,“用户代理”后面的一长串字符就是需要填入 User-Agent 对应的空引号 '' 中的内容。其他浏览器的查看方法可以自行百度。
import re
import urllib.request
from urllib import request

from bs4 import BeautifulSoup

# Value sent as the User-Agent request header. It is empty by default; paste
# your own browser's user-agent string here before running the script.
USER_AGENT = ''

# Regex matched against the class attribute to locate article elements on a
# listing page. Compiled once at import time instead of on every call.
ARTICLE_CLASS_PATTERN = re.compile(r"article block untagged mb15.*?")


# 1. Fetch the page source.
def get_html(url: str) -> str:
    """Download *url* and return the response body decoded as UTF-8.

    The request is sent with a ``User-Agent`` header whose value is
    ``USER_AGENT``.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` on request failure.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    headers = {
        'User-Agent': USER_AGENT,
    }
    req = request.Request(headers=headers, url=url)
    # The context manager closes the HTTP response even when read/decode fails.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')


# Build the comment-page link for every article on a listing page.
def get_comment_link(content: str, comment_url_base: str) -> None:
    """Print every article found in *content* together with its comments.

    Args:
        content: HTML source of a listing page.
        comment_url_base: ``%``-style template with a single ``%s`` slot that
            receives the article id.

    Elements whose ``id`` attribute is missing, or does not contain at least
    three ``_``-separated fields, are skipped instead of raising IndexError.
    """
    soup = BeautifulSoup(content, 'html.parser')
    article_floor = 1
    for element in soup.find_all(class_=ARTICLE_CLASS_PATTERN):
        # The third "_"-separated field of the element id is used as the
        # article id that fills the comment URL template.
        id_fields = str(element.get('id')).strip().split("_")
        if len(id_fields) < 3:
            continue
        comment_url = comment_url_base % id_fields[2]
        get_comment_content(comment_url, article_floor)
        article_floor += 1


# Print the article text and its comments.
def get_comment_content(comment_url: str, articleFloor: int) -> None:
    """Fetch one article page and print its text followed by its comments.

    Args:
        comment_url: URL of the article/comment page.
        articleFloor: Running number printed in front of the article text.
    """
    comment_page = get_html(comment_url)
    soup_comment = BeautifulSoup(comment_page, 'html.parser')
    # Article text: every <div> whose class is "content".
    for item in soup_comment.find_all('div', class_='content'):
        print(articleFloor, ".", item.get_text().strip())
    # Comments: every element whose class is "body", numbered from 1.
    for comment_floor, comment in enumerate(
        soup_comment.find_all(class_="body"), start=1
    ):
        print(" ", comment_floor, "楼回复:", comment.get_text())


def command() -> None:
    """Prompt the user once and run ``main()`` when they choose to view.

    Pressing Enter on an empty line (what the prompt asks for) or typing the
    word ``enter`` starts the scrape; any other input, including ``exit``,
    returns without doing anything.
    """
    raw = input("点击enter查看或者输入exit退出,请输入你的选择:")
    # An empty string is what input() returns when the user just presses Enter.
    if raw.strip().lower() in ('', 'enter'):
        main()


def main(page: int = 2) -> None:
    """Scrape one listing page and print its articles with their comments.

    Args:
        page: Listing page number substituted into the article URL template.
            Defaults to 2, the page the script originally hard-coded.
    """
    article_url_base = 'https://www.qiushibaike.com/8hr/page/%d/'  # listing page URL
    comment_url_base = 'https://www.qiushibaike.com/article/%s'  # article/comment page URL
    article_url = article_url_base % page
    content = get_html(article_url)
    get_comment_link(content, comment_url_base)


if __name__ == '__main__':
    command()
原文地址:https://www.cnblogs.com/smart-zihan/p/9615915.html
时间: 2024-10-07 14:42:42