我爱看书,也爱看电影,最近又在学 Python 3,于是写了两个小爬虫,分别抓取豆瓣图书 Top 250 和豆瓣电影 Top 250。代码如下:
"""Scrape the Douban Books Top 250 list and append it to a text file.

For every book the title, first author field, rating, number of ratings
and one-line quote are written as one space-separated row.
"""
import urllib.request

# Rows are appended to this file; re-running the script adds to it.
OUTPUT_PATH = 'F://book.txt'

BASE_URL = 'https://book.douban.com/top250?start='

# Browser-like User-Agent sent with every request.
# NOTE(review): assumed necessary because sites commonly reject urllib's
# default User-Agent -- confirm against the live site.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko)Chrome/60.0.3112.101 Safari/537.36'
    )
}


def strip_chars(text, chars):
    """Return *text* with every character listed in *chars* removed."""
    return text.translate(dict.fromkeys(map(ord, chars)))


def node_text(node, default=''):
    """Return ``node.get_text()``, or *default* when *node* is ``None``."""
    return default if node is None else node.get_text()


def format_row(fields):
    """Join *fields* with single spaces and terminate the row with CRLF."""
    return ' '.join(fields) + '\r\n'


def parse_book(table):
    """Extract (name, author, score, peoplenum, remark) from one book table.

    Args:
        table: a bs4 tag for one ``<table>`` of the listing.

    Returns:
        A tuple of five cleaned strings. Missing rating / count / author
        nodes yield an empty string; a missing quote yields '暂无评价'.

    Raises:
        AttributeError: if the title ``div.pl2`` is absent (the element is
            then not a book entry at all).
    """
    name = strip_chars(
        table.find("div", class_="pl2").find("a").get_text(), ' \n')
    # The <p class="pl"> line is "author / ... / ..."; keep the first field.
    author = strip_chars(
        node_text(table.find("p", class_="pl")).split('/')[0], ' ')
    score = strip_chars(
        node_text(table.find("span", class_="rating_nums")), ' ')
    peoplenum = strip_chars(
        node_text(table.find("span", class_="pl")), ' ()\n')
    # Not every book has a quote; test for the missing node explicitly
    # instead of hiding every possible error behind a bare ``except``.
    quote = table.find("p", class_="quote")
    remark = '暂无评价' if quote is None else strip_chars(
        quote.get_text(), ' \n')
    return name, author, score, peoplenum, remark


def get_html(url):
    """Download *url* and return its ``<div id="wrapper">`` element.

    Returns:
        The bs4 tag, or ``None`` when the page has no such element.
    """
    # Imported here so the pure text helpers above remain importable
    # on machines where bs4 is not installed.
    from bs4 import BeautifulSoup

    request = urllib.request.Request(url=url, headers=HEADERS)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(request) as response:
        soup = BeautifulSoup(response.read(), "html.parser")
    return soup.find("div", id="wrapper")


def get_all(data, path=OUTPUT_PATH):
    """Append one row per ``<table>`` found in *data* to *path*.

    Args:
        data: bs4 tag as returned by :func:`get_html`.
        path: output file; defaults to :data:`OUTPUT_PATH`.
    """
    rows = [format_row(parse_book(table)) for table in data.find_all("table")]
    # newline='' writes the explicit '\r\n' untouched; with the default
    # translation Windows would turn it into '\r\r\n'.
    with open(path, 'a', encoding='UTF-8', newline='') as f:
        f.writelines(rows)


def main():
    """Write the header row, then scrape the ten pages of 25 books each."""
    with open(OUTPUT_PATH, 'a', encoding='UTF-8', newline='') as f:
        f.write('书籍名称 作者 评分 评价人数 评论 \r\n')
    for page in range(10):
        get_all(get_html(BASE_URL + str(page * 25)))


if __name__ == '__main__':
    main()
以上是抓取豆瓣图书 Top 250 的代码。
下面是抓取豆瓣电影 Top 250 的代码:
"""Scrape the Douban Movies Top 250 list and append it to a text file.

For every movie the title, rating and number of ratings are written as
one space-separated row.
"""
import io
import sys
import urllib.request

# Rows are appended to this file; re-running the script adds to it.
OUTPUT_PATH = 'F://movie.txt'

BASE_URL = 'https://movie.douban.com/top250?start='

# Browser-like User-Agent sent with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko)Chrome/60.0.3112.101 Safari/537.36'
    )
}


def format_row(fields):
    """Join *fields* with single spaces and terminate the row with CRLF."""
    return ' '.join(fields) + '\r\n'


def parse_movie(item):
    """Extract (name, score, nums) from one ``<li>`` of the listing.

    Args:
        item: a bs4 tag for one list entry.

    Returns:
        A tuple of three strings: text of the first ``<span>``, text of the
        first ``span.rating_num``, and text of the fourth element following
        the ``div.star`` tag in document order.
    """
    name = item.find("span").get_text()
    score = item.find_all("span", {"class": "rating_num"})[0].get_text()
    # Step four elements forward from <div class="star">.
    # NOTE(review): presumably this lands on the "N人评价" span -- verify
    # against the current page markup.
    node = item.find("div", class_="star")
    for _ in range(4):
        node = node.find_next()
    return name, score, node.get_text()


def get_html(url):
    """Fetch the web page at *url* and return the ``<li>`` tags of its first ``<ol>``."""
    # Imported here so the pure text helper above remains importable
    # on machines where bs4 is not installed.
    from bs4 import BeautifulSoup

    request = urllib.request.Request(url=url, headers=HEADERS)
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(request) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find("ol").find_all("li")


def get_all(data, path=OUTPUT_PATH):
    """Append one row per entry of *data* to *path*.

    Args:
        data: iterable of bs4 tags as returned by :func:`get_html`.
        path: output file; defaults to :data:`OUTPUT_PATH`.
    """
    rows = [format_row(parse_movie(item)) for item in data]
    # newline='' writes the explicit '\r\n' untouched; with the default
    # translation Windows would turn it into '\r\r\n'.
    with open(path, 'a', encoding='UTF-8', newline='') as f:
        f.writelines(rows)


def main():
    """Write the header row, then scrape the ten pages of 25 movies each."""
    with open(OUTPUT_PATH, 'a', encoding='UTF-8', newline='') as f:
        f.write('电影名称' + ' ' + '评分' + ' ' + '评价人数' + '\r\n')
    for page in range(10):
        get_all(get_html(BASE_URL + str(page * 25) + '&filter='))


if __name__ == '__main__':
    main()
时间: 2024-12-12 15:36:27