import requests
import bs4

# Fetch the page source
def gethtml(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return "This site refused the request"

# Extract the chapter text from one page and save it to a file
def chapters(url, name):
    html = gethtml("http://www.bjkgjlu.com" + url)
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Open the file once so multiple matching divs append rather than overwrite
    with open(name + ".txt", "wb") as f:
        for i in soup.find_all("div", attrs={"class": "chapter_content"}):
            # Keep only the text before the first stray "<" left in the node
            f.write(i.text.split("<")[0].encode("utf-8"))
    print(name + " scraped and saved to file")

if __name__ == "__main__":
    url = "http://www.bjkgjlu.com/303618kyi/catalog"
    chapter_name_list = []
    chapter_url_list = []
    html = gethtml(url)
    soup = bs4.BeautifulSoup(html, "html.parser")

    # Each chapter link sits inside a div with these Bootstrap grid classes
    for i in soup.find_all("div", attrs={"class": "col-xs-120 col-sm-60 col-md-40 col-lg-30"}):
        for j in i.children:
            chapter_name_list.append(j.text)
            chapter_url_list.append(j.get("href"))
    print(chapter_name_list)
    for j in range(len(chapter_name_list)):
        chapters(chapter_url_list[j], chapter_name_list[j])
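For reference, here is a slightly hardened variant of the same scraper. This is a minimal sketch, not part of the original post: it keeps the site's structure as given above (the chapter_content div and the Bootstrap grid classes on the catalog page), but the explicit timeout, the User-Agent header value, the CSS selector, and the one-second delay between requests are my own assumptions added for robustness and politeness.

import time
import requests
import bs4

BASE = "http://www.bjkgjlu.com"  # same site as the original script

def fetch(url, session):
    # Explicit timeout plus a browser-like User-Agent; many sites reject
    # the default python-requests UA. (Header value is an assumption.)
    response = session.get(url, timeout=10,
                           headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text

def save_chapter(path, name, session):
    soup = bs4.BeautifulSoup(fetch(BASE + path, session), "html.parser")
    div = soup.find("div", attrs={"class": "chapter_content"})
    if div is None:
        print(name + ": no chapter_content div found, skipped")
        return
    # get_text() strips tags itself, so no manual split on "<" is needed
    with open(name + ".txt", "w", encoding="utf-8") as f:
        f.write(div.get_text("\n", strip=True))
    print(name + " saved")

if __name__ == "__main__":
    with requests.Session() as session:
        catalog = bs4.BeautifulSoup(
            fetch(BASE + "/303618kyi/catalog", session), "html.parser")
        for link in catalog.select(
                "div.col-xs-120.col-sm-60.col-md-40.col-lg-30 a"):
            href = link.get("href")
            if not href:
                continue  # skip anchors without a target
            save_chapter(href, link.get_text(strip=True), session)
            time.sleep(1)  # be polite: pause between chapter downloads

Using a single requests.Session reuses the underlying connection across chapter downloads, and letting BeautifulSoup's get_text() strip the markup avoids the fragile split on "<" in the original.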
Original article: https://www.cnblogs.com/lsyb-python/p/11774319.html