import requests
import re

# Index page of the novel on jingcaiyuedu.com
url = 'http://www.jingcaiyuedu.com/book/317834.html'
response = requests.get(url)
'''Optional: retry until the index page returns HTTP 200
while str(response) != "<Response [200]>":
    response = requests.get(url)
    print(response)
'''
response.encoding = 'utf-8'
html = response.text

# Book title from the og:novel:book_name meta tag
title = re.findall(r'<meta property="og:novel:book_name" content="(.*?)"/>', html)[0]
# The chapter list sits inside <dl id="list">...</dl>
dl = re.findall(r'<dl id="list">.*?</dl>', html, re.S)[0]
chapter_info_list = re.findall(r'href="(.*?)">(.*?)<', dl)

fb = open('%s.txt' % title, 'w', encoding='utf-8')
for chapter_url, chapter_title in chapter_info_list:
    # Chapter links are relative; prepend the site root
    chapter_url = 'http://www.jingcaiyuedu.com%s' % chapter_url
    chapter_response = requests.get(chapter_url)
    chapter_response.encoding = 'utf-8'
    chapter_html = chapter_response.text
    # Chapter text lies between the a1() and a2() script tags
    chapter_content = re.findall(r'<script>a1\(\);</script>(.*?)<script>a2\(\);</script>',
                                 chapter_html, re.S)[0]
    # Strip line-break tags and non-breaking spaces from the extracted text
    chapter_content = chapter_content.replace('<br /><br />', '')
    chapter_content = chapter_content.replace('&nbsp;', '')
    chapter_content = chapter_content.replace('\xa0', '')
    fb.write(chapter_title)
    fb.write(chapter_content)
    fb.write('\n')
    print(chapter_url)
fb.close()

# print(chapter_info_list)
This was my first time writing a crawler, and Python is genuinely powerful. However, I kept running into the remote host forcibly closing the connection: usually only the first few chapters download before the connection is cut off. I will try to solve this in the next stage.
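The forcible disconnects are most likely the server dropping rapid, header-less requests. A minimal sketch of a workaround, assuming the same requests API: send a browser-like User-Agent and retry each fetch with a short delay. The fetch helper, the header string, and the retry/delay values below are illustrative, not from the original post:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # illustrative browser-like header

def fetch(url, retries=3, delay=2):
    # Hypothetical helper: retry with a pause so the remote host
    # is less likely to forcibly close the connection.
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            if resp.status_code == 200:
                resp.encoding = 'utf-8'
                return resp
        except requests.exceptions.ConnectionError:
            pass  # connection forcibly closed; wait and try again
        time.sleep(delay)
    raise RuntimeError('giving up on %s' % url)

# Usage inside the loop above: chapter_response = fetch(chapter_url)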
Original article: https://www.cnblogs.com/kangdong/p/8480347.html