原文引用https://www.dazhuanlan.com/2019/08/26/5d62f6fd2023a/
- 小说网址
- 使用绝色妖娆:鬼医至尊为例(主要是女朋友想看 ^_^)
下面是程序代码
import requests
import threading
from bs4 import BeautifulSoup
import re
import os
import time
import sys

# Browser-like request headers (UA/cookies copied from a real session) so the
# site serves normal chapter pages to the scraper.
req_header = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': 'UM_distinctid=162afbabff819e-03f2f082776e95-b34356b-1fa400-162afbabff9294; CNZZDATA1259019190=1993576859-1523364262-https%253A%252F%252Fwww.baidu.com%252F%7C1523364262; bookid=124629; chapterid=6510968; chaptername=%25u7B2C1%25u7AE0%2520%25u6797%25u4E2D%25u9634%25u8C0B',
    'Host': 'www.uxiaoshuo.com',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'https://www.uxiaoshuo.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
}

# Site root; chapter paths below are relative to it.
req_url_base = 'http://www.uxiaoshuo.com'


def get_txt(txt_id):
    """Download a whole novel, chapter by chapter, into '<title>.txt'.

    txt_id: site-relative path of the first chapter page, e.g.
        '/124/124629/7404934.html'.

    Starting from that page, follows the "next chapter" link (4th link in
    the bottom navigation bar) until the link's path no longer ends in
    '.html' — presumably the book's index page, which marks the end of the
    novel. Chapter title and body are appended to the output file as UTF-8
    bytes. Errors are reported and downloading stops; chapters already
    written remain on disk.
    """
    txt = {'title': '', 'id': str(txt_id)}  # novel id == first-chapter path
    # Last '.'-separated piece of the current path; 'html' means we are
    # still on a chapter page.
    _req_url = txt['id'].split('.')
    req_url = req_url_base + txt['id']
    print("小说编号:" + req_url)
    try:
        # BUG FIX: headers belong in `headers=`, not `params=` — `params`
        # appends them to the query string and the server never sees them.
        res = requests.get(req_url, headers=req_header)
        soups = BeautifulSoup(res.text, "html.parser")
        # Novel title: second <a> in the breadcrumb bar at the top.
        txt['title'] = soups.select('#webhtml .box_con .con_top a')[1].text
        # 'ab+' so re-runs append instead of clobbering earlier chapters;
        # `with` guarantees the handle is closed (original leaked it).
        with open('{0}.txt'.format(txt['title']), "ab+") as fo:
            while True:
                # A non-'.html' "next" link means we walked past the last chapter.
                if _req_url[-1] != 'html':
                    print(txt['title'] + "全部下载成功!")
                    break
                txt['c_title'] = soups.select('#webhtml .box_con .zhangjieming h1')[0].text  # chapter title
                txt['content'] = soups.select('#webhtml .box_con .zhangjieTXT')[0]  # chapter body element
                # Strip embedded ad/script markup before extracting the text.
                for i in txt['content'].select("script"):
                    i.decompose()
                for i in txt['content'].select("div"):
                    i.decompose()
                # Collapse every whitespace run into a CRLF + tab paragraph
                # break, then trim leading/trailing newlines.
                txt['content'] = re.sub(r'\s+', '\r\n\t', txt['content'].text).strip('\r\n')
                # Write title and body as UTF-8 bytes (file is open in binary mode).
                fo.write(('\n' + txt['c_title'] + '\r\n').encode('UTF-8'))
                fo.write(('\n' + txt['content'] + '\n').encode('UTF-8'))
                print(txt['c_title'])
                # 4th link in the bottom nav bar is "next chapter".
                req_url = soups.select('#webhtml .zhangjieming .bottem1 a')[3]['href']
                _req_url = req_url.split('.')
                req_url = req_url_base + req_url
                res = requests.get(req_url, headers=req_header)
                soups = BeautifulSoup(res.text, "html.parser")
    except Exception as e:
        # Best-effort scraper: report the failure and stop; partial output
        # already written to disk is kept.
        print(e)


get_txt('/124/124629/7404934.html')
get_txt('/135/135169/7373986.html')
原文地址:https://www.cnblogs.com/petewell/p/11410423.html
时间: 2024-10-07 11:35:11