第一个小爬虫,问题较多!
import urllib.request
import re
import os
import string
import time
import random

# Directory the script was started from; output .txt files are written here.
path = os.getcwd()

# Legacy global: base name of the output file. It is set by main() and used
# as a fallback by the writer functions when no explicit name is passed.
l5 = None

# The book index page and the chapter pages are both parsed from the span
# running from <head> to the end of the <title> element.
_HEAD_TITLE_PATTERN = r'<head>[\s\S]*<title>.*</title>'
# Chapter text is taken from the span between these two <div> openers.
_CHAPTER_BODY_PATTERN = r'<div id="tac">[\s\S]*<div class="info bottominfo">'

_USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
               'AppleWebKit/537.36 (KHTML, like Gecko) '
               'Chrome/71.0.3578.80 Safari/537.36')


def get_url():
    """Do nothing; placeholder kept for backward compatibility.

    The original definition had no body, so it is retained as a no-op.
    """


def open_url(url):
    """Fetch *url* with a browser-like User-Agent and return the page text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    req.add_header('User-Agent', _USER_AGENT)
    # Pass the Request object (not the bare URL) so the header is really sent,
    # and use a context manager so the connection is always released.
    with urllib.request.urlopen(req) as page:
        return page.read().decode('utf-8')


def _output_path(name=None):
    """Return the path of the output text file for *name* (default: ``l5``)."""
    if name is None:
        name = l5
    if name is None:
        raise NameError('output file name is not set: pass `name` or set l5')
    return os.path.join(path, '%s.txt' % name)


def _head_title_dump(html):
    """Return ``str()`` of the list of <head>..</title> matches in *html*."""
    return str(re.findall(_HEAD_TITLE_PATTERN, html))


def get_txt(html, name=None):
    """Extract one chapter from *html* and append it to the output file.

    Args:
        html: Text of a chapter page.
        name: Base name (without ``.txt``) of the output file. Defaults to
            the module-level ``l5``.
    """
    # NOTE(review): 51, 92 and -60 are fixed offsets into the repr() of the
    # match lists, presumably tuned to the target site's markup -- confirm
    # against a live page before changing them.
    heading = _head_title_dump(html)[51:]
    for junk in ("'", '"', '>]'):
        heading = heading.replace(junk, '')
    heading = heading.split(',')[0]

    body = str(re.findall(_CHAPTER_BODY_PATTERN, html))[92:]
    # repr() renders U+3000 as the literal text "\u3000", hence the raw string.
    body = body.replace(r'\u3000\u3000', ' ').replace('<br/><br/>', '\n')[:-60]
    body = body.replace('*', ' ')

    with open(_output_path(name), 'a', encoding='utf-8') as f:
        f.write(heading + '\n\n' + body + '\n\n\n')
    print(heading + '→→→下载完成→→→')


def get_titlename(html, name=None):
    """Extract the book title line from *html* and append it to the file.

    Args:
        html: Text of the book index page.
        name: Base name (without ``.txt``) of the output file. Defaults to
            the module-level ``l5``.
    """
    # NOTE(review): offset 43 presumably matches the site's <head> layout.
    title = _head_title_dump(html)[43:].split('_')[0]
    title = title.replace('txt下载', '\n ——').replace('(', '').replace(')', '')
    print(title + '→正在下载')
    with open(_output_path(name), 'a', encoding='utf-8') as f:
        f.write(title + '\n\n')
    print(title + '→→→titlename下载完成→→→')


def get_txtname(html):
    """Derive the output file base name from *html* and create the file.

    Args:
        html: Text of the book index page.

    Returns:
        The text of the page title that precedes the first ``txt``.
    """
    # NOTE(review): offset 43 presumably matches the site's <head> layout.
    name = _head_title_dump(html)[43:].split('txt')[0]
    # Opening in append mode creates the file if missing without truncating it.
    with open(_output_path(name), 'a', encoding='utf-8'):
        pass
    return name


def main():
    """Prompt for a book id and chapter count, then download the chapters."""
    global l5
    print('\n使用说明:'
          '示例:《武道乾坤》,URL https://www.xiashu.la/2186/ ,该书目录为即为2186')
    url0 = 'https://www.xiashu.la'
    ml = input('请输入目录')
    url1 = url0 + '/' + ml + '/'
    print('你输入的目录为:%s' % url1)
    chapters = int(input('请输入总章节数(示例80页,则输入80):'))
    print("当前工作目录 : %s" % path)

    # Download the index page once and reuse it for both the file name and
    # the title line.
    index_html = open_url(url1)
    l5 = get_txtname(index_html)
    get_titlename(index_html, l5)

    for chapter in range(1, chapters + 1):
        url = url1 + 'read_' + str(chapter) + '.html'
        # Random pause (in seconds) between chapter requests.
        t = random.randint(1, 5)
        print(t)
        time.sleep(t)
        get_txt(open_url(url), l5)


if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/lasttime/p/10717619.html
时间: 2024-10-07 01:05:59