# Program entry point: start.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import sys

# Project root: the directory one level above the one that holds this script.
BASEPATH = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
print(BASEPATH)
# Put the project root on the import path so the ``core`` package below
# can be resolved.
sys.path.append(BASEPATH)

from core import SpiderMan  # noqa: E402  (must follow the sys.path change)


def main():
    """Create the spider scheduler and start the crawl."""
    s = SpiderMan.SpiderMan()
    # ``async`` is a reserved keyword since Python 3.7, so the literal call
    # ``s.async()`` no longer parses. Looking the method up by name keeps the
    # same call working on every Python 3 version.
    getattr(s, "async")()


if __name__ == "__main__":
    main()
# Spider scheduler
#!/usr/bin/python
# -*- coding: utf-8 -*-
from gevent import spawn, monkey, joinall

# Kept ahead of the remaining imports, exactly as in the original one-liner.
monkey.patch_all()

from concurrent.futures import ThreadPoolExecutor

from core.UrlManager import UrlManager
from core.Htmldown import Htmldown
from core.Htmlparser import Htmlparser
# from core.DataOutput import DataOutput


class SpiderMan:
    """Scheduler that wires the URL manager, downloader and parser together."""

    # Index page whose chapter list seeds the crawl.
    INDEX_URL = "http://www.lingxiaozhishang.com"
    # Number of workers in the download pool.
    MAX_WORKERS = 10

    def __init__(self):
        self.manager = UrlManager()    # URL bookkeeping
        self.downloader = Htmldown()   # HTML download
        self.parser = Htmlparser()     # HTML parsing
        # self.output = DataOutput()

    def index_work(self):
        """Download the index page and queue every chapter URL found on it.

        The index URL is recorded as already visited before the download.
        If the download returns None, a message is printed and nothing is
        queued.

        :return: None
        """
        url = self.INDEX_URL
        self.manager.oldurls.add(url)
        html = self.downloader.down_page(url)
        if html is None:
            print("爬取主页出错了")
            return None
        # Second-level links: one URL per chapter.
        new_urls = self.parser.parser_index(html, url)
        self.manager.add_urls(new_urls)
        print("爬取 主页 + 所有文章url 完成")

    def run_async(self):
        """Crawl the index page, then download and parse all queued chapters.

        Each queued URL is submitted to a thread pool for download; the
        parser's ``parser_page`` is attached as a done-callback and receives
        the finished future.

        :return: None
        """
        self.index_work()
        # The ``with`` block shuts the pool down (waiting for pending work)
        # even if submitting a task raises.
        with ThreadPoolExecutor(self.MAX_WORKERS) as pool:
            # ``get_url`` returns None once the queue is drained.
            for url in iter(self.manager.get_url, None):
                future = pool.submit(self.downloader.down_page, url)
                future.add_done_callback(self.parser.parser_page)
        print("完了-----------------------")


# ``async`` is a reserved keyword since Python 3.7, so ``def async(self)`` is a
# SyntaxError. The method is defined as ``run_async`` and additionally exposed
# under its legacy name so existing callers (``getattr(s, "async")()`` or, on
# older interpreters, ``s.async()``) keep working.
setattr(SpiderMan, "async", SpiderMan.run_async)
# URL manager
#!/usr/bin/python
# -*- coding: utf-8 -*-


class UrlManager:
    """Track which chapter URLs are still pending and which were handed out."""

    def __init__(self):
        self.newurls = set()   # URLs waiting to be fetched
        self.oldurls = set()   # URLs already handed out by get_url()

    def add_url(self, newurl):
        """Queue a single chapter URL unless it was already handed out.

        :param newurl: URL to queue.
        :return: None
        """
        if newurl not in self.oldurls:
            self.newurls.add(newurl)

    def add_urls(self, newurls):
        """Queue several chapter URLs.

        :param newurls: iterable of URLs; ``None`` or an empty collection is
            accepted and ignored.
        :return: None
        """
        # Truthiness check also covers None, which ``len()`` would reject.
        if not newurls:
            return
        for url in newurls:
            self.add_url(url)

    def get_url(self):
        """Remove one pending URL, mark it as visited and return it.

        :return: a pending URL, or ``None`` when nothing is pending.
        """
        try:
            url = self.newurls.pop()
        except KeyError:
            # Empty set: callers use None as the "queue drained" signal.
            return None
        if url is not None:
            self.oldurls.add(url)
        return url

    def has_oldurls(self):
        """Return how many URLs have been handed out so far.

        :return: size of the visited set.
        """
        return len(self.oldurls)
# HTML downloader
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests


class Htmldown:
    """Fetch pages over HTTP with a browser-like User-Agent."""

    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0",
    }

    def down_page(self, url, timeout=10):
        """Download a page and return its text.

        :param url: address of the page to fetch.
        :param timeout: seconds to wait for the server before giving up;
            without it a stalled connection would block the worker forever.
        :return: the response body decoded as UTF-8 when the status code is
            200, otherwise ``None`` (also on any request error).
        """
        try:
            r = requests.get(url, headers=self.HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            # Callers already treat None as "download failed", so report the
            # error and use the same signal instead of raising.
            print("下载失败 %s: %s" % (url, exc))
            return None
        r.encoding = "utf8"
        if r.status_code == 200:
            return r.text
        return None
# HTML parser. Parsed chapters are written straight to files.
# TODO: persist them to MongoDB instead.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import re
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Characters that are not allowed in Windows file names, plus control chars.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def _safe_filename(title):
    """Return *title* with surrounding whitespace and unsafe characters removed."""
    return _UNSAFE_FILENAME_CHARS.sub("_", title.strip())


class Htmlparser:
    """Extract chapter links from the index page and save chapter text."""

    # Directory the original implementation wrote to; used when no other
    # directory is supplied.
    DEFAULT_OUTPUT_DIR = r"C:\Users\Administrator\Desktop\Article\db"

    def __init__(self, output_dir=None):
        """
        :param output_dir: directory for the saved chapter files; defaults to
            ``DEFAULT_OUTPUT_DIR``.
        """
        self.output_dir = output_dir or self.DEFAULT_OUTPUT_DIR

    def parser_index(self, html_conf, url):
        """Collect the chapter URLs listed on the index page.

        :param html_conf: HTML text of the index page.
        :param url: address of the index page, used as the base for links
            such as ``/book/439.html``.
        :return: list of absolute chapter URLs; empty when the page has no
            element with class ``chapterlist``.
        """
        soup = BeautifulSoup(html_conf, "html.parser")
        chapter_list = soup.find(class_="chapterlist")
        if chapter_list is None:
            return []
        # ``href=True`` skips anchors without a link; ``urljoin`` resolves
        # both rooted and relative hrefs against the index URL.
        return [
            urljoin(url, a["href"])
            for a in chapter_list.find_all("a", href=True)
        ]

    def parser_page(self, html_conf):
        """Parse a downloaded chapter page and write its text to a file.

        Intended as a done-callback: it calls ``result()`` on its argument.

        :param html_conf: finished future whose result is the page HTML, or
            ``None`` when the download failed.
        :return: None
        """
        try:
            html = html_conf.result()
        except Exception as exc:
            # Callback boundary: report the failed download instead of
            # letting the exception escape from the callback.
            print("章节下载出错: %s" % exc)
            return
        if html is None:
            print("章节下载失败, 已跳过")
            return

        soup = BeautifulSoup(html, "html.parser")
        heading = soup.find("h1")
        body = soup.find(id="BookText")
        if heading is None or body is None:
            print("章节页面结构异常, 已跳过")
            return

        title = heading.get_text()
        os.makedirs(self.output_dir, exist_ok=True)
        filepath = os.path.join(self.output_dir, "%s.txt" % _safe_filename(title))
        # Explicit UTF-8: the platform default codec may not be able to
        # encode the chapter text.
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(body.get_text())
        print("%s 下载完成" % title)
# 原文地址: https://www.cnblogs.com/52-qq/p/8343014.html
# 时间: 2024-10-30 12:38:05