# -*- coding:utf-8 -*-
# Requirements: Python 3 plus the third-party packages imported below (requests, lxml).
import requests,queue,time,threading,os
from lxml import etree
class MessageSpider(object):
    """Crawl the SMS-template pages of www.aizhufu.cn and save them as text files.

    The work is split into three stages connected by two queues:

    * ``get_total`` fills ``total_url`` with one ``{column_id: total_pages}``
      dict per category.
    * ``get_content`` drains ``total_url``, downloads every page of each
      category and puts one ``{title: [message, ...]}`` dict into ``content``.
    * ``save`` drains ``content`` and writes one ``<title>.txt`` file per
      category into ``OUTPUT_DIR``.

    ``run`` executes ``get_total`` and then runs the other two stages in
    parallel threads.
    """

    # Seconds to wait for a single HTTP response; without a timeout a stalled
    # connection would block the crawler forever.
    REQUEST_TIMEOUT = 10
    # Seconds ``save`` blocks waiting for new content before it re-checks
    # whether the parser still has work in flight.
    POLL_INTERVAL = 1
    # Directory that receives one text file per category.
    OUTPUT_DIR = '短信库'

    def __init__(self):
        """Set up the URL template, the two work queues and the HTTP headers."""
        # Placeholders are (column id, page number).
        self.start_url = 'http://www.aizhufu.cn/duanxinku/column/{}/{}.html'
        self.total_url = queue.Queue()
        self.content = queue.Queue()
        self.header = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/62.0.3202.94 Safari/537.36'
            )
        }

    def get_url(self, cid, page):
        """Download one listing page and return it as a parsed lxml tree.

        Args:
            cid: Column (category) id inserted into the URL template.
            page: Page number inserted into the URL template.

        Returns:
            The result of ``etree.HTML`` on the decoded response body.

        Raises:
            requests.RequestException: On network errors or timeout.
        """
        url = self.start_url.format(cid, page)
        print('正在获取', url)
        response = requests.get(
            url, headers=self.header, timeout=self.REQUEST_TIMEOUT
        )
        # Decode explicitly as UTF-8 and replace undecodable bytes so one bad
        # byte does not discard a whole page.
        html_text = response.content.decode('utf-8', errors='replace')
        return etree.HTML(html_text)

    def get_total(self):
        """Discover all categories and queue ``{column_id: total_pages}`` dicts.

        Categories whose first page exposes no ``totalpage`` attribute are
        skipped.
        """
        print('开始获取总url')
        index_page = self.get_url(77, 1)
        column_ids = index_page.xpath('//a[@class="tip"]/@columnid')
        for column_id in column_ids:
            first_page = self.get_url(column_id, 1)
            total_page = first_page.xpath('//img[@pageno="1"]/@totalpage')
            if not total_page:
                continue
            self.total_url.put({column_id: total_page[0]})
        print('总url解析完成')

    def get_content(self):
        """Drain ``total_url`` and publish the parsed messages to ``content``.

        Every item taken from ``total_url`` is acknowledged with
        ``task_done()`` even when downloading or parsing fails, so
        ``total_url.join()`` cannot hang on a failed category.
        """
        print('开始解析网页内容')
        while True:
            try:
                sub_class = self.total_url.get_nowait()
            except queue.Empty:
                break
            print('total_url.qsize:', self.total_url.qsize(),
                  "\ncontent_qsize:", self.content.qsize())
            try:
                sub_message = {}
                for column_id, page_count in sub_class.items():
                    title, messages = self._fetch_category(column_id, page_count)
                    sub_message[title] = messages
                # Publish BEFORE task_done(): ``save`` treats "no unfinished
                # total_url tasks" as "no more content will arrive".
                self.content.put(sub_message)
                print(', '.join(sub_message), '类短信解析完成')
            except Exception as err:
                # Worker-thread boundary: report and move on so one broken
                # category does not kill the thread and deadlock ``run``.
                print('解析失败:', sub_class, err)
            finally:
                self.total_url.task_done()
            print('解析函数中', 'total_url.qsize:', self.total_url.qsize(),
                  "\ncontent_qsize:", self.content.qsize())

    def _fetch_category(self, column_id, page_count):
        """Download all pages of one category; return ``(title, messages)``."""
        title = None
        messages = []
        for page in range(1, int(page_count) + 1):
            html = self.get_url(column_id, page)
            class_title = html.xpath('//div[@class="r_title"]/b/text()')
            message_list = html.xpath(
                '//ul[@class="list"]/li/span/@original-title'
            )
            print(class_title, '\n', message_list)
            if class_title:
                title = class_title[0]
            messages += message_list
        if title is None:
            # No page carried a title; fall back to the column id so the
            # messages are still saved instead of raising IndexError.
            title = str(column_id)
        return title, messages

    def save(self):
        """Write every ``{title: messages}`` item from ``content`` to disk.

        Keeps running while the parser still has unacknowledged ``total_url``
        items or ``content`` is non-empty, and returns once both are
        exhausted. Every consumed item is acknowledged with ``task_done()``
        even if writing fails, so ``content.join()`` cannot hang.
        """
        print('开始保存短信')
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        # ``unfinished_tasks`` (not ``qsize``) is checked on purpose: after the
        # parser has taken the last item off ``total_url`` its qsize is 0
        # although the result has not been published yet. The parser calls
        # ``task_done()`` only after ``content.put()``, so once the counter
        # reaches 0 every result is already visible in ``content``.
        while self.total_url.unfinished_tasks or not self.content.empty():
            try:
                content = self.content.get(timeout=self.POLL_INTERVAL)
            except queue.Empty:
                print('保存短信操作在等待中。。。')
                continue
            try:
                for title, messages in content.items():
                    self._write_category(title, messages)
                    print(title, '类短信保存成功!')
            except OSError as err:
                print('保存失败:', err)
            finally:
                self.content.task_done()
        print('所有短信保存成功!')

    def _write_category(self, title, messages):
        """Write one category as numbered, blank-line-separated messages."""
        file_name = self._safe_file_name(title) + '.txt'
        full_name = os.path.join(self.OUTPUT_DIR, file_name)
        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the scraped text.
        with open(full_name, 'w', encoding='utf-8') as f:
            f.writelines(
                '{}、{}\n\n'.format(index, msg.replace('\xa0', ' '))
                for index, msg in enumerate(messages, 1)
            )

    @staticmethod
    def _safe_file_name(title):
        """Replace characters that are unsafe in file names with ``_``."""
        # The title is scraped from a web page, so it must not be able to
        # inject path separators or other reserved characters.
        cleaned = ''.join(
            '_' if ch in '\\/:*?"<>|' or ord(ch) < 32 else ch
            for ch in title
        ).strip()
        return cleaned or 'untitled'

    def run(self):
        """Collect the category list, then parse and save in parallel threads."""
        self.get_total()
        workers = [
            threading.Thread(target=self.get_content, daemon=True),
            threading.Thread(target=self.save, daemon=True),
        ]
        for worker in workers:
            worker.start()
        self.total_url.join()
        self.content.join()
        # Also wait for the threads themselves so their final status lines
        # are printed before the interpreter exits.
        for worker in workers:
            worker.join()
if __name__ == "__main__":
    # Script entry point: crawl every category and write the text files.
    ms = MessageSpider()
    ms.run()
# Original source: https://www.cnblogs.com/liu-xiaobai/p/8536612.html