#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape the "hot" listing pages of qiushibaike.com into content.txt.

Author's note (translated from the original post): this started from an
online tutorial that was written with regular expressions and did not run.
It was rewritten to use XPath, the logic was reworked, and the pages are
fetched concurrently. (The note says "two threads"; the pool below is
created with four workers.)

The script is written to run on both Python 2.7 and Python 3: all text is
kept as unicode and the output file is opened with an explicit UTF-8
encoding, so no ``sys.setdefaultencoding`` hack is needed.
"""
import io
import sys
import threading
from multiprocessing.dummy import Pool as ThreadPool

import requests
from lxml import etree

BASE_URL = 'http://www.qiushibaike.com/hot/page/'
FIRST_PAGE = 1
LAST_PAGE = 35
OUTPUT_PATH = 'content.txt'
WORKER_COUNT = 4
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs a worker
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}

# XPath of one post container, relative to the document root.
POST_XPATH = '//*[@id="content-left"]/div[@class="article block untagged mb15"]'

# Output file shared by all worker threads; main() opens it and binds it here
# before any worker starts.
f = None
# Serializes writes so records produced by different threads never interleave.
_write_lock = threading.Lock()


def towrite(contentdict):
    """Append one scraped post to the shared output file ``f``.

    Args:
        contentdict: mapping with the keys ``author``, ``content``, ``vote``
            and ``span`` (the comment count).

    Raises:
        KeyError: if one of the four keys is missing.
    """
    record = u'作者:{0}\n内容:{1}\n好笑:{2}\n评论:{3}\n\n'.format(
        contentdict['author'],
        contentdict['content'],
        contentdict['vote'],
        contentdict['span'],
    )
    # Build the whole record first and write it in a single locked call, so
    # the four lines of a post always stay together in the file.
    with _write_lock:
        f.write(record)


def _remove_chars(text, chars):
    """Return ``text`` with every character listed in ``chars`` removed."""
    for ch in chars:
        text = text.replace(ch, '')
    return text


def _first_text(nodes, default):
    """Return the first XPath result as text, or ``default`` if there is none."""
    return u'{0}'.format(nodes[0]) if nodes else default


def _parse_post(post):
    """Extract author, content, vote count and comment count from one post node.

    Missing author yields an empty string; missing vote or comment counts
    yield ``'0'`` (the page omits the ``<i>`` node when there are no comments).
    """
    author_nodes = post.xpath('div[@class="author clearfix"]')
    author = ''
    if author_nodes:
        author = _remove_chars(author_nodes[0].xpath('string(.)'), '\n ')

    content = ''.join(
        _remove_chars(piece, '\n \t')
        for piece in post.xpath('div[@class="content"]/text()')
    )

    vote = _first_text(
        post.xpath('div[@class="stats"]/span[@class="stats-vote"]/i/text()'),
        '0',
    )
    span = _first_text(
        post.xpath(
            'div[@class="stats"]/span[@class="stats-comments"]/a/i/text()'
        ),
        '0',
    )
    return {'author': author, 'content': content, 'vote': vote, 'span': span}


def spider(url):
    """Download one listing page and write every post found on it.

    A page that cannot be fetched is reported on stderr and skipped, so one
    bad page does not abort the remaining downloads.

    Args:
        url: address of the listing page to fetch.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        sys.stderr.write('skipping {0}: {1}\n'.format(url, exc))
        return

    page_text = response.text
    if not page_text.strip():
        # Nothing to parse; avoid handing an empty document to the parser.
        return

    selector = etree.HTML(page_text)
    for post in selector.xpath(POST_XPATH):
        towrite(_parse_post(post))


def main():
    """Fetch every listing page with a thread pool and append posts to disk."""
    global f
    urls = [
        '{0}{1}'.format(BASE_URL, page)
        for page in range(FIRST_PAGE, LAST_PAGE + 1)
    ]
    pool = ThreadPool(WORKER_COUNT)
    with io.open(OUTPUT_PATH, 'a', encoding='utf-8') as out:
        f = out
        try:
            pool.map(spider, urls)
        finally:
            # Wait for the workers before the file is closed by the ``with``.
            pool.close()
            pool.join()


if __name__ == '__main__':
    main()
时间: 2024-10-03 04:16:48