1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 # @Date : 2017-08-29 18:38:23 4 # @Author : EnderZhou ([email protected]) 5 # @Link : http://www.cnblogs.com/enderzhou/ 6 # @Version : $Id$ 7 8 import requests 9 import sys 10 from Queue import Queue 11 import threading 12 from bs4 import BeautifulSoup as bs 13 import re 14 15 # 默认爬取百度76页搜索结果url,调用格式 Python.exe 本文件名称.py 搜索关键字,如关键字含特殊符号使用引号包含起来。 16 # 爬取结果有txt文档输出。目前尚未能过来百度推广链接,后续有可能会完善。另外后续将会添加同一网站相同路径不通参数url的过滤。 17 # https://www.baidu.com/s?wd=ichunqiu&pn=10 18 # wd参数为搜索内容关键字 pn参数控制页码 第二页为10 每页新增10 最大页数参数为750即76页。 19 20 headers = {‘User-Agent‘:‘Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.78 Safari/537.36‘,} 21 22 class BaiduSpider(threading.Thread): 23 def __init__(self,queue): 24 threading.Thread.__init__(self) 25 self._queue = queue 26 27 def run(self): 28 while not self._queue.empty(): 29 url = self._queue.get() 30 try: 31 self.spider(url) 32 except Exception as e: 33 # print e 34 pass 35 36 def spider(self,url): 37 r = requests.get(url=url,headers=headers) 38 soup = bs(r.content,‘html.parser‘) 39 urllist = soup.find_all(name=‘a‘,attrs={‘data-click‘:re.compile((‘.‘)),‘class‘:None,‘data-is-main-url‘:None}) 40 for i in urllist: 41 l = requests.get(url=i[‘href‘],headers=headers) 42 if l.status_code == 200: 43 ll = l.url.split(‘/‘) 44 lll = ll[0]+‘//‘+ll[2]+‘\n‘ 45 #可根据需求修改是否显示主域名 46 sys.stdout.write(lll+l.url+‘\n‘) 47 f1 = open(‘out_para.txt‘,‘a+‘) 48 f1.write(l.url+‘\n‘) 49 f1.close() 50 with open(‘out_index.txt‘) as f: 51 if lll not in f.read(): 52 f2 = open(‘out_index.txt‘,‘a+‘) 53 f2.write(lll) 54 f2.close() 55 56 def main(keyword): 57 queue = Queue() 58 for i in range(0,760,10): 59 l = ‘https://www.baidu.com/s?wd=‘+keyword+‘&pn=‘+str(i) 60 # print l 61 queue.put(l) 62 threads = [] 63 thread_count = 5 64 for i in range(thread_count): 65 threads.append(BaiduSpider(queue)) 66 for t in threads: 67 t.start() 68 for t in threads: 69 t.join() 70 71 if __name__ == 
‘__main__‘: 72 if len(sys.argv) != 2: 73 print ‘Enter:python %s keyword‘ % sys.argv[0] 74 sys.exit(-1) 75 else: 76 f1 = open(‘out_para.txt‘,‘w‘) 77 f1.close() 78 f2 = open(‘out_index.txt‘,‘w‘) 79 f2.close() 80 main(sys.argv[1])
# Time: 2024-10-07 06:38:48