1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 # @Date : 2017-08-24 10:17:28 4 # @Author : EnderZhou ([email protected]) 5 # @Link : http://www.cnblogs.com/enderzhou/ 6 # @Version : $Id$ 7 8 import requests 9 from bs4 import BeautifulSoup as bs 10 import threading 11 import Queue 12 import urllib 13 14 class jiandan_ooxx(threading.Thread): 15 def __init__(self,queue): 16 threading.Thread.__init__(self) 17 self._queue = queue 18 19 def run(self): 20 while not self._queue.empty(): 21 url = self._queue.get_nowait() 22 self.spider(url) 23 24 def spider(self,url): 25 r = requests.get(url = url) 26 soup = bs(r.content,‘html.parser‘) 27 imges = soup.find_all(name=‘img‘,attrs={}) 28 lists = [] 29 for i in imges: 30 if ‘border‘ in str(i): 31 continue 32 elif ‘onload‘ in str(i): 33 lists.append(i[‘org_src‘]) 34 print i[‘org_src‘] 35 img = ‘http:‘ + i[‘org_src‘] 36 else: 37 lists.append(i[‘src‘]) 38 print i[‘src‘] 39 img = ‘http:‘ + i[‘src‘] 40 name = img.split(‘/‘)[-1] 41 urllib.urlretrieve(img,filename=name) 42 43 def main(number): 44 url = ‘http://jandan.net/ooxx/page-‘ 45 headers = {} 46 queue = Queue.Queue() 47 48 # 此处由最新页面开始爬取,默认爬取最新10页的图片,把number-10改成1即可爬取全部页面的图片。 49 for i in xrange(number,number-10,-1): 50 queue.put(url+str(i)) 51 threads = [] 52 thread_count = 10 53 54 for i in range(thread_count): 55 threads.append(jiandan_ooxx(queue)) 56 57 for t in threads: 58 t.start() 59 for t in threads: 60 t.join() 61 62 if __name__ == ‘__main__‘: 63 # 获取最新页码并传入main函数 64 r = requests.get(‘http://jandan.net/ooxx‘) 65 soup = bs(r.content,‘html.parser‘) 66 string = soup.find_all(name=‘span‘,attrs={‘class‘:‘current-comment-page‘}) 67 number = int(string[1].string[1:-1]) 68 main(number)
# Time: 2024-10-08 02:28:50