1 m Queue import Queue 2 from gzip import GzipFile 3 from StringIO import StringIO 4 import time 5 import socket 6 class ContentEncodingProcessor(urllib2.BaseHandler): 7 """A handler to add gzip capabilities to urllib2 requests """ 8 9 # add headers to requests 10 def http_request(self, req): 11 req.add_header("Accept-Encoding", "gzip, deflate") 12 return req 13 14 # decode 15 def http_response(self, req, resp): 16 old_resp = resp 17 18 # if(resp.geturl() != req): 19 # print ‘no‘ 20 # return 1 21 # gzip 22 if resp.headers.get("content-encoding") == "gzip": 23 gz = GzipFile( 24 fileobj=StringIO(resp.read()), 25 mode="r" 26 ) 27 resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) 28 resp.msg = old_resp.msg 29 # deflate 30 if resp.headers.get("content-encoding") == "deflate": 31 gz = StringIO( deflate(resp.read()) ) 32 resp = urllib2.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) # ‘class to add info() and 33 resp.msg = old_resp.msg 34 return resp 35 36 # deflate support 37 import zlib 38 def deflate(data): # zlib only provides the zlib compress format, not the deflate format; 39 try: # so on top of all there‘s this workaround: 40 return zlib.decompress(data, -zlib.MAX_WBITS) 41 except zlib.error: 42 return zlib.decompress(data) 43 44 45 #(set timeout) 46 socket.setdefaulttimeout(10) 47 48 encoding_support = ContentEncodingProcessor 49 opener = urllib2.build_opener( encoding_support, urllib2.HTTPHandler) 50 51 class Fetcher: 52 def __init__(self,threads): 53 self.opener = urllib2.build_opener(urllib2.HTTPHandler) 54 self.lock = Lock() #线程锁 55 self.q_req = Queue() #任务队列 56 self.q_ans = Queue() #完成队列import socket 57 self.threads = threads 58 for i in range(threads): 59 t = Thread(target=self.threadget) 60 t.setDaemon(True) 61 t.start() 62 self.running = 0 63 64 def __del__(self): #解构时需等待两个队列完成 65 time.sleep(0.5) 66 self.q_req.join() 67 self.q_ans.join() 68 69 def taskleft(self): 70 return 
self.q_req.qsize()+self.q_ans.qsize()+self.running 71 72 def push(self,req): 73 self.q_req.put(req) 74 75 def pop(self): 76 return self.q_ans.get() 77 78 def threadget(self): 79 while True: 80 ans = ‘‘ 81 req = self.q_req.get() 82 # print req 83 84 with self.lock: #要保证该操作的原子性,进入critical area 85 self.running += 1 86 87 try: 88 # ans = self.opener.open(req).read() 89 #content = opener.open(req).read() 90 content = urllib2.urlopen(req).read() 91 # print temp.geturl() 92 # print req 93 # add gzip support from here 94 ans = str(content) 95 except Exception, what: 96 print what 97 pass 98 99 self.q_ans.put((ans,req)) 100 with self.lock: 101 self.running -= 1 102 self.q_req.task_done() 103 time.sleep(0.01) # don‘t spam 104 105 if __name__ == "__main__": 106 a = [0] * 3600000 107 links = [ ‘http://www.songtaste.com/song/%d/‘%i for i in range(1,3600000) ] 108 f = Fetcher(threads=50) 109 for url in links: 110 f.push(url) 111 while f.taskleft(): 112 the_page,x =f.pop() 113 # print the_page 114 try: 115 npos = the_page.index(‘chart#fav‘) 116 except : 117 pass 118 else: 119 for j in range(npos,1,-1): 120 if the_page[j] == ‘,‘: 121 k = j 122 break 123 sum = 0 ; 124 t = 1 ; 125 for j in range(k-1,1,-1): 126 if the_page[j] <= ‘9‘ and the_page[j] >=‘0‘: 127 sum = sum + (int(the_page[j]) - int(‘0‘)) * t 128 t *= 10; 129 else : 130 break 131 p = int(x[30:-1]) 132 if(p % 10000 <= 5 ) 133 a[p] = sum 134 if sum != 0: 135 print p 136 print sum 137
A multi-threaded web crawler based on Python's urllib2 module — bubuko.com
时间: 2024-10-26 20:52:51