代码使用 Python 2.7,抓取西刺(xici)免费代理,检测可用性后存入 MongoDB 数据库,为以后的爬虫做准备。下面直接上代码:
# -*- coding: utf-8 -*-
"""Build a small HTTP proxy pool backed by MongoDB.

The script crawls the free proxy listing at xicidaili.com, upserts every
proxy it finds into the ``xici.xiciipinfo`` collection, then probes each
stored proxy and deletes the ones that fail the probe.

Written for Python 2.7.  Every ``print`` below takes exactly one
parenthesised argument, so the same source also parses under Python 3.
"""

import requests
from lxml import etree
import time
import pymongo
from multiprocessing import Pool  # kept from the original listing; not used below


def _first_text(node, path):
    """Return the first XPath result of ``path`` under ``node``, or '' if none."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


class Getproxy(object):
    """Crawl proxies into MongoDB and prune the ones that do not work."""

    def __init__(self):
        """Set the crawl headers/URL and open the local MongoDB collection."""
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
        self.url = 'http://www.xicidaili.com/wt/'
        self.client = pymongo.MongoClient('localhost', 27017)
        self.xici = self.client['xici']
        self.xiciipinfo = self.xici['xiciipinfo']

    def getip(self, num):
        """Fetch listing page ``num`` and upsert every proxy row into MongoDB.

        Only ``<tr class="odd">`` rows are read.  Within a row, cell 2 is
        stored as ``ip``, cell 3 as ``port``, the link text in cell 4 as
        ``area`` and cell 6 as ``protocol``.

        Args:
            num: Page number appended to the listing URL.

        Raises:
            requests.RequestException: If the listing page cannot be fetched
                (including when the request times out).
        """
        url = self.url + str(num)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        wb_data = requests.get(url, headers=self.headers, timeout=10)
        html = etree.HTML(wb_data.text)
        if html is None:
            # Nothing parseable came back, so there are no rows to store.
            return
        # Extract the fields row by row.  Collecting each column separately
        # and zipping them mis-pairs ip/port/area as soon as one row lacks a
        # cell (for example an area cell without a link).
        for row in html.xpath('//tr[@class="odd"]'):
            ip = _first_text(row, 'td[2]/text()')
            port = _first_text(row, 'td[3]/text()')
            if not ip or not port:
                continue
            data = {
                'ip': ip,
                'port': port,
                'protocol': _first_text(row, 'td[6]/text()'),
                'area': _first_text(row, 'td[4]/a/text()'),
            }
            print(data)
            # Upsert keyed on the ip so re-crawling refreshes the existing
            # document instead of inserting a duplicate.
            self.xiciipinfo.update_one({'ip': ip}, {'$set': data}, upsert=True)

    def count(self, num):
        """Crawl listing pages 1 .. ``num`` - 1, sleeping 2 seconds after each.

        The upper bound is exclusive: ``count(2)`` crawls page 1 only.
        """
        for i in range(1, num):
            self.getip(i)
            time.sleep(2)

    def dbclose(self):
        """Close the MongoDB client connection."""
        self.client.close()

    def getiplist(self):
        """Return every stored proxy as a ``requests``-style proxies dict.

        Returns:
            A list of ``{"http": "http://<ip>:<port>"}`` dicts, one per
            document in the collection.
        """
        return [
            {"http": "http" + "://" + doc['ip'] + ":" + doc['port']}
            for doc in self.xiciipinfo.find()
        ]

    def iptest(self, proxy):
        """Probe one proxy and delete it from MongoDB when the probe fails.

        Args:
            proxy: A ``{"http": "http://<ip>:<port>"}`` dict as produced by
                ``getiplist``.
        """
        # Drop the 7-character "http://" prefix, then the ":<port>" suffix.
        ip = proxy['http'][7:].split(':')[0]
        try:
            requests.get('http://wenshu.court.gov.cn/', proxies=proxy, timeout=6)
        except Exception:
            # Any failure counts as a dead proxy (best effort).  Catching
            # Exception instead of using a bare except lets Ctrl-C and
            # SystemExit stop the run rather than deleting a proxy.
            print('field...............>>>>>>>>>>>>>>>>>>>>>>>>')
            self.xiciipinfo.delete_many({'ip': ip})
            print('remove it now.....{}'.format(ip))
        else:
            print('<<<<<<<<<<<<<<<<<.............success')
            print(proxy)


if __name__ == '__main__':
    proxy = Getproxy()
    try:
        proxy.count(2)
        for candidate in proxy.getiplist():
            proxy.iptest(candidate)
    finally:
        # Release the MongoDB connection even if crawling or probing raised.
        proxy.dbclose()
ip代理池-基于mongodb数据库
时间: 2024-10-12 02:25:54