Code:
from bs4 import BeautifulSoup
from requests import Session
from time import sleep
import random
import re
import os


class ProxyIpPool(object):
    def __init__(self, page):
        object.__init__(self)
        self.page = page

    def init_proxy_ip_pool(self):
        url = 'https://www.kuaidaili.com/free/'
        tablelist = ['IP', 'PORT', '类型', '位置']
        ip = []
        port = []
        proxy_type = []
        position = []
        r = Session()
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'www.kuaidaili.com',
            # 'Referer': url,
            # When paging through the list, each page's Referer is the URL of the link you
            # followed to get there. For example, if you reach the first proxy page from
            # Baidu, the Referer is Baidu's URL.
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.168 Safari/537.36'
        }
        if self.page > 1:
            url = url + 'inha/' + str(self.page) + '/'
        request = r.get(url, headers=headers, timeout=2)
        print(request.status_code)
        soup = BeautifulSoup(request.text, 'lxml')
        tags = soup.find_all('td', attrs={'data-title': tablelist})
        # Extract all IPs
        ip_tag_match = re.compile(r'data-title="IP">(.+?)</td')
        ip.append(ip_tag_match.findall(str(tags)))
        # Extract all ports
        port_tag_match = re.compile(r'data-title="PORT">(.+?)</td')
        port.append(port_tag_match.findall(str(tags)))
        # Extract all proxy types
        type_match = re.compile(r'data-title="类型">(.+?)</td')
        proxy_type.append(type_match.findall(str(tags)))
        # Extract all locations
        position_tag_match = re.compile(r'data-title="位置">(.+?)</td')
        position.append(position_tag_match.findall(str(tags)))
        sleep(random.random() * 7)  # random pause so we don't hammer the site
        # Return ip, port, type, and position together as a dict
        data_title = {'ip': ip, 'port': port, 'type': proxy_type, 'position': position}
        return data_title


def create_proxy_ip_pool(page):
    pool = ProxyIpPool(page).init_proxy_ip_pool()
    print('Initialization done! Building the proxy pool...')
    iplist = pool.get('ip')
    portlist = pool.get('port')
    typelist = pool.get('type')
    positionlist = pool.get('position')
    for i in range(0, len(iplist[0])):
        print(format(iplist[0][i], '<22') + format(portlist[0][i], '<17')
              + format(typelist[0][i], '<12') + positionlist[0][i])
        try:
            with open('C:/Users/adimin/Desktop/proxyip.txt', 'a') as fp:
                fp.write(format(iplist[0][i], '<22') + format(portlist[0][i], '<17')
                         + format(typelist[0][i], '<12') + positionlist[0][i] + '\r\n')
        except OSError as err:
            print(err)
            os._exit(2)


if __name__ == '__main__':
    print('Initializing the proxy pool... please be patient...')
    print(format('IP', '^16') + format('PORT', '^16') + format('类型', '^16') + format('位置', '^16'))
    try:
        with open('C:/Users/adimin/Desktop/proxyip.txt', 'a') as fp:
            fp.write(format('IP', '^16') + format('PORT', '^16') + format('类型', '^16') + format('位置', '^16') + '\r\n')
    except OSError:
        with open('C:/Users/adimin/Desktop/proxyip.txt', 'w') as fp:
            fp.write(format('IP', '^16') + format('PORT', '^16') + format('类型', '^16') + format('位置', '^16') + '\r\n')
    # Not sure why multi-page scraping only works with the loop out here: if the loop is
    # moved into init_proxy_ip_pool, it only ever scrapes a little more than one page...
    for i in range(1, 2177):
        create_proxy_ip_pool(i)
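As an aside, running regexes over str(tags) is brittle: any change in attribute order or whitespace breaks the patterns. A minimal alternative sketch that reads each cell directly through BeautifulSoup instead (parse_proxies is a hypothetical helper name; it assumes the same data-title markup the regexes above rely on):

from bs4 import BeautifulSoup

def parse_proxies(html):
    # Collect one dict per table row, keyed by each cell's data-title attribute.
    soup = BeautifulSoup(html, 'lxml')
    proxies = []
    for row in soup.find_all('tr'):
        cells = {td.get('data-title'): td.get_text(strip=True)
                 for td in row.find_all('td', attrs={'data-title': True})}
        if 'IP' in cells and 'PORT' in cells:
            proxies.append(cells)  # e.g. {'IP': ..., 'PORT': ..., '类型': ..., '位置': ...}
    return proxies

This keeps the four columns paired per row, so a short page or a missing cell can't shift one list out of alignment with the others.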
Run result: (screenshot omitted)
Saved locally: (screenshot omitted)
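Once entries are saved to proxyip.txt, you can spot-check whether a proxy still works by routing a test request through it with requests' proxies parameter. A minimal sketch, assuming an ip/port pair from the file; httpbin.org/ip is just a convenient test endpoint, not something the original post uses:

import requests

def proxy_alive(ip, port, timeout=5):
    # Route a test request through the proxy; any exception or non-200 counts as dead.
    proxy = 'http://{}:{}'.format(ip, port)
    try:
        resp = requests.get('http://httpbin.org/ip',
                            proxies={'http': proxy, 'https': proxy},
                            timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

# Usage: proxy_alive('1.2.3.4', '8080')

Free proxies go stale quickly, so filtering the pool this way before use saves a lot of timeouts later.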
Original article: https://www.cnblogs.com/darkchii/p/8475945.html