各大免费IP的网站的反爬手段往往是封掉在一定时间内访问过于频繁的IP,因此在爬取的时候需要设定一定的时间间隔,不过说实话,免费代理很多时候基本都不能用,可能一千个下来只有十几个可以用,而且几分钟之后估计也扑街了。虽然有那种付费的大量代理IP,但是也不见得好,测试过,里面优质的也很少。目前体验比较好的还是私密代理,当然还有其他。贵有贵的道理。
import requests
import time
import random
from fake_useragent import UserAgent
from requests.exceptions import RequestException
from lxml import etree
import csv


class IPSpider(object):
    """Crawl free proxy IPs from kuaidaili, validate each one, and save the usable ones to CSV."""

    def __init__(self):
        # Listing pages of free proxies; the page number is appended in run().
        self.url = 'https://www.kuaidaili.com/free/inha/'
        # Baidu is used as a simple reachability probe for each proxy.
        self.url_test = 'http://www.baidu.com/'

    def get_headers(self):
        """Return request headers with a randomly chosen User-Agent.

        :return: dict with a random 'User-Agent' header
        """
        ua = UserAgent()
        return {'User-Agent': ua.random}

    def get_page(self, url, max_retries=5):
        """Fetch the HTML source of *url*, retrying on request errors.

        The original retried forever and let a ValueError escape on any
        non-200 status; a bounded retry treats both cases as retryable
        so a blocked/broken page cannot hang or crash the crawl.

        :param url: page URL to download
        :param max_retries: maximum number of attempts before giving up
        :return: response body text, or None if every attempt failed
        """
        for _ in range(max_retries):
            try:
                headers = self.get_headers()
                response = requests.get(url, headers=headers, verify=False)
                if response.status_code == 200:
                    return response.text
                print(response.status_code)
                raise ValueError("打开网页错误")
            except (RequestException, ValueError) as err:
                print(err)
        return None

    def parse_ip(self, text):
        """Extract (ip, port) pairs from a listing page.

        :param text: HTML source of a kuaidaili listing page
        :return: zip of ip strings and port strings
        """
        html = etree.HTML(text)
        ip = html.xpath("//tr/td[1]/text()")
        print(ip)
        port = html.xpath("//tr/td[2]/text()")
        print(port)
        return zip(ip, port)

    def test_ip(self, ip, port):
        """Check whether a proxy works by requesting Baidu through it.

        :param ip: proxy host
        :param port: proxy port (string)
        :return: (ip, port) if the proxy answered with 200, otherwise None
        """
        try:
            # These free proxies are plain HTTP, so the proxy URL uses the
            # http:// scheme for both mappings (an 'https://ip:port' proxy
            # URL would not work for an HTTP-only proxy).
            proxy = 'http://{}:{}'.format(ip, port)
            proxies = {
                'http': proxy,
                'https': proxy,
            }
            headers = self.get_headers()
            response = requests.get(url=self.url_test, headers=headers,
                                    proxies=proxies, timeout=8)
            if response.status_code == 200:
                print("%s可用" % ip)
                return ip, port
            return None
        except RequestException:
            print('%s失效' % ip)
            return None

    def save_ip(self, result):
        """Append validated (ip, port) rows to kuaidailiip.csv.

        :param result: list of (ip, port) tuples
        """
        if not result:
            return
        # newline='' prevents the csv module from writing blank rows on Windows.
        with open("kuaidailiip.csv", "a", newline='') as f:
            writer = csv.writer(f)
            writer.writerows(result)

    def run(self):
        """Main loop: crawl listing pages 1..1000, test each proxy, save the good ones."""
        for page in range(1, 1001):
            url = self.url + str(page) + '/'
            text = self.get_page(url)
            if text is None:
                # Page could not be fetched after retries; move on.
                continue
            result = []
            for ip, port in self.parse_ip(text):
                ok_ip = self.test_ip(ip, port)
                if ok_ip is not None:
                    result.append(ok_ip)
            self.save_ip(result)
            # Random pause between pages to avoid the site's rate-based IP ban.
            time.sleep(random.randint(5, 7))


if __name__ == '__main__':
    spider = IPSpider()
    spider.run()
原文地址:https://www.cnblogs.com/lattesea/p/11576055.html
时间: 2024-11-09 05:08:55