# -*- coding: utf-8 -*-
"""Repeatedly load a tianyancha.com company page through PhantomJS via a proxy.

Each attempt starts a fresh PhantomJS session, loads ``url``, waits a fixed
number of seconds, writes the rendered page source to ``logs/2.html``
(overwritten on every attempt) and prints the attempt number with the page
title.
"""
import os
from time import sleep

# NOTE: requests, lxml.html, By, WebDriverWait and EC are only referenced by
# the disabled snippets kept as comments below; the imports are retained so
# those snippets can be re-enabled without further edits.
import requests
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

url = 'http://www.tianyancha.com/company/638562997'

# Disabled experiment (originally parked in a bare string literal): fetch the
# same page with plain ``requests`` and print title, body length and status.
#
# for i in range(0, 100000):
#     data = requests.get(url)
#     dataHtml = html.fromstring(data.content)
#     print dataHtml.xpath('//title/text()')[0], len(data.content), data.status_code

# Number of page loads to perform.
ATTEMPTS = 50
# Fixed pause after ``driver.get`` before the page source is captured.
RENDER_WAIT_SECONDS = 5
# Destination of the captured HTML; every attempt overwrites the same file.
OUTPUT_PATH = 'logs/2.html'

# Start from the stock PhantomJS capabilities and layer the overrides on top.
# (The original rebound ``dcap`` to a brand-new dict, silently discarding the
# defaults it had just copied.)
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap.update({
    "phantomjs.page.settings.userAgent":
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36",
    "phantomjs.page.settings.loadImages": False,
    "phantomjs.page.settings.resourceTimeout": 5000,
})

# Command-line switches handed to the PhantomJS executable.
proxy = [
    '--proxy=120.27.142.209:82',
    '--proxy-type=http',
    '--ignore-ssl-errors=true',
    '--ssl-protocol=tlsv1',
]


def fetch_once(attempt):
    """Load ``url`` in a fresh PhantomJS session and dump the rendered HTML.

    Writes the UTF-8 encoded page source to ``OUTPUT_PATH`` and prints
    ``attempt`` followed by the page title.  The browser process is always
    shut down, even if loading or writing fails.

    :param attempt: attempt number, used only in the printed progress line.
    """
    driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=proxy)
    # driver = webdriver.Chrome()
    try:
        driver.get(url)
        # Explicit-wait alternative to the fixed sleep; note that the locator
        # must be passed as a single (by, value) tuple:
        # wait = WebDriverWait(driver, 10)
        # a = wait.until(EC.presence_of_element_located(
        #     (By.CSS_SELECTOR, 'div.datatable')))
        sleep(RENDER_WAIT_SECONDS)
        # Binary mode + explicit encode behaves the same on Python 2 and 3.
        with open(OUTPUT_PATH, 'wb') as out_file:
            out_file.write(driver.page_source.encode('utf8'))
        print('%d %s' % (attempt, driver.title))
    finally:
        # Without this every attempt leaves a PhantomJS process running.
        driver.quit()


def main():
    """Run ``ATTEMPTS`` sequential page loads."""
    out_dir = os.path.dirname(OUTPUT_PATH)
    # Create the output directory up front so a missing ``logs/`` does not
    # fail the run only after the first (slow) page load.
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for i in range(0, ATTEMPTS):
        fetch_once(i)


if __name__ == '__main__':
    main()
# Time: 2024-10-17 10:20:06