from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Headless-browser demo: open Baidu in Chrome without a visible window,
# submit a search and print the type of the resulting page source.
#
# NOTE(review): executable_path=, chrome_options= and find_element_by_id are
# the Selenium 3 spellings the script was written with; confirm the installed
# Selenium release still accepts them before running.

# Options object that tells Chrome to start without a UI.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Start the driver. A raw string keeps every backslash of the Windows path
# literal instead of relying on "\c" not being an escape sequence.
bro = webdriver.Chrome(executable_path=r'F:\anaconda\chromedriver.exe',
                       chrome_options=chrome_options)
try:
    # Load the page.
    bro.get(url='http://www.baidu.com')

    # Optional: capture a screenshot of the loaded page.
    # bro.save_screenshot('first.jpg')

    # Locate the input element with id "kw" and type the query into it.
    my_input = bro.find_element_by_id('kw')
    my_input.send_keys('美女')

    # Locate the "百度一下" button (id "su") and click it to submit.
    my_button = bro.find_element_by_id('su')
    my_button.click()

    # Source of the page currently shown by the browser.
    page_text = bro.page_source
    print(type(page_text))
finally:
    # Always shut the browser and driver process down, even if a step failed.
    bro.quit()
无头浏览器实现
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Scroll demo: open a 36kr listing page, scroll to the bottom a few times so
# the dynamically loaded entries are fetched, then print the rendered page
# source and save it to ./36k.html.

SCROLL_ROUNDS = 3  # number of scroll-to-bottom passes
SCROLL_PAUSE = 2   # seconds to wait after each pass for new content

# Raw string so the backslashes in the Windows path are taken literally.
bro = webdriver.Chrome(r'F:\anaconda\chromedriver.exe')
url = 'https://36kr.com/information/contact'
# JavaScript that moves the viewport to the bottom of the document.
js = 'window.scrollTo(0, document.body.scrollHeight)'
try:
    bro.get(url=url)

    # Pause after every scroll, including the last one; otherwise the final
    # scroll has no time to load anything before the source is captured.
    for _ in range(SCROLL_ROUNDS):
        bro.execute_script(js)
        sleep(SCROLL_PAUSE)

    page_text = bro.page_source
    print(page_text)
    with open('./36k.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
finally:
    # quit() rather than close(): close() only closes the current window,
    # while quit() also ends the driver session. Runs even on failure.
    bro.quit()
加载滚动条
import os
import random
import re
import uuid
from multiprocessing.dummy import Pool

import requests
from lxml import etree

# Thread-pool demo: collect the video links from a pearvideo category page,
# download the videos concurrently and write each one to the video/ folder.

url = 'https://www.pearvideo.com/category_8'
headers = {
    # The key must be spelled "User-Agent"; the original "Use-Agent" meant
    # this browser identification was never sent as the User-Agent.
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

# NOTE(review): verify=False turns off TLS certificate verification for this
# request; kept from the original, confirm it is really required.
response = requests.get(url=url, headers=headers, verify=False).content.decode()
xpath_data = etree.HTML(response)
li_list = xpath_data.xpath('//*[@id="listvideoListUl"]/li')

# URLs of the video files; this list is the work queue for the thread pool.
video_url_list = []
for li in li_list:
    hrefs = li.xpath('.//div[@class="vervideo-bd"]/a/@href')
    if not hrefs:
        # List item without a detail link: nothing to download.
        continue
    v_href = 'https://www.pearvideo.com/' + hrefs[0]
    d_response = requests.get(url=v_href, headers=headers).content.decode()
    # The detail page is searched for a srcUrl="..." assignment.
    matches = re.findall(r'srcUrl="(.*?)",', d_response, re.S)
    if not matches:
        # Report and skip instead of crashing the whole run on one page.
        print('no srcUrl found in ' + v_href)
        continue
    video_url_list.append(matches[0])


def download_video(link):
    """Return the raw bytes of the response body fetched from *link*."""
    return requests.get(url=link, headers=headers).content


def save_video(data):
    """Write *data* (video bytes) to a new, uniquely named file in video/.

    A UUID-based name replaces the original random number in 1..1000, which
    could repeat and silently overwrite a video saved earlier.
    """
    # Create the target directory on demand so open() cannot fail on it.
    os.makedirs('video', exist_ok=True)
    video_name = 'video/' + uuid.uuid4().hex + '.mp4'
    with open(video_name, 'wb') as fp:
        fp.write(data)


# Pool of 5 worker threads (multiprocessing.dummy is the thread-based Pool).
pool = Pool(5)
try:
    # map() returns the downloaded binary contents in input order.
    video_data_list = pool.map(download_video, video_url_list)
    pool.map(save_video, video_data_list)
finally:
    # Release the worker threads even if a download or write raised.
    pool.close()
    pool.join()
多线程的实现
from time import sleep

from selenium import webdriver

# iframe demo: open the Qzone landing page, switch into the login iframe,
# choose the account/password form, fill it in and submit.
#
# NOTE(review): find_element_by_id is the Selenium 3 spelling the script was
# written with; confirm the installed Selenium release still accepts it.

# Raw string so the backslashes in the Windows path are taken literally.
bro = webdriver.Chrome(r'F:\anaconda\chromedriver.exe')
try:
    bro.get('https://qzone.qq.com/')
    sleep(1)

    # The elements used below are looked up after switching into the frame
    # named "login_frame"; lookups then apply to that frame's document.
    bro.switch_to.frame('login_frame')

    # Click the element with id "switcher_plogin" before filling the form.
    user = bro.find_element_by_id('switcher_plogin')
    user.click()
    sleep(3)

    # '*****' looks like a redacted placeholder; supply the real account
    # name and password before running.
    username = bro.find_element_by_id('u')
    username.send_keys('*****')
    sleep(3)

    password = bro.find_element_by_id('p')
    password.send_keys('*****')
    sleep(2)

    login = bro.find_element_by_id('login_button')
    login.click()
    # Keep the browser open for a while after submitting.
    sleep(10)
finally:
    # Always shut the browser and driver process down, even if a step failed.
    bro.quit()
iframe的实现
原文地址:https://www.cnblogs.com/abc23/p/10751549.html
时间: 2024-10-14 03:48:13