基于selenium实现自动化爬取数据
如果想进一步了解selenium自动化模块的更多功能,请看我博客"测试"分类中的相关介绍
selenium
- 概念:基于浏览器自动化的模块
- 自动化:可以通过代码指定一系列的行为动作,然后将其作用到浏览器中。
- pip install selenium
- selenium和爬虫之间的关联
- 1.便捷的捕获到任意形式动态加载的数据(可见即可得)
- 2.实现模拟登录
- 谷歌驱动下载:http://chromedriver.storage.googleapis.com/index.html
# Demo: drive a real Chrome window to search JD.com and scroll the result page.
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

# 1. Instantiate a browser object backed by the local chromedriver binary.
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    # Send a request to the target site.
    bro.get('https://www.jd.com/')

    # Locate the search box and type the query into it.
    search_text = bro.find_element(By.XPATH, '//*[@id="key"]')
    search_text.send_keys('iphoneX')

    # Locate the search button and click it.
    btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
    btn.click()
    sleep(2)

    # Scroll the result page to the bottom by injecting JavaScript.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(2)
finally:
    # Always shut the browser down, even when one of the steps above raised.
    bro.quit()
- 药监总局为例:http://125.35.6.84:81/xk/
- 前三页所有企业名称爬取
# Demo: collect the company names shown on the first three result pages.
from time import sleep

# `etree.HTML(...)` / `.xpath(...)` below are lxml's API; the snippet used the
# name without importing it.
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

url = 'http://125.35.6.84:81/xk/'
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get(url)
    page_text_list = []  # page source of every visited page
    sleep(1)

    # Capture the source of the current (first) page as rendered by the browser.
    page_text_list.append(bro.page_source)

    # Click "next page" twice, capturing the source after each click.
    for _ in range(2):
        next_page = bro.find_element(By.XPATH, '//*[@id="pageIto_next"]')
        next_page.click()
        sleep(1)
        page_text_list.append(bro.page_source)

    # Parse every captured page and print the title attribute of each entry.
    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        for li in tree.xpath('//*[@id="gzlist"]/li'):
            titles = li.xpath('./dl/@title')
            # Skip entries without a title instead of failing on `[0]`.
            if titles:
                print(titles[0])
    sleep(2)
finally:
    # Always shut the browser down, even when one of the steps above raised.
    bro.quit()
动作链
# Demo: drag an element that lives inside an iframe using an action chain.
from time import sleep

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By

url = 'https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable'
bro = webdriver.Chrome(executable_path='./chromedriver')
try:
    bro.get(url)
    sleep(1)

    # Element lookups fail for tags that live inside an iframe, so switch the
    # driver's context into that frame first.
    bro.switch_to.frame('iframeResult')
    div_tag = bro.find_element(By.XPATH, '//*[@id="draggable"]')

    # Drag div_tag: press and hold, then move in small steps.
    action = ActionChains(bro)
    action.click_and_hold(div_tag)  # click and keep the button pressed
    for _ in range(6):
        # perform() makes the queued actions execute immediately.
        action.move_by_offset(10, 15).perform()
        sleep(0.5)
    # NOTE(review): the mouse button is never released, so the drop is never
    # completed; add a release step if the drop itself matters.
finally:
    # Always shut the browser down, even when one of the steps above raised.
    bro.quit()
- 如何让selenium规避检测
- 有的网站会检测请求是否为selenium发起,如果是的话则让该次请求失败
- 规避检测的方法:
- selenium接管chrome浏览器
- 实现步骤
- 1.必须将你电脑中安装的谷歌浏览器的驱动程序所在的目录找到。且将目录添加到环境变量中。
- 2.打开cmd,在命令行中输入命令:
- chrome.exe --remote-debugging-port=9222 --user-data-dir="一个空文件夹的目录"
- 指令执行结束后,会打开你本机安装好的谷歌浏览器。
- 3.执行如下代码:可以使用下述代码接管步骤2打开的真实的浏览器
# Attach Selenium to a Chrome instance that was started manually with
# --remote-debugging-port=9222, instead of launching a new browser.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
# Point the driver at the debugging endpoint of the already-running browser.
chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")

# Path of the chromedriver executable installed on this machine.
# Raw string: the backslashes are path separators, not escape sequences.
chrome_driver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"

# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(executable_path=chrome_driver, options=chrome_options)
print(driver.title)
12306模拟登录
url =https://kyfw.12306.cn/otn/login/init
主要用到ActionChains链式操作、超级鹰解析验证码,还有图片的裁剪
import requests
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Minimal HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Store the account data that is sent with every request.

        username: account name.
        password: plain-text password; only its MD5 hex digest is kept.
        soft_id: software id sent as the ``softid`` form field.
        """
        self.username = username
        # The service is sent the MD5 hex digest (field ``pass2``), never the
        # plain-text password.
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Form fields shared by every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image for recognition and return the decoded JSON reply.

        im: image bytes.
        codetype: question type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON reply.

        im_id: image id of the question being reported.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
def tranformImgCode(imgPath, imgType, username='929235569', password='lyz19960415', soft_id='904189'):
    """Send the image at imgPath to Chaojiying and return the reply's ``pic_str``.

    imgPath: path of the captcha image file.
    imgType: question type code passed to ``PostPic``.
    username, password, soft_id: account data for ``Chaojiying_Client``.

    SECURITY NOTE(review): the defaults are the account credentials that were
    hard-coded in the original snippet; they are kept only so existing
    two-argument calls behave as before. Pass your own values (ideally loaded
    from configuration outside the source tree) instead of relying on them.
    """
    chaojiying = Chaojiying_Client(username, password, soft_id)
    # Read the image through a context manager so the handle is always closed.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']
# Demo: log in to 12306 by solving the click-captcha through Chaojiying.
from time import sleep

from PIL import Image
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By


def _parse_click_points(result):
    """Turn an answer such as "174,71|272,60" into [(174, 71), (272, 60)].

    A single-point answer ("174,71") has no "|" and yields a one-item list.
    Each point gets its own pair; the original code accumulated every
    coordinate into one shared list, so only the first point was ever used.
    """
    points = []
    for pair in result.split("|"):
        parts = pair.split(",")
        points.append((int(parts[0]), int(parts[1])))
    return points


url = "https://kyfw.12306.cn/otn/login/init"
chrome = webdriver.Chrome(executable_path="./chromedriver")

# Open the 12306 login page in the browser.
chrome.get(url=url)
sleep(2)

# Take a screenshot of the whole page.
chrome.save_screenshot("main.png")

# Locate the captcha image tag.
img_tag = chrome.find_element(By.CLASS_NAME, "touclick-image")

# Cut the captcha out of the screenshot.
location = img_tag.location  # top-left corner of the captcha element
size = img_tag.size  # width/height of the captcha element
# Crop box as (left, upper, right, lower), derived from position and size.
img_range = (
    int(location["x"]),
    int(location["y"]),
    int(location["x"] + size["width"]),
    int(location["y"] + size["height"]),
)
# Crop and save inside the context manager so the screenshot file is closed.
with Image.open("./main.png") as screenshot:
    screenshot.crop(img_range).save("./code.png")

# Ask Chaojiying for the coordinates that have to be clicked.
result = tranformImgCode('./code.png', 9004)

# Click every returned point, offset from the captcha element.
# NOTE(review): the offsets are assumed to be relative to the element's
# top-left corner (they come from the cropped image); newer Selenium releases
# measure this offset from the element's centre - confirm against the
# installed version.
for x, y in _parse_click_points(result):
    ActionChains(chrome).move_to_element_with_offset(img_tag, x, y).click().perform()
    sleep(1)

# Fill in the credentials and submit the login form.
chrome.find_element(By.ID, "username").send_keys("洲神再次")
chrome.find_element(By.ID, "password").send_keys("1234567")
chrome.find_element(By.ID, "loginSub").click()
原文地址:https://www.cnblogs.com/zzsy/p/12687962.html
时间: 2024-11-07 01:08:02