#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import re
import json
import socket
import urllib.request
import urllib.parse
import urllib.error
import time

# global socket timeout, in seconds
timeout = 5
socket.setdefaulttimeout(timeout)


class Crawler:
    __amount = 0
    __start_amount = 0
    __counter = 0

    # t: pause between downloads, in seconds
    def __init__(self, t=0.1):
        self.__time_sleep = t

    # fetch one result page at a time and hand each JSON payload to __saveImage
    def __getImages(self, word='美女'):
        search = urllib.parse.quote(word)
        # pn: index of the first image on the current result page
        pn = self.__start_amount
        while pn < self.__amount:
            # send a browser User-Agent to avoid being banned
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                                     'Gecko/20100101 Firefox/23.0'}
            url = ('http://image.baidu.com/search/avatarjson?tn=resultjsonavatarnew'
                   '&ie=utf-8&word=' + search + '&cg=girl&pn=' + str(pn) +
                   '&rn=60&itg=0&z=0&fr=&width=&height=&lm=-1&ic=0&s=0&st=-1'
                   '&gsm=1e0000001e')
            page = None
            try:
                time.sleep(self.__time_sleep)
                req = urllib.request.Request(url=url, headers=headers)
                page = urllib.request.urlopen(req)
                data = page.read().decode('utf8')
            except UnicodeDecodeError:
                print('-----UnicodeDecodeError url:', url)
            except urllib.error.URLError:
                print('-----URLError url:', url)
            except socket.timeout:
                print('-----socket timeout:', url)
            else:
                # parse the JSON payload and save the images it lists
                json_data = json.loads(data)
                self.__saveImage(json_data, word)
                print('fetching next page')
                pn += 60
            finally:
                if page is not None:
                    page.close()
        print('download finished')

    # save every image listed on one result page
    def __saveImage(self, json_data, word):
        if not os.path.exists('./' + word):
            os.mkdir('./' + word)
        # continue numbering after any files already in the folder
        self.__counter = len(os.listdir('./' + word)) + 1
        for info in json_data['imgs']:
            try:
                self.__downloadImage(info, word)
            except urllib.error.HTTPError as urllib_err:
                print(urllib_err)
            except Exception as err:
                time.sleep(1)
                print(err)
                print('unknown error, skipping this image')
                continue
            else:
                print('saved image, %d so far' % self.__counter)
                self.__counter += 1

    # download a single image
    def __downloadImage(self, info, word):
        time.sleep(self.__time_sleep)
        fix = self.__getFix(info['objURL'])
        urllib.request.urlretrieve(info['objURL'],
                                   './' + word + '/' + str(self.__counter) + fix)

    # extract the file extension, falling back to .jpeg
    def __getFix(self, name):
        m = re.search(r'\.[^\.]*$', name)
        if m and len(m.group(0)) <= 5:
            return m.group(0)
        return '.jpeg'

    # everything before the first dot
    def __getPrefix(self, name):
        return name[:name.find('.')]

    # spider_page_num: number of result pages to fetch (60 images per page)
    # start_page: first page to fetch
    def start(self, word, spider_page_num=1, start_page=1):
        self.__start_amount = (start_page - 1) * 60
        self.__amount = spider_page_num * 60 + self.__start_amount
        self.__getImages(word)


crawler = Crawler(0.05)
crawler.start('科幻', 1, 5)
# crawler.start('二次元 性感', 3, 3)
# crawler.start('帅哥', 5)
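One caveat: `urllib.request.urlretrieve` sends urllib's default headers, so image hosts that reject non-browser clients may answer 403 even though the search request above spoofs a User-Agent. A minimal sketch of a header-aware download helper that could replace the `urlretrieve` call (`fetch_image` is a hypothetical name, not part of the script above):

# Hedged sketch: download an image while sending the same browser
# User-Agent the crawler already uses for the search request.
import urllib.request

def fetch_image(url, path, timeout=5):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) '
                             'Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=url, headers=headers)
    # stream the response body straight to disk
    with urllib.request.urlopen(req, timeout=timeout) as resp, \
            open(path, 'wb') as f:
        f.write(resp.read())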
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from time import sleep

userid = input('username')
passid = input('password')
sendtxt = 'hello'  # text to send (typed into the subject field below)
sendto = input('sendto')

# open Firefox and load the webmail page
driver = webdriver.Firefox()
driver.get('http://mail.10086.cn/')

# fill in the username
emailElem = driver.find_element(By.ID, 'txtUser')
emailElem.send_keys(userid)
# fill in the password and submit the login form
passwordElem = driver.find_element(By.ID, 'txtPass')
passwordElem.send_keys(passid)
passwordElem.submit()

# open the compose view
sleep(10)
write = driver.find_element(By.ID, 'btn_compose')
write.click()
sleep(10)

# the compose page lives inside an iframe (its id was found with Firefox's
# Firebug plugin), so switch into it before locating the form fields
iframe_xpath = "//iframe[contains(@id,'compose_')]"
iframe = driver.find_element(By.XPATH, iframe_xpath)
driver.switch_to.frame(iframe)
sleep(5)

# fill in the recipient and subject, then send
driver.find_element(By.ID, 'toContaine').send_keys(sendto)
driver.find_element(By.ID, 'txtSubject').send_keys(sendtxt)
driver.find_element(By.ID, 'topSend').click()
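The fixed `sleep()` calls make the script fragile: too short and the element is not there yet, too long and time is wasted. The script already imports `WebDriverWait` and `expected_conditions` without using them; a sketch of the compose steps rewritten with explicit waits (element ids are taken from the script above and assumed unchanged):

# Hedged sketch: replace the fixed sleeps with explicit waits that poll
# until each condition holds, up to a 15-second ceiling.
wait = WebDriverWait(driver, 15)

# wait until the compose button is clickable instead of sleeping 10 s
compose = wait.until(EC.element_to_be_clickable((By.ID, 'btn_compose')))
compose.click()

# wait for the compose iframe to appear, then switch into it in one step
wait.until(EC.frame_to_be_available_and_switch_to_it(
    (By.XPATH, "//iframe[contains(@id,'compose_')]")))

# wait for the recipient field before typing into it
wait.until(EC.presence_of_element_located(
    (By.ID, 'toContaine'))).send_keys(sendto)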