# Scrape the images of the "xingganmeinvtupian" gallery on sc.chinaz.com.
# Page 1:  http://sc.chinaz.com/tupian/xingganmeinvtupian.html
# Page 2+: http://sc.chinaz.com/tupian/xingganmeinvtupian_2.html
# The first page has no "_<n>" suffix, so it is handled as a special case.
import os
import time
import urllib.parse
import urllib.request

# Directory (relative to the working directory) that receives the images.
SAVE_DIR = "xinggan"

# URL of the first listing page; it does not follow the "_<n>" pattern.
FIRST_PAGE_URL = "http://sc.chinaz.com/tupian/xingganmeinvtupian.html"

# Browser-like headers sent with every request (listing pages and images).
HEADERS = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}


def down_load_image(image_src):
    """Download a single image and store it inside ``SAVE_DIR``.

    Args:
        image_src: URL of the image as found in the listing page. Relative
            and protocol-relative values are resolved against the listing
            page URL; absolute URLs are used as given.

    Raises:
        urllib.error.URLError: if the image cannot be fetched.
        OSError: if the target file cannot be written.
    """
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(SAVE_DIR, exist_ok=True)

    image_url = urllib.parse.urljoin(FIRST_PAGE_URL, image_src)
    # Derive the file name from the URL path only, so that a query string
    # never ends up in the name of the saved file.
    filename = os.path.basename(urllib.parse.urlsplit(image_url).path)
    filepath = os.path.join(SAVE_DIR, filename)

    request = urllib.request.Request(url=image_url, headers=HEADERS)
    # Context managers close both the HTTP response and the output file.
    with urllib.request.urlopen(request) as response, open(filepath, "wb") as fp:
        fp.write(response.read())


def parse_content(content):
    """Extract every image URL from a listing page and download each one.

    Args:
        content: HTML source of a listing page, as text.
    """
    # lxml is the only third-party dependency; importing it where it is used
    # keeps the request-building helpers importable without it.
    from lxml import etree

    tree = etree.HTML(content)
    # The page lazy-loads its images: the real address is stored in the
    # "src2" attribute, and querying "src" finds nothing.
    image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2')
    for image_src in image_list:
        down_load_image(image_src)


def handle_request(url, page):
    """Build the request object for one listing page (nothing is sent).

    Args:
        url: URL template with one ``%s`` placeholder for the page number.
        page: 1-based page number. Page 1 ignores ``url`` and uses the
            fixed first-page address, which has no numeric suffix.

    Returns:
        A ``urllib.request.Request`` carrying the browser-like headers.
    """
    page_url = FIRST_PAGE_URL if page == 1 else url % page
    return urllib.request.Request(url=page_url, headers=HEADERS)


def main():
    """Prompt for a page range, then fetch and process each listing page."""
    url = "http://sc.chinaz.com/tupian/xingganmeinvtupian_%s.html"
    start_page = int(input("请输入开始爬取的页码:"))
    end_page = int(input("请输入爬取结束的页码:"))
    for page in range(start_page, end_page + 1):
        request = handle_request(url, page)
        # Close the connection as soon as the page body has been read.
        with urllib.request.urlopen(request) as response:
            content = response.read().decode()
        parse_content(content)
        # Pause between listing pages to go easy on the server.
        time.sleep(2)


if __name__ == '__main__':
    main()
    print("图片爬取结束...")
# 如果学这个不用来做点有意思的事情,那将毫无意义!
# 原文地址:https://www.cnblogs.com/Qiuzhiyu/p/12183119.html
# 时间: 2024-10-04 23:44:39