Scraping Meizitu Images
Target page: http://www.mzitu.com/all
Regex-based implementation:
import os
import re

import requests

# Directory the script was started from; gallery folders are created beneath it.
hehehe = os.getcwd()
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
                  "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    'Referer': 'http://i.meizitu.net'}


# Fetch the HTML of the "all" index page.
def get_root_html(root_url):
    all_page_url = requests.get(root_url, headers=headers)
    if all_page_url.status_code == 200:
        return all_page_url.text
    return "Failed to fetch page"


# Parse the index page and collect the URL of every gallery on it.
def parse_root_html(root_html):
    all_url_re = r'<a\shref="(http://www.mzitu.com/\d+)"\starget="_blank"'
    pattern = re.compile(all_url_re)
    all_url_list = re.findall(pattern, root_html)
    return all_url_list


# Fetch the HTML of a single gallery page.
def get_one_page_html(one_page_url):
    one_page_html = requests.get(one_page_url, headers=headers)
    return one_page_html.text


# Parse a gallery page: find how many image pages the gallery has,
# then build the URL of each individual image page.
def parse_one_page_html(one_page_html, one_page_url):
    # Grab the <div> that holds the page navigation (and thus the max page number).
    pattern = re.compile(r'<div\sclass="pagenavi">.*?</div>', re.S)
    div = re.findall(pattern, one_page_html)
    # Collect all the <span> tags inside that div.
    pattern1 = re.compile(r'<span.*?</span>', re.S)
    one_page_url_list = re.findall(pattern1, div[0])
    # The max page number sits in the second-to-last <span> (the last one is
    # the "next page" link), so pull the digits out of that tag.
    max_num_re = r'\d+'
    pattern2 = re.compile(max_num_re)
    max_num = re.findall(pattern2, one_page_url_list[-2])
    max_num = max_num[0]
    every_img_page_list = []
    # Image pages are numbered from 1, so iterate 1..max_num inclusive.
    for i in range(1, int(max_num) + 1):
        one_img_url = str(one_page_url) + '/' + str(i)
        every_img_page_list.append(one_img_url)
    return every_img_page_list


# Create a folder for the gallery under the starting directory.
def mkdir_folder(one_page_html):
    folder_name_re = r'<h2\sclass="main-title">(.*?)</h2>'
    pattern = re.compile(folder_name_re, re.S)
    folder_name = re.findall(pattern, one_page_html)[0]
    # Some titles contain "?", which Windows does not allow in folder names,
    # so replace it.
    path = str(folder_name).replace("?", '_')
    path = path.strip()
    isExists = os.path.exists(os.path.join(str(hehehe), path))
    if not isExists:
        os.makedirs(os.path.join(str(hehehe), path))
    os.chdir(os.path.join(str(hehehe), path))  # switch into the folder
    print(folder_name + " folder created")
    return folder_name


# Extract the actual image address from an image page.
def get_img_url(one_img_page_url):
    one_img_page_html = requests.get(one_img_page_url, headers=headers)
    one_img_re = r'<img\ssrc="(.*?)"\salt=".*?"'
    pattern = re.compile(one_img_re, re.S)
    one_img_url = re.findall(pattern, one_img_page_html.text)[0]
    return one_img_url


# Download one image into the current folder.
def download_one_img(one_img_url, folder_name):
    img = requests.get(one_img_url, headers=headers)
    # Use the last 6 characters of the URL (e.g. "01.jpg") as the file name.
    file_name = one_img_url[-6:]
    print("Downloading image: " + str(file_name))
    with open(str(file_name), 'wb') as f:
        f.write(img.content)


# Top-level control flow of the spider.
def main(root_url):
    root_html = get_root_html(root_url)
    all_url_list = parse_root_html(root_html)
    for i in range(5):  # only crawl the first 5 galleries
        one_page_html = get_one_page_html(all_url_list[i])
        folder_name = mkdir_folder(one_page_html)
        every_img_page_list = parse_one_page_html(
            one_page_html, all_url_list[i])
        for one_img_page_url in every_img_page_list:
            one_img_url = get_img_url(one_img_page_url)
            download_one_img(one_img_url, folder_name)


if __name__ == "__main__":
    main('http://www.mzitu.com/all')
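To sanity-check the two core extraction patterns without hitting the network, they can be run against a small hand-written HTML fragment. The snippet below is a hypothetical sample that mimics the markup the spider expects (title in an <h2 class="main-title"> tag, page count in the second-to-last <span> of the pagenavi div); it is an illustration, not real page output:

import re

# Hypothetical fragment mimicking the gallery-page markup; not real site output.
sample_html = '''
<h2 class="main-title">Sample Gallery Title</h2>
<div class="pagenavi">
  <span>1</span><span>2</span><span>46</span><span>next &raquo;</span>
</div>
'''

# Same pattern as mkdir_folder: capture the gallery title from the <h2> tag.
title = re.findall(r'<h2\sclass="main-title">(.*?)</h2>', sample_html, re.S)[0]

# Same two-step extraction as parse_one_page_html: isolate the pagenavi div,
# list its <span> tags, then read the digits out of the second-to-last one.
div = re.findall(r'<div\sclass="pagenavi">.*?</div>', sample_html, re.S)[0]
spans = re.findall(r'<span.*?</span>', div, re.S)
max_num = re.findall(r'\d+', spans[-2])[0]

print(title)    # Sample Gallery Title
print(max_num)  # 46

The second-to-last <span> is used because the last one holds the "next page" link rather than a number; if the site ever changes that layout, this demo and parse_one_page_html break in the same way.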
Original article: https://www.cnblogs.com/amou/p/9206528.html