爬取 4K 图片网（pic.netbian.com）的图片

注意：运行前请先把代码中的图片保存路径改成你自己电脑上的目录。

  1 import os
  2 import requests
  3 from lxml import etree
  4 from urllib.request import urlopen, Request
  5 import time
  6
  7 class BiAnImage():
  8     def __init__(self):
  9         self.base_url = "http://pic.netbian.com"
 10         self.header = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}
 11     def get_html(self, url):
 12         response = requests.get(url, self.header)
 13         if response.status_code == 200:
 14             response.encoding = response.apparent_encoding
 15             return response.text
 16         return None
 17     def get_url_1_list(self, html_1):
 18         url_1_items = []
 19         title_1_items = []
 20         x_html = etree.HTML(html_1)
 21         url_list = x_html.xpath(‘//div[@id="main"]/div[2]/a/@href‘)
 22         title_list = x_html.xpath(‘//div[@id="main"]/div[2]/a/text()‘)
 23         for url, title in zip(url_list, title_list):
 24             url_1_items.append(self.base_url + url)
 25             title_1_items.append(title)
 26         return title_1_items, url_1_items
 27     def get_url_2_list(self, html_2):
 28         url_2_items = []
 29         title_2_items = []
 30         x_html = etree.HTML(html_2)
 31         url_list = x_html.xpath(‘//ul[@class="clearfix"]/li/a/@href‘)
 32         title_list = x_html.xpath(‘//ul[@class="clearfix"]/li/a/b/text()‘)
 33         last_page = x_html.xpath(‘//a[text()="下一页"]/preceding-sibling::a[1]/text()‘)  # 直接查找下一页 => 上一个元素
 34         for url, title in zip(url_list, title_list):
 35             url_2_items.append(self.base_url + url)
 36             title_2_items.append(title)
 37         return url_2_items, title_2_items, last_page
 38     def get_image_url(self, image_html):
 39         x_image_html = etree.HTML(image_html)
 40         image_url = x_image_html.xpath(‘//a[@id="img"]/img/@src‘)
 41         return self.base_url + image_url[0]
 42     def save_image(self, save_path, image_name, image_url):
 43         req = Request(url=image_url, headers=self.header)
 44
 45         content = urlopen(req).read()
 46         img_name = image_name.replace(‘ ‘, ‘‘) + image_url[-4:]
 47         with open(save_path + img_name, ‘wb‘) as f:
 48             f.write(content)
 49             print(img_name, "下载完成...")
 50     def run(self):
 51         # 获取所有分类标题, 链接
 52         html = self.get_html(self.base_url)
 53         title_1_items, url_1_items = self.get_url_1_list(html)
 54         for title_1, url_1 in zip(title_1_items, url_1_items):
 55             if title_1 == "4K动漫":
 56             # if title_1 == "4K风景": TODO: 这里加一个判断就可以下载指定分类下的图片
 57                 html_2 = self.get_html(url_1)
 58                 url_2_items, title_2_items, last_page = self.get_url_2_list(html_2)
 59
 60                 # 通过拿到分类页面中的last_page, 获取该分类下所有页面链接
 61                 for page in range(1, int(last_page[0])):
 62                     if page == 1:
 63                         more_url_1 = url_1  # more_url_1 是每个分类下每一页的链接
 64                     else:
 65                         more_url_1 = url_1 + "index_{}.html".format(page)
 66                     detail_html = self.get_html(more_url_1)
 67                     url_2_items, title_2_items, last_page = self.get_url_2_list(detail_html)
 68
 69                     # 获取当前页面中所有图片链接
 70                     for url_2, title_2 in zip(url_2_items, title_2_items):
 71
 72                         # print(title_1, url_1, last_page[0], more_url_1, title_2, url_2)
 73                         pictures = "C:/Users/25766/AppData/Local/Programs/Python/Python38/imgs/"
 74
 75                         time.sleep(2)
 76                         # 在这里对下载的文件进行分类, 如果文件不存在, 就直接新建一个文件夹
 77                         if os.path.exists(pictures + title_1) is False:
 78                             os.makedirs(pictures + title_1)
 79                         save_path = pictures + title_1 + "/"
 80                         image_html = self.get_html(url_2)
 81                         img_url = self.get_image_url(image_html)
 82                         self.save_image(save_path, title_2, img_url)
 83                         #print(save_path)
 84
 85                           # 跳出一个页面中所有图片链接
 86                      # 跳出一个分类的所有页面
 87                  # 跳出所有分类
 88
 89 bian = BiAnImage()
 90 bian.run()
 91

原文地址：https://www.cnblogs.com/rstz/p/12704537.html

时间： 2024-11-02 22:17:16

爬取4k图片网图片的相关文章

零基础爬取堆糖网图片（一）

零基础爬取堆糖网图片(一) 全文介绍: 首先堆糖网是一个美图壁纸兴趣社区,有大量的美女图片今天我们实现搜索关键字爬取堆糖网上相关的美图. 当然我们还可以实现多线程爬虫,加快爬虫爬取速度涉及内容: 爬虫基本流程 requests库基本使用 urllib.parse模块 json包 jsonpath库图例说明: 请求与响应 sequenceDiagram 浏览器->>服务器: 请求服务器-->>浏览器: 响应爬虫基本流程 graph TD A[目标网站] -->|分析网

python爬取煎蛋网图片

py2版本: #-*- coding:utf-8 -*-#from __future__ import unicode_literimport urllib,urllib2,timeimport re,sys,osheaders={'Referer':'http://jandan.net/','User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2

python 爬取煎蛋网图片

__author__ = mkdir(path): os path = path.strip() path = path.rstrip() mkfile = os.path.exists(path) mkfile: () : os.makedirs(path) () urllib, urllib2, re geturl(url): file_lists = [] req = urllib2.Req

python3爬虫爬取煎蛋网妹纸图片

其实之前实现过这个功能,是使用selenium模拟浏览器页面点击来完成的,但是效率实际上相对来说较低.本次以解密参数来完成爬取的过程. 首先打开煎蛋网http://jandan.net/ooxx,查看网页源代码.我们搜索其中一张图片的编号,比如3869006,看下在源代码中是否能找到图片链接从上面的HTML结构中找到这个标号对应的一些属性,没有直接的图片链接地址,只有一个src=//img.jandan.net/blank.gif,这很明显不是个真实的链接地址,因为每一个图片编号都有这个值.我

Python 爬取煎蛋网妹子图片

1 #!/usr/bin/env python 2 # -*- coding: utf-8 -*- 3 # @Date : 2017-08-24 10:17:28 4 # @Author : EnderZhou ([email protected]) 5 # @Link : http://www.cnblogs.com/enderzhou/ 6 # @Version : $Id$ 7 8 import requests 9 from bs4 import BeautifulSoup as bs

python爬取某个网站的图片并保存到本地

python爬取某个网站的图片并保存到本地 #coding:utf-8 import urllib import re import sys reload(sys) sys.setdefaultencoding('gb2312') #获取整个页面的数据 def getHtml (url): page = urllib.urlopen(url) html = page.read() return html #保存图片到本地 def getImg(html): reg = r'src="(.+?\.

python爬取某个网页的图片-如百度贴吧

python爬取某个网页的图片-如百度贴吧作者:vpoet 日期:大约在冬季注:随意copy,不用告诉我 #coding:utf-8 import urllib import urllib2 import re if __name__ =="__main__": rex=r'src="(http://imgsrc.baidu.com/forum/w%3D580.*?\.jpg)"'; Response=urllib2.urlopen("http://t

第一篇博客（python爬取小故事网并写入mysql）

前言: 这是一篇来自整理EVERNOTE的笔记所产生的小博客,实现功能主要为用广度优先算法爬取小故事网,爬满100个链接并写入mysql,虽然CS作为双学位已经修习了三年多了,但不仅理论知识一般,动手能力也很差,在学习的空余时间前前后后DEBUG了很多次,下面给出源代码及所遇到的BUG. 本博客参照代码及PROJECT来源:http://kexue.fm/archives/4385/ 源代码: 1 import requests as rq 2 import re 3 import codecs

python爬虫入门练习，使用正则表达式和requests爬取LOL官网皮肤

刚刚python入门,学会了requests模块爬取简单网页,然后写了个爬取LOL官网皮肤的爬虫,代码奉上 #获取json文件#获取英雄ID列表#拼接URL#下载皮肤 #导入re requests模块 import requestsimport reimport time def Download_LOL_Skin(): #英雄信息Json文件地址:https://lol.qq.com/biz/hero/champion.js #获取英雄信息列表 json_url = "https://lol.

selenium爬取煎蛋网

selenium爬取煎蛋网直接上代码 from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as ES import requests import urllib.requ