Python3 crawler: scraping Tuchong (tuchong.com) with requests

import requests
from fake_useragent import UserAgent
from requests.exceptions import Timeout
from urllib.parse import quote
import os, hashlib
from lxml import etree
import time
from multiprocessing import Process, Queue, Pool     # originally planned to hand downloads to multiple processes via a queue; never implemented (see the sketch after the script)

userAgent = UserAgent()
headers = {
    "User-Agent": userAgent.random,
    "Host": "tuchong.com",
    "Referer": "https://tuchong.com/explore/"
}
baseUrl = "https://tuchong.com/rest/tag-categories/"
baseTagUrl = "https://tuchong.com/rest/tags/"
tagReferer = "https://tuchong.com/tags/"

timeout = 5
s = requests.Session()

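# tag names collected for each category, filled in by getAllTag()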
dic = {
    "subject": [],
    "style": [],
    "equipment": [],
    "location": [],
}

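# Chinese display names for Tuchong's four tag categories, used only in progress messages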
categoriesDict = {
    "subject": "题材",
    "style": "风格",
    "equipment": "器材",
    "location": "地区",
}

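# request one page (20 tags) of the tag-categories API for a category; returns the response on HTTP 200, None on timeout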
def getCategoryPage(url, category, page=1):
    try:
        url = url + category
        params = {
            "page": page,
            "count": 20
        }
        response = s.get(url=url, headers=headers, timeout=timeout, params=params)
        if response.status_code == 200:
            response.category = category
            return response
    except Timeout as e:
        print(e)
        return None

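# pull the list of tag names out of a tag-categories response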
def getTagNameUrl(response):
    if not response:
        return None
    data_dict = response.json()
    tag_list = data_dict.get("data").get("tag_list")
    tag_name_list = [tag.get("tag_name") for tag in tag_list]
    return tag_name_list

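# despite the name, this yields the remaining page numbers (2..pages) rather than URLs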
def getNextPageUrl(response):
    if not response:
        return []
    data_dict = response.json()
    pages = int(data_dict.get("data").get("pages"))
    for page in range(2, pages + 1):
        yield page

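# walk every category and every page of the tag-categories API, collecting all tag names into the global dic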
def getAllTag():
    global dic
    s.get(url="https://tuchong.com/explore/", headers=headers, timeout=timeout)
    for category in categoriesDict.keys():
        print("获取 -{}- 第 <{}> 页tagName信息.........".format(categoriesDict.get(category), 1))
        response = getCategoryPage(url=baseUrl, category=category)
        tag_name_list = getTagNameUrl(response) or []
        dic.get(category).extend(tag_name_list)
        time.sleep(1)
        for page in getNextPageUrl(response):
            print("获取 -{}- 第 <{}> 页tagName信息.........".format(categoriesDict.get(category), page))
            response = getCategoryPage(url=baseUrl, category=category, page=page)
            tag_name_list = getTagNameUrl(response) or []
            dic.get(category).extend(tag_name_list)
            time.sleep(1)

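# request one page of posts for a tag from the rest/tags/<tag>/posts API (weekly order), with the Referer set to the tag page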
def getTagPage(url, tag, page):
    tag = quote(tag)
    url = url + tag + "/posts"
    params = {
        "page": page,
        "count": 20,
        "order": "weekly"
    }
    headers["Referer"] = tagReferer + tag + "/"
    try:
        response = requests.get(url=url, params=params, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response
    except Timeout as e:
        print(e)
        return None

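# parse a tag-posts response and yield [title, post_url] pairs for each post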
def getImagesInfo(response):
    if not response:
        return
    if response.json().get("result") == "INVALID":
        return
    postList = response.json().get("postList")
    imageUrlList = [post.get("url") for post in postList]
    titleList = [post.get("title").strip() for post in postList]
    for img_url_title in zip(titleList, imageUrlList):
        yield list(img_url_title)

def get_md5(img_url):
    m = hashlib.md5()
    m.update(bytes(img_url, encoding="utf-8"))
    return m.hexdigest()

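# download every image URL in the list and save it under BASE_PATH, named by the md5 of its URL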
def download(imgsUrl):
    if imgsUrl:
        for img_url in imgsUrl:
            response = requests.get(url=img_url, timeout=timeout)
            name = get_md5(img_url)
            print("Downloading {}...".format(img_url))
            with open(os.path.join(BASE_PATH, name) + ".jpg", "wb") as f:
                f.write(response.content)

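# crawl one tag page by page: list its posts, extract the image URLs from each post page, download them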
def gogo(tagname):
    page = 1
    while True:
        response = getTagPage(url=baseTagUrl, tag=tagname, page=page)
        print("Crawling tag {}, page {}...".format(tagname, page))
        if not response:
            return
        # stop paging once the API reports there is no more data for this tag
        if response.json().get("result") == "INVALID":
            print("No more data for this tag")
            return
        for info_tuple in getImagesInfo(response):
            imgsUrl = putImageUrl(info_tuple)
            download(imgsUrl)
        page += 1
        time.sleep(5)

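# fetch a single post page and extract the image URLs from the post content via XPath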
def putImageUrl(img_url_title_list):
    if img_url_title_list:
        img_url = img_url_title_list[1]
        try:
            response = s.get(url=img_url, headers=headers, timeout=timeout)
            html = etree.HTML(response.text)
            imgsUrl = html.xpath("//article[@class='post-content']/img/@src")
            return imgsUrl
        except requests.exceptions.ConnectionError as e:
            print(e)
            return None

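# crawl every tag name collected in dic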
def downloadImage():
    for key in dic:
        tagname_list = dic.get(key)
        for tagname in tagname_list:
            gogo(tagname)

def run():
    getAllTag()
    print("所有tag信息获取完毕.........")
    print("开始获取每个tag的内容.........")
    downloadImage()

if __name__ == "__main__":
    BASE_PATH = r"D:\tuchong"
    os.makedirs(BASE_PATH, exist_ok=True)   # make sure the download directory exists
    run()
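The comment on the multiprocessing import mentions a plan to hand the downloads to multiple processes through a queue, which was never implemented. Below is a minimal sketch of that idea using a multiprocessing.Pool instead of hand-rolled Process/Queue plumbing; download_one, download_parallel and the pool size of 4 are illustrative assumptions, not part of the original script.

import os, hashlib
import requests
from multiprocessing import Pool

def download_one(args):
    # worker: fetch one image and save it, named by the md5 of its URL
    img_url, save_dir = args
    name = hashlib.md5(img_url.encode("utf-8")).hexdigest()
    try:
        resp = requests.get(img_url, timeout=5)
        with open(os.path.join(save_dir, name + ".jpg"), "wb") as f:
            f.write(resp.content)
    except requests.exceptions.RequestException as e:
        print(e)

def download_parallel(img_urls, save_dir, workers=4):
    # spread the image URLs over a small pool of worker processes
    os.makedirs(save_dir, exist_ok=True)
    with Pool(processes=workers) as pool:
        pool.map(download_one, [(url, save_dir) for url in img_urls])

With something like this, download(imgsUrl) in gogo() could be swapped for download_parallel(imgsUrl, BASE_PATH); on Windows the call must stay under the if __name__ == "__main__": guard, because multiprocessing re-imports the module in each worker process.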

Original article: https://www.cnblogs.com/zhuchunyu/p/10765904.html
