Python3 crawler: NetEase Cloud Music charts, artists, and their works

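The script below first crawls the NetEase Cloud Music toplists, then walks the artist index and saves each artist's bio and works to disk. It relies on three third-party packages, which can typically be installed with:

pip install requests fake-useragent lxml
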
import requests, re, json, os, time
from fake_useragent import UserAgent
from lxml import etree
from urllib import parse
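
# fake_useragent's UserAgent().random returns a randomly chosen real-world
# User-Agent string, so each run sends slightly different request headers.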

class MyError(Exception):
    def __init__(self, status, msg):
        # Carry an app-level status code alongside the message.
        super().__init__(msg)
        self.status = status
        self.msg = msg

class WyRinking:
    def __init__(self):
        ua = UserAgent()
        self.stratUrl = "https://music.163.com/discover/toplist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.nameList = []
        self.urlList = []

    def __getRinkNameUrl(self, response):
        '''Collect every chart's name and URL from the toplist page.'''
        html_selector = self.__etreeSelector(response)
        self.nameList = html_selector.xpath(
            "//div[contains(@class,'item') and contains(@class,'f-cb')]/p[@class='name']/a/text()") or []
        self.urlList = html_selector.xpath(
            "//div[contains(@class,'item') and contains(@class,'f-cb')]/p[@class='name']/a/@href") or []

    def __getPageHtml(self, url):
        '''Request a page, retrying (recursively) on timeout.'''
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            return response
        except requests.exceptions.Timeout as e:
            print("Timeout Error>>:", e)
            # Return the retry's result; without the return the caller would get None.
            return self.__getPageHtml(url=url)

    def __getRankHtml(self):
        '''Fetch each chart's HTML and hand it to __getRankInfo.'''
        if not self.nameList and not self.urlList:
            raise MyError(10000, "{},{} must not be empty".format(self.nameList, self.urlList))
        if len(self.nameList) != len(self.urlList):
            raise MyError(10001, "nameList and urlList do not correspond one to one")
        for i in range(len(self.urlList)):
            url = parse.urljoin(self.stratUrl, url=self.urlList[i])
            response = self.__getPageHtml(url=url)
            response.customizeName = self.nameList[i]  # attach the chart name for later use
            self.__getRankInfo(response)

    def __getRankInfo(self, response):
        '''Extract the JSON data embedded in the page and write it to a file.'''
        html_selector = self.__etreeSelector(response)

        test = (html_selector.xpath("//*[@id='song-list-pre-data']/text()") or [""])[0]
        updateTime = html_selector.xpath("//span[contains(@class,'sep') and contains(@class,'s-fc3')]/text()")[0]
        try:
            data = json.loads(test)
        except json.decoder.JSONDecodeError:
            # The embedded JSON is sometimes truncated; patch the tail so it parses.
            data = json.loads(test + '"}}]')
        '''
        if not len(songNameList) == len(songUrlList) == len(songIdList):
            raise MyError(10001, "songNameList, songUrlList and songIdList do not correspond one to one")
        '''
        fileName = response.customizeName + '--' + updateTime + ".json"
        if not Rink_BASE_PATH:
            raise MyError(10005, "the global Rink_BASE_PATH must be configured as the output directory")
        if not os.path.exists(Rink_BASE_PATH):
            os.makedirs(Rink_BASE_PATH)
        path = os.path.join(Rink_BASE_PATH, fileName)
        self.__writeToFile(path, data)

    def __writeToFile(self, path, data):
        print('Writing file {}'.format(path))  # path already ends in .json
        with open(path, "w", encoding="utf-8") as f:
            for index, data_dic in enumerate(data, start=1):
                dic = {}
                dic["rankNum"] = index
                dic["songId"] = data_dic.get("id")
                dic["songName"] = data_dic.get("name")
                dic["artistsInfo"] = data_dic.get("artists")
                dic["commentThreadId"] = data_dic.get("commentThreadId")
                # One JSON object per line (JSON Lines layout).
                f.write(json.dumps(dic, ensure_ascii=False) + "\n")

    def __reSongId(self, songurl: str):
        '''
        :param songurl: a path such as /song?id=1336871144
        '''
        pattern = r"id=(\d+)"
        try:
            song_id = re.findall(pattern, songurl)[0]
        except IndexError:
            raise MyError(10002, "failed to extract the song id")
        return song_id
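
    # Example: __reSongId("/song?id=1336871144") returns "1336871144" --
    # the r"id=(\d+)" pattern captures the digits that follow "id=".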

    def collectRanking(self):
        '''Entry point: crawl the NetEase Cloud Music chart data.'''
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getRinkNameUrl(response)
        self.__getRankHtml()

    def __etreeSelector(self, response):
        '''Parse the response body into an lxml element tree.'''
        return etree.HTML(response.text)

class WySinger:
    __isFirstStatus = True  # the A-Z initial ids only need to be scraped once, from the first category page

    def __init__(self):
        ua = UserAgent()
        self.stratUrl = "https://music.163.com/discover/artist"
        self.headers = {
            "User-Agent": ua.random
        }
        self.timeout = 10
        self.allow_redirects = False
        self.sCategoryNameList = []
        self.sCategoryIdList = []
        self.sCategoryUrlList = []
        self.initialIdList = []
        self.markList = []

    def __getPageHtml(self, url):
        '''Request a page, retrying (recursively) on timeout.'''
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout,
                                    allow_redirects=self.allow_redirects)
            return response
        except requests.exceptions.Timeout as e:
            print("Timeout Error>>:", e)
            # Return the retry's result; without the return the caller would get None.
            return self.__getPageHtml(url=url)

    def __getSingerCategory(self, response):
        htmlSelector = self.__etreeSelector(response)
        sCategoryNameList = htmlSelector.xpath(
            "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/text()")
        sCategoryIdList = htmlSelector.xpath(
            "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/@data-cat")
        sCategoryUrlList = htmlSelector.xpath(
            "//*[@id='singer-cat-nav']/div[@class='blk']//li/a[@class='cat-flag']/@href")
        if sCategoryUrlList and len(sCategoryNameList) == len(sCategoryIdList) == len(sCategoryUrlList):
            self.sCategoryNameList = sCategoryNameList
            self.sCategoryIdList = sCategoryIdList
            self.sCategoryUrlList = [parse.urljoin(self.stratUrl, url) for url in sCategoryUrlList]

    def __getSingerListPage(self):
        if not self.sCategoryNameList and not self.sCategoryUrlList:
            raise MyError(10000, "{},{} must not be empty".format(self.sCategoryNameList, self.sCategoryUrlList))
        if len(self.sCategoryNameList) != len(self.sCategoryUrlList):
            raise MyError(10001, "sCategoryNameList and sCategoryUrlList do not correspond one to one")
        for sCategoryUrl in self.sCategoryUrlList:
            response = self.__getPageHtml(sCategoryUrl)
            if self.__isFirstStatus:
                self.__getInitialId(response)
                self.__isFirstStatus = False

            for inintalId in self.initialIdList:
                if inintalId == "-1":
                    # -1 is the "hot" tab; its artists repeat under the A-Z tabs, so skip it
                    continue
                url = sCategoryUrl + "&initial=" + inintalId
                res = self.__getPageHtml(url)
                yield res

    def __getSingerIdUrl(self, response):
        htmlSelector = self.__etreeSelector(response)
        aSelector = htmlSelector.xpath(
            "//*[@id='m-artist-box']//a[@class='msk'] | //*[@id='m-artist-box']/li[@class='sml']/a[1]")
        singerUrlList = [parse.urljoin(self.stratUrl, selector.xpath("@href")[0]) for selector in aSelector]
        # Titles look like "<name>的音乐" ("<name>'s music"); strip the suffix to keep only the name.
        singerNameList = [selector.xpath("@title")[0].replace("的音乐", "") for selector in aSelector]
        if singerUrlList and len(singerUrlList) == len(singerNameList):
            yield list(zip(singerUrlList, singerNameList))
        else:
            yield []

    def __getInitialId(self, response):
        '''Collect the initial ids that correspond to the A-Z tabs.'''
        htmlSelector = self.__etreeSelector(response)
        urlList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/@href")
        initialIdList = [self.__reInitialId(url) for url in urlList]
        markList = htmlSelector.xpath("//*[@id='initial-selector']/li/a/text()")

        if len(initialIdList) == len(markList):
            self.initialIdList = initialIdList
            self.markList = markList

    def __reInitialId(self, url):
        '''
        url looks like: /discover/artist/cat?id=1001&initial=-1
        '''
        pattern = r"initial=(.*)"
        initialId = re.findall(pattern, url, re.S)[0]
        return initialId
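
    # Example: __reInitialId("/discover/artist/cat?id=1001&initial=-1")
    # returns "-1" -- everything after "initial=" is captured.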

    def __getSingerDetails(self, response):
        htmlSelector = self.__etreeSelector(response)
        try:
            data_json = htmlSelector.xpath("//*[@id='song-list-pre-data']/text()")[0]
            data_list = json.loads(data_json, strict=False)
            singerDetails_json = htmlSelector.xpath("//script[@type='application/ld+json']/text()")[0]
            singerDetails_dict = json.loads(singerDetails_json, strict=False)
            singerDetails_content = singerDetails_dict.get("description")
            return data_list, singerDetails_content
        except Exception as e:
            # Some artists have no works, so indexing with [0] raises; catch it and move on.
            print(e)
            return None, None

    def __writeToFile(self, datalist, singerDetails_content, singerName):
        if not os.path.exists(Singer_BASE_PATH):
            os.makedirs(Singer_BASE_PATH)
        path = os.path.join(Singer_BASE_PATH, singerName)
        print("Writing {}".format(singerName))
        with open(path + ".txt", 'w', encoding="utf-8") as f:
            f.write("Artist bio: {}".format(singerDetails_content) + "\n")
            for data in datalist:
                f.write("-" * 50 + "\n")
                f.write("Song name: {}".format(data.get("name")) + "\n")
                # Guard the nested lookups: privilege/album may be missing for some songs.
                f.write("Song id: {}".format((data.get("privilege") or {}).get("id")) + "\n")
                f.write("Album: {}".format((data.get("album") or {}).get("name")) + "\n")
                f.write("Aliases: {}".format(data.get("alias") or "none") + "\n")

    def __etreeSelector(self, response):
        '''Parse the response body into an lxml element tree.'''
        return etree.HTML(response.text)

    def collectSinger(self):
        response = self.__getPageHtml(url=self.stratUrl)
        self.__getSingerCategory(response)
        resGenerator = self.__getSingerListPage()
        for res in resGenerator:
            time.sleep(1)
            # __getSingerIdUrl is a generator; each value it yields is the list of
            # (singer url, singer name) tuples found on the current page.
            for pairs in self.__getSingerIdUrl(res):
                for singerUrl, singerName in pairs:
                    singerResponse = self.__getPageHtml(singerUrl)
                    datalist, singerDetails_content = self.__getSingerDetails(singerResponse)
                    if not datalist and not singerDetails_content:
                        continue
                    self.__writeToFile(datalist, singerDetails_content, singerName)

if __name__ == '__main__':
    Rink_BASE_PATH = r"D:\spidersData\Rinking"
    Singer_BASE_PATH = r"D:\spidersData\SingerInfo"
    wangyiyun = WyRinking()
    wangyiyun.collectRanking()  # crawl the NetEase Cloud Music chart data
    wangyiyun = WySinger()
    wangyiyun.collectSinger()  # crawl every NetEase Cloud Music artist and their works
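
Because WyRinking.__writeToFile emits one JSON object per line (the JSON Lines layout), a chart file can be read back line by line. A minimal sketch of doing so, assuming a file produced by the run above (the file name below is hypothetical):

import json

def load_ranking(path):
    '''Yield the per-song dicts that WyRinking wrote, one per line.'''
    with open(path, encoding="utf-8") as f:
        for line in f:
            yield json.loads(line)

for song in load_ranking(r"D:\spidersData\Rinking\chart--update-time.json"):  # hypothetical file name
    print(song["rankNum"], song["songName"])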

Original article: https://www.cnblogs.com/zhuchunyu/p/10765932.html

