1、从酷狗网站爬取 新歌首发的新歌名字、播放时长、链接等
from bs4 import BeautifulSoup as BS
import requests
import re
import json


class StockCrawler():
    """Scrape the new-song list (name, duration, link) from the Kugou home page."""

    def __init__(self):
        pass

    def get_stockinfo(self, url):
        """Download *url* and extract the songs listed in the new-song area.

        Args:
            url: Address of the page to fetch.

        Returns:
            A list of ``(songname, songtime, link)`` string tuples. The list
            is empty when the page has no ``div`` with id ``SongtabContent``.

        Raises:
            requests.exceptions.RequestException: If the page cannot be
                fetched (including when the 10 second timeout expires).
        """
        # A timeout keeps the crawler from hanging forever on a stalled server.
        res = requests.get(url, timeout=10)
        res.encoding = 'utf-8'
        soup = BS(res.text, 'html.parser')
        stock_info = []

        # Locate the outermost container of the new-song area.
        div = soup.find_all('div', id='SongtabContent')
        if not div:
            # Page layout changed or an error page was served: nothing to parse.
            return stock_info

        # Walk every song entry and pull out its name, duration and link.
        for i in div[0].find_all('li'):
            print(i)
            anchor = i.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            name_tag = anchor.find('span', class_='songName')
            time_tag = anchor.find('span', class_='songTime')
            if name_tag is None or time_tag is None:
                # Not a song entry (e.g. a decorative list item); skip it.
                continue

            link = anchor['href']
            if link.startswith('//'):
                # Protocol-relative href: add the scheme. Hrefs that are
                # already absolute are left untouched.
                link = 'http:' + link
            print('link:', link)

            songname = name_tag.text
            songtime = time_tag.text
            print('songname:', songname)
            print('songtime:', songtime)
            stock_info.append((songname, songtime, link))
        return stock_info

    def write_file(self, data, file_name):
        """Write *data* to *file_name* as UTF-8, one comma-separated song per line.

        Args:
            data: Iterable of ``(songname, songtime, link)`` string tuples.
            file_name: Path of the output file; it is overwritten.
        """
        with open(file_name, 'w', encoding='utf-8') as fp:
            for i in data:
                fp.write(','.join((i[0], i[1], i[2])) + '\n')


if __name__ == '__main__':
    url = 'http://www.kugou.com/'
    stockcrawler = StockCrawler()
    data = stockcrawler.get_stockinfo(url)
    stockcrawler.write_file(data, 'f:\\test\\pppp.txt')
2、sohu网站首页 爬取包含"体育"2个字的链接
# Crawl the sohu.com home page, collect every hyperlink on it, then download
# each linked page and save the ones whose text contains "体育".
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

base_url = "http://www.sohu.com"
r = requests.get(base_url, timeout=10)
r.encoding = "utf-8"
html = r.text
soup = BeautifulSoup(html, "html.parser")  # first argument is the HTML text

links = []
for i in soup.find_all("a"):
    href = i.get("href")
    if href is None:
        # Anchor without an href attribute: nothing to follow.
        continue
    href = href.strip()
    # Skip empty or one-character placeholders ("#", "/") and pseudo-links.
    if len(href) <= 1 or "javascript" in href or "mailto" in href:
        continue
    # Resolve protocol-relative ("//host/x") and relative ("/x") hrefs
    # against the page address so every stored link is fetchable.
    href = urljoin(base_url, href)
    if not href.startswith(("http://", "https://")):
        continue
    links.append(href)

for link in links:
    print(link)

x = 1  # sequence number used to name the saved files
for link in links:
    try:
        r = requests.get(link, timeout=10)
    except requests.RequestException as e:
        # One unreachable page must not abort the whole crawl.
        print(e)
        continue
    r.encoding = "utf-8"
    if "体育" in r.text:
        with open("e:\\pic\\" + str(x) + ".txt", "w", encoding="utf-8") as fp:
            fp.write(r.text)
        x += 1
3、使用代理服务器 发送请求
# Send a request through a proxy server and print the response body.
import requests

proxy = '168.0.86.146:8080'
# If the proxy requires authentication, put the credentials in front of the
# address, for example:
# proxy = 'username:password@host:port'
proxies = {
    'http': 'http://' + proxy,
    # The scheme here describes how to talk to the proxy itself, not the
    # target site. NOTE(review): this assumes a plain HTTP proxy (typical for
    # port 8080); use 'https://' only if the proxy itself speaks TLS.
    'https': 'http://' + proxy,
}
try:
    # The timeout stops an unresponsive proxy from hanging the script.
    response = requests.get('http://httpbin.org/get', proxies=proxies, timeout=10)
    print(response.text)
except (requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
    print("Error", e.args)
4、Scrapy 爬虫框架
# Scrapy 爬虫框架
'''
scrapy startproject testman

1)items.py:存储你要爬取的数据的变量,类似于字典。
2)pipelines.py(保存爬取后的数据):把抓取网页并分析后存入变量中的数据保存到某个地方(json文件、txt文件、excel、数据库)。
3)settings.py:设定spider的优先级,自动生成的,取消掉注释就可以了。
   ITEM_PIPELINES = {'gr.pipelines.GrPipeline': 300,}
   gr:抓取器的名字
4)在spider的目录下,写一下分析逻辑(从网页中取想要的数据,保存到items.py声明的变量中)。

框架使用的步骤:
1 新建scrapy工程,在任意目录下,cmd中执行:scrapy startproject groad
2 生成工程的内容,在scrapy的根目录下,cmd中执行:scrapy genspider newsong www.kugou.com
3 编写相关代码
4 在scrapy的根目录下,cmd执行抓取:scrapy crawl newsong
'''
原文地址:https://www.cnblogs.com/xiaoxiao075/p/10925542.html
时间: 2024-11-04 09:15:02