# Import modules
import requests, os, sys, re
from bs4 import BeautifulSoup


# Create the output folder
path = os.getcwd()
new_path = os.path.join(path, '小姐姐')
if not os.path.isdir(new_path):
    os.mkdir(new_path)

# Main crawler

# Request headers: some sites refuse requests that have no User-Agent header
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
response = requests.get('http://www.mmonly.cc/tag/ltmn/', headers=headers)
# print(response)

# Decode: response.content is a raw byte stream, and this site uses a GB encoding
con = response.content.decode('gb18030')
soup = BeautifulSoup(con, 'lxml')  # parser used to interpret the page
# Find every tag with class ABox (one per gallery)
my_gril = soup.select('.ABox')
# print(soup)
# print(my_gril)
# Iterate over the galleries and get each gallery's URL
for gril in my_gril:
    j = gril.find('a')
    link = j.get('href')
    flink = link
    # print(flink)
    response = requests.get(flink)
    con1 = response.content.decode('gb18030')
    soup1 = BeautifulSoup(con1, 'lxml')
    # print(con1)
    # Find all <a> tags inside the element with class "pages"
    my_gril1 = soup1.select('.pages ul li a')
    # print(my_gril1)
    # Text of the first <a> tag (it states the total number of pages)
    s = my_gril1[0].get_text()
    # Strip non-digits to get the picture count; the site shows one picture
    # per page, so this is also the number of pages in the gallery
    num = int(re.sub(r'\D', '', s))
    # print(num)
    # Iterate over every page of the gallery to get each picture's URL
    for page in range(1, num + 1):
        if page == 1:
            flink1 = link
        else:
            flink1 = link.replace('.html', '_%s.html' % page)
        response = requests.get(flink1)
        con2 = response.content.decode('gb18030')
        soup2 = BeautifulSoup(con2, 'lxml')
        my_gril2 = soup2.select('#big-pic img')
        # print(my_gril2)
        # Find the picture on the page
        for pic in my_gril2:
            pic_link = pic.get('src')
            flink2 = pic_link
            # print(flink2)
            response = requests.get(flink2)
            con3 = response.content
            # Write the picture into the '小姐姐' folder
            with open(u'小姐姐' + '/' + flink2[-11:], 'wb') as code:
                code.write(con3)

# Final reminder: don't overdo it
The earlier script only crawled the images inside a single gallery; this version crawls every image in every gallery under a whole category (which is a lot to take in). The main modules used are requests, os, sys, re and BeautifulSoup.
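If you only care about the pagination trick the crawler relies on, here is a minimal sketch of it in isolation. The gallery URL and the page-count text below are made-up placeholders in the same format the site above appears to use (page 1 is xxx.html, page n is xxx_n.html); the sketch only builds the per-page URLs and downloads nothing.

import re

# Hypothetical gallery link and page-count text, in the format used above
link = 'http://www.mmonly.cc/mmtp/12345.html'
pages_text = '共8页: '   # text of the first <a> under .pages

# Strip everything that is not a digit to get the page count
num = int(re.sub(r'\D', '', pages_text))

# Page 1 keeps the original URL; page n inserts "_n" before ".html"
for page in range(1, num + 1):
    page_url = link if page == 1 else link.replace('.html', '_%s.html' % page)
    print(page_url)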
Original article: https://www.cnblogs.com/hcq-learning/p/9076997.html