前言
emmmm
Python 简单图片抓取
# Multi-threaded image downloader for jandan.net picture pages.
import base64
import os
import queue
import threading
import urllib
import urllib.parse
import urllib.request
from subprocess import Popen, PIPE

import requests
from bs4 import BeautifulSoup as bs

# Directory the downloaded images are written to.
SAVE_DIR = "C:\\Users\\xhds1\\Desktop\\img\\"

# Seconds to wait for a page response before giving up on that page.
REQUEST_TIMEOUT = 10

# Work queue of page URLs shared by all worker threads. It is deliberately
# not called "queue": that would rebind the imported module and make
# queue.Empty unreachable.
url_queue = queue.Queue()


def _absolute_url(page_url, src):
    """Return a downloadable URL for an <img> source found on page_url."""
    # Protocol-relative sources ("//host/path") have no scheme, so add one.
    if src.startswith("//"):
        return "http:" + src
    # Absolute URLs come back unchanged; relative ones are resolved
    # against the page they were found on.
    return urllib.parse.urljoin(page_url, src)


class Jiandan(threading.Thread):
    """Worker thread that takes page URLs from a queue and saves their images."""

    def __init__(self, queue):
        """Remember the shared queue of page URLs.

        Args:
            queue: a queue.Queue holding page URL strings.
        """
        super().__init__()
        self._queue = queue

    def run(self):
        """Process page URLs until the queue has been drained."""
        while True:
            # Checking empty() first is racy: another worker may take the
            # last item before get_nowait() runs. Rely on the exception.
            try:
                url = self._queue.get_nowait()
            except queue.Empty:
                return
            self.spider(url)

    def spider(self, url):
        """Fetch one page and download every <img> on it into SAVE_DIR.

        A page that cannot be fetched, or an image that cannot be
        downloaded, is reported with print() and skipped so that the
        worker keeps going.

        Args:
            url: address of the page to scan for images.
        """
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print("failed to fetch %s: %s" % (url, exc))
            return

        os.makedirs(SAVE_DIR, exist_ok=True)

        # Parse with the lxml parser and collect every <img> element.
        soup = bs(response.content, "lxml")
        for img in soup.find_all(name="img", attrs={}):
            # Tags that mention "onload" keep the image address in
            # org_src; every other tag uses the plain src attribute.
            if "onload" in str(img):
                src = img.get("org_src") or img.get("src")
            else:
                src = img.get("src")
            if not src:
                continue

            name = src.split("/")[-1]
            if not name:
                continue

            # Local path the image is stored at.
            target = os.path.join(SAVE_DIR, name)
            print(target)
            try:
                urllib.request.urlretrieve(_absolute_url(url, src), target)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; ValueError
                # covers sources that are not a usable URL.
                print("failed to download %s: %s" % (src, exc))


def main():
    """Queue the target pages and download them with a pool of workers."""
    for i in range(137, 139):
        riqi = "20200121-%s" % i

        # The page path is the base64 encoding of the token. b64encode()
        # returns bytes, so decode back to str before building the URL.
        strbs = base64.b64encode(riqi.encode(encoding="utf-8")).decode("utf-8")

        url_queue.put("https://jandan.net/pic/" + strbs + "#comments")

    thread_count = 5
    threads = [Jiandan(url_queue) for _ in range(thread_count)]

    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == "__main__":
    main()
参考学习:
浅析Python3中的bytes和str类型:https://www.cnblogs.com/chownjy/p/6625299.html
https://www.cnblogs.com/OliverQin/p/8641700.html
原文地址:https://www.cnblogs.com/xhds/p/12227818.html
时间: 2024-10-01 07:54:23