import json
import threading
import urllib.parse
from pathlib import Path

import requests

# Bounded semaphore caps concurrent download threads at 10.
threading_lock = threading.BoundedSemaphore(value=10)

# Destination directory for downloaded images; created on demand.
PICS_DIR = Path(r'C:\Users\Administrator\Desktop\爬取堆糖图片\pics')


def get_page(url):
    """Fetch *url* and return the response body decoded as UTF-8 text."""
    response = requests.get(url)
    return response.content.decode('utf-8')


def pages_from_duitang(label):
    """Download duitang search-result pages for *label*.

    Fetches results in batches of 100 (offsets 0..2900) and returns the
    raw response text of each page as a list of strings.
    """
    pages = []
    url = 'https://www.duitang.com/napi/blog/list/by_search/?kw={}&type=feed&start={}&limit=100'
    label = urllib.parse.quote(label)
    for index in range(0, 3000, 100):
        page_url = url.format(label, index)
        print(page_url)
        pages.append(get_page(page_url))
    return pages


def findall_page(page, startpaet, endstart):
    """Return every substring of *page* found between *startpaet* and *endstart*.

    Scans left to right: each hit begins right after an occurrence of
    *startpaet* and ends at the next occurrence of *endstart*.
    """
    all_strat = []
    end = 0
    while page.find(startpaet, end) != -1:
        start = page.find(startpaet, end) + len(startpaet)
        end = page.find(endstart, start)
        all_strat.append(page[start:end])
    return all_strat


def pic_urls_from_pages(pages):
    """Collect every image URL (the JSON "path" values) from the raw pages."""
    pic_urls = []
    for page in pages:
        pic_urls.extend(findall_page(page, '"path":"', '"'))
    return pic_urls


def download_pics(url, name):
    """Download one image to PICS_DIR as <name>.jpg.

    Always releases the download semaphore — even when the request or the
    file write fails — so main() cannot leak permits and deadlock.
    """
    try:
        response = requests.get(url)
        PICS_DIR.mkdir(parents=True, exist_ok=True)
        (PICS_DIR / '{}.jpg'.format(name)).write_bytes(response.content)
    finally:
        threading_lock.release()


def main(label):
    """Crawl duitang for *label* and download every found image concurrently."""
    pages = pages_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    for name, url in enumerate(pic_urls, start=1):
        # Block until a download slot frees up; download_pics releases it.
        threading_lock.acquire()
        print('正在下载第{}张图片'.format(name))
        t = threading.Thread(target=download_pics, args=(url, name))
        t.start()


if __name__ == '__main__':
    main('表情包')
# Time: 2024-11-03 01:25:05