# Crawl the skin images of every League of Legends hero.
import os
import re
import traceback  # exception tracing

import requests
from bs4 import BeautifulSoup

# Directory that receives the downloaded skin images.
SAVE_DIR = 'O:/lol_hero_jpg/'
# Patterns matched against the CSS class of the elements that are scraped.
HERO_ITEM_CLASS = re.compile('boxShadow')
SKIN_BOX_CLASS = re.compile('othersPifuBox')
# Characters that are not allowed in a Windows file name (e.g. the "/" in
# a skin name would otherwise be read as a directory separator).
_ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def get_url(url, hander):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address of the page to download.
        hander: Dict of HTTP headers sent with the request.

    Returns:
        The decoded page text, or "" when the request fails (the traceback
        is printed instead of being raised).
    """
    try:
        r = requests.get(url, headers=hander, timeout=30)
        r.raise_for_status()
        # Let requests guess the encoding from the body rather than trust
        # the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        traceback.print_exc()
        return ""


def prasing_page(lst, html):
    """Collect the hero-page links of one index page into *lst*.

    Every ``<a>`` that carries an ``href`` and sits inside an ``<li>`` whose
    class matches ``boxShadow`` is appended.

    Args:
        lst: List that receives the links; it is modified in place.
        html: Markup of the index page.

    Returns:
        *lst*, also when parsing fails (the traceback is printed and the
        links gathered so far are kept).
    """
    try:
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('li', class_=HERO_ITEM_CLASS):
            # href=True skips anchors without a link instead of raising.
            lst.extend(a['href'] for a in item.find_all('a', href=True))
    except Exception:
        traceback.print_exc()
    return lst


def _safe_filename(name):
    """Return *name* with characters illegal in file names replaced by "_"."""
    return _ILLEGAL_FILENAME_CHARS.sub('_', name)


def getUrl_prasingpag(lst, hander):
    """Parse each hero page in *lst* and download the skin images it lists.

    For every ``<div>`` whose class matches ``othersPifuBox`` the text of its
    ``<p>`` tags is used as skin name, the text of its ``<span>`` tags as
    hero name and the ``src`` of its ``<img>`` tags as image address. Each
    image is saved as ``<skin name>--<hero name>.jpg`` in ``SAVE_DIR``.

    Args:
        lst: URLs of the hero pages.
        hander: Dict of HTTP headers sent with every request.

    Failures are reported with a traceback and skipped, so one broken page
    or image does not stop the remaining downloads.
    """
    hero_img_url = []
    hero_skin_name = []
    hero_name = []
    for u in lst:
        html = get_url(u, hander)  # prints the traceback itself on failure
        if not html:
            continue
        try:
            # Second-level parse: the hero's own page.
            soup = BeautifulSoup(html, "html.parser")
            for box in soup.find_all('div', class_=SKIN_BOX_CLASS):
                hero_skin_name.extend(p.string for p in box.find_all('p'))
                # .get keeps the three lists aligned when an <img> has no
                # src; the missing address is reported at download time.
                hero_img_url.extend(
                    img.get('src') for img in box.find_all('img'))
                hero_name.extend(
                    span.string for span in box.find_all('span'))
        except Exception:
            traceback.print_exc()
            continue

    # Save to local disk.
    skins = list(zip(hero_skin_name, hero_name, hero_img_url))
    if not skins:
        return
    try:
        os.makedirs(SAVE_DIR, exist_ok=True)
    except OSError:
        traceback.print_exc()
        return
    total = len(skins)
    for done, (skin, hero, img_url) in enumerate(skins, start=1):
        try:
            # Fetch before opening the file so a failed request does not
            # leave an empty .jpg (or an open handle) behind.
            r = requests.get(img_url, headers=hander, timeout=30)
            r.raise_for_status()
            path = (SAVE_DIR + _safe_filename(skin) + '--'
                    + _safe_filename(hero) + '.jpg')
            with open(path, 'wb') as f:
                f.write(r.content)
            # Progress is measured against the number of images, not the
            # number of hero pages.
            print("\r当前进度>>>>>>>>>>>>>>>>>>{:.0f}%>>>>>>>>>>>>>>>>>>".format(done * 100 / total), end="")
        except Exception:
            # Best effort: report the failure and go on with the next image.
            traceback.print_exc()
            continue


def main():
    """Crawl all hero index pages, then download every skin image."""
    hander = {"User-Agent": "Mozilla/5.0"}
    deep = 43  # number of index pages to crawl
    hero_links = []
    for i in range(deep):
        url = "http://********/hero_" + str(1 + i) + ".shtml"
        html = get_url(url, hander)
        if html:
            prasing_page(hero_links, html)
    # Download once, after all links are known; doing it inside the loop
    # re-crawled every previously collected hero on each iteration.
    # dict.fromkeys drops duplicate links while keeping their order.
    getUrl_prasingpag(list(dict.fromkeys(hero_links)), hander)


if __name__ == "__main__":
    main()
# Original article: https://www.cnblogs.com/llww/p/12149699.html
# Time: 2024-10-28 09:45:35