# Crawl wallpaper albums from pic.yesky.com listing pages and save every
# image to disk under <script dir>/img/<album name>/.
#
# NOTE(review): the original paste had all string quotes mangled into
# U+2018 curly quotes (a syntax error) and never created img_path before
# os.mkdir(dir_path); both are fixed below.

import os
import time
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from multiprocessing import cpu_count

import requests                    # HTTP client
from bs4 import BeautifulSoup      # HTML parser

base_path = os.path.dirname(os.path.abspath(__file__))
img_path = os.path.join(base_path, 'img')


def func(num):
    """Download every album found on listing page *num*.

    num: integer page index interpolated into the listing-page URL.

    For each <dd> entry in the page's "lb_box" div, an album directory is
    created and every image in the album is fetched at a larger size (the
    thumbnail URL's '113x113' segment is replaced with '740x-').  Errors in
    a single album are reported and skipped so one bad album does not abort
    the whole page.
    """
    response = requests.get(
        f'http://pic.yesky.com/c/6_20491_{num}.shtml', timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')  # parse the listing page
    # The album list lives in the div with class "lb_box" (found by inspection).
    div_obj = soup.find(name='div', attrs={"class": "lb_box"})
    list_dd = div_obj.find_all(name='dd')
    for dd in list_dd:  # one <dd> per album
        a_obj = dd.find('a')
        # Build the album directory; makedirs also creates img_path itself
        # on first use (os.mkdir would raise if the parent were missing).
        dir_path = os.path.join(img_path, a_obj.text)
        os.makedirs(dir_path, exist_ok=True)
        a_response = requests.get(a_obj.get('href'), timeout=30)
        a_response.encoding = 'GBK'  # album pages are GBK-encoded
        soup2 = BeautifulSoup(a_response.text, 'html.parser')
        div_obj2 = soup2.find(name='div', attrs={"class": "overview"})
        print(response.url)
        try:
            img_list = div_obj2.find_all(name='img')
            for img in img_list:
                img_src = img.get("src")
                # Request the full-size variant of the thumbnail.
                img_response = requests.get(
                    img_src.replace('113x113', '740x-'), timeout=30)
                # Name the file after the last path segment of the image URL.
                file_path = os.path.join(dir_path, img_src.rsplit('/', 1)[-1])
                with open(file_path, 'wb') as f:
                    f.write(img_response.content)
        except (AttributeError, requests.RequestException) as e:
            # div_obj2 may be None (AttributeError) or a download may fail;
            # report and move on instead of silently swallowing the error.
            print(f"skip album {a_obj.text!r}: {e}")


if __name__ == '__main__':
    start = time.time()
    # The work is I/O-bound, so a thread pool is appropriate; a
    # ProcessPoolExecutor would be the drop-in choice for CPU-bound work.
    # The context manager guarantees shutdown (wait for all tasks) even on error.
    with ThreadPoolExecutor(max_workers=cpu_count()) as pool:
        for i in range(1, 6):  # listing pages 1..5
            pool.submit(func, i)
    print("执行时间:{}".format(time.time() - start))
原文地址:https://www.cnblogs.com/zhang-da/p/12210152.html
时间: 2024-10-09 08:41:13