# -*- coding: utf-8 -*-
"""Scrape image URLs from Google Images with Selenium, then download them in parallel.

Fixes over the original paste:
  * curly "smart quotes" replaced with real ASCII quotes — the pasted
    code did not compile, and the curly quotes inside the XPath string
    broke the attribute match at runtime;
  * the multiprocessing driver code is wrapped in an
    ``if __name__ == "__main__"`` guard, which is required on
    spawn-based platforms (e.g. Windows) — without it every worker
    process re-executes the pool-creation code;
  * HTTP responses are streamed to disk in chunks and always closed;
  * the browser is always quit, even when scraping raises.
"""

import json
import os
import time
from multiprocessing import Pool
import multiprocessing

import requests
from selenium import webdriver


def get_image_links(keyword, num_requested=1000):
    """Collect up to ``num_requested`` image URLs for ``keyword`` from Google Images.

    Drives a Chrome browser: scrolls the result page and clicks the
    "show more results" button until enough thumbnails are loaded or
    the end of the page is reached.

    Returns a set of image URLs (a set removes duplicates automatically).
    """
    # Google loads ~400 thumbnails per "page"; one outer iteration per page.
    number_of_scrolls = int(num_requested / 400) + 1
    img_urls = set()  # set: duplicate links are dropped automatically

    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')  # run without a visible window
    # chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
    # chrome_options.add_argument("lang=en_US")  # set browser language
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)  # do not load images
    driver = webdriver.Chrome(chrome_options=chrome_options)
    try:
        driver.maximize_window()
        url = "https://www.google.com/search?q=" + keyword + "&source=lnms&tbm=isch"
        driver.get(url)

        for _ in range(number_of_scrolls):
            for _ in range(5):
                # multiple small scrolls are needed to reveal all ~400 images
                driver.execute_script("window.scrollBy(0, 100000)")
                time.sleep(1)
            time.sleep(5)  # wait for the page to refresh, or the button may not be visible yet
            try:
                # The button label depends on the browser locale:
                # driver.find_element_by_xpath("//input[@value='Show more results']").click()
                driver.find_element_by_xpath("//input[@value='显示更多结果']").click()
            except Exception:
                print("reach the end of page ")
                break

        # with open('page.html', 'w') as f:
        #     f.write(driver.page_source)
        # fuzzy class match: each rg_meta div holds a JSON blob describing one image
        imgs = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
        for img in imgs:
            # "ou" is the original (full-size) image URL in that JSON blob
            img_url = json.loads(img.get_attribute('innerHTML'))["ou"]
            img_urls.add(img_url)
    finally:
        # always release the browser, even if scraping raised
        driver.quit()
    print("finish getting all image urls!")

    return img_urls


def download(urls, download_dir):
    """Download every URL in ``urls`` into ``download_dir`` (best effort).

    Any per-URL failure (network error, HTTP error status, bad path) is
    skipped so one broken link does not abort the whole batch.
    """
    print("start downloading images!")
    for url in urls:
        # file name = basename of the URL path, with any query string stripped
        filename = os.path.join(download_dir, os.path.basename(url.split("?")[0]))
        try:
            # context manager closes the connection; iter_content streams to
            # disk instead of buffering the whole body in memory
            with requests.get(url, stream=True, timeout=60) as r:
                r.raise_for_status()
                with open(filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=65536):
                        f.write(chunk)
        except Exception:
            continue  # best effort: skip broken links
    print("finish downloading images!")


def main():
    """Prepare output directories, then scrape and download with worker pools."""
    keywords = ['girl', 'boy']
    download_dir = './images/'
    download_dirs = []
    for keyword in keywords:
        path = os.path.join(download_dir, keyword)
        download_dirs.append(path)
        if not os.path.exists(path):
            os.makedirs(path)

    # Sequential version, kept for reference:
    # for keyword in keywords:
    #     image_urls = get_image_links(keyword)
    #     download(image_urls, download_dir)

    ###################################
    # get image links / multiprocess
    ###################################
    p = Pool(4)  # Pool() defaults to the CPU core count; tune as needed
    results = [p.apply_async(get_image_links, (keyword,)) for keyword in keywords]
    # results: list of multiprocessing.pool.ApplyResult objects;
    # .get() blocks until the worker finishes and returns its value
    img_urls = [r.get() for r in results]
    p.close()
    p.join()

    ###################################
    # download images / multiprocess
    ###################################
    p = Pool(4)
    for i, urls in enumerate(img_urls):
        p.apply_async(download, [urls, download_dirs[i]])
    p.close()
    p.join()


if __name__ == "__main__":
    # The guard is mandatory on platforms that spawn (not fork) workers:
    # without it each child process would re-run the pool-creation code.
    multiprocessing.freeze_support()
    main()
原文地址:https://www.cnblogs.com/buyizhiyou/p/11140128.html
时间: 2024-10-27 15:55:58