# -*- coding: utf-8 -*-
"""Small crawler that downloads Baidu image-search results.

Author's note (translated): "I have been learning Python for a while, and
over the last few days I wanted to write a small crawler that fetches
Baidu images."

The search page is rendered with selenium + PhantomJS, the image URLs are
pulled out of the ``data-objurl`` attributes with a regular expression,
and every result page is downloaded on its own thread into a directory
named after the search keyword.

The script targets Python 2 (as originally written) and also imports
cleanly on Python 3.
"""
# Time: 2024-10-08 07:47:30

import os
import re
import socket
import sys
import threading
import time
import urllib

try:  # Python 2 module layout
    import urllib2
    _quote = urllib.quote
    _urlopen = urllib2.urlopen
except ImportError:  # Python 3 module layout
    import urllib.parse
    import urllib.request
    _quote = urllib.parse.quote
    _urlopen = urllib.request.urlopen

try:
    from selenium import webdriver
except ImportError:
    # Keep the pure helpers (req, mkdir, ...) importable without selenium;
    # get_html() reports the missing dependency when it is actually needed.
    webdriver = None

# Applies to every socket opened afterwards, including the image downloads.
socket.setdefaulttimeout(15.0)

SEARCH_URL = 'https://image.baidu.com/search/index?tn=baiduimage&word={}&pn={}'
# Example of the sibling thumbnail attribute seen on the same page:
# data-thumburl="https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=2766886107,1571085905&fm=23&gp=0.jpg"
OBJURL_PATTERN = re.compile(r'data-objurl="(http://.*?)"')
MAX_ACTIVE_THREADS = 20
PAGE_STEP = 20

# Serialises "pick the next file name + create the file" across threads.
_SAVE_LOCK = threading.Lock()


def _warn(message):
    """Write a one-line diagnostic to stderr."""
    sys.stderr.write('%s\n' % message)


def mkdir(name):
    """Create directory *name* unless something already exists at that path."""
    if not os.path.exists(name):
        os.mkdir(name)


def get_html(name, papg):
    """Return the rendered HTML of one Baidu image-search page.

    Args:
        name: search keyword; it is URL-quoted before being sent.
        papg: value for the ``pn`` query parameter (the script advances it
            by 20 per page).

    Returns:
        The page source, or ``None`` if anything went wrong.
    """
    if webdriver is None:
        _warn('selenium is not installed; cannot load the search page')
        return None

    driver = None
    try:
        url = SEARCH_URL.format(_quote(name), papg)
        driver = webdriver.PhantomJS()
        driver.get(url)
        return driver.page_source
    except Exception as exc:
        # Boundary with a third-party driver: callers expect None on failure.
        _warn('failed to load search page (pn=%s): %s' % (papg, exc))
        return None
    finally:
        # Always shut the browser down, even when loading failed, so that
        # PhantomJS processes do not pile up.
        if driver is not None:
            try:
                driver.quit()
            except Exception as exc:
                _warn('failed to quit PhantomJS: %s' % exc)


def req(html):
    """Extract the image URLs from *html*.

    Returns:
        A list with the value of every ``data-objurl="http://..."``
        attribute (possibly empty), or ``None`` when *html* is not a string
        (e.g. the ``None`` returned by a failed :func:`get_html`).
    """
    try:
        return OBJURL_PATTERN.findall(html)
    except TypeError:
        return None


def _save_image(directory, payload):
    """Write *payload* to the next numbered file inside *directory*."""
    # The file name is the current entry count of the directory. Counting
    # and creating the file must happen atomically, otherwise two threads
    # can pick the same number and overwrite each other.
    with _SAVE_LOCK:
        path = os.path.join(directory, '%s' % len(os.listdir(directory)))
        with open(path, 'wb') as handle:
            handle.write(payload)


def Loadown(req, directory=None):
    """Download every URL in *req* into *directory*.

    Args:
        req: iterable of image URLs.
        directory: target directory; defaults to the module-level ``name``
            set by the script entry point.

    Failures are reported on stderr and skipped, so one bad URL does not
    abort the rest of the batch.
    """
    if directory is None:
        directory = name  # global defined in the ``__main__`` section

    for url in req:
        try:
            # A single request: the body is read from the same response
            # whose status is checked, and the response is always closed.
            response = _urlopen(url)
            try:
                if response.getcode() != 200:
                    continue
                payload = response.read()
            finally:
                response.close()
            print(url)
            _save_image(directory, payload)
        except Exception as exc:
            # Best-effort download: report and move on to the next URL.
            _warn('skipped %s: %s' % (url, exc))


def three(req):
    """Start a download thread for *req*, then throttle the caller.

    The call blocks while more than ``MAX_ACTIVE_THREADS`` threads are
    alive, sleeping between checks instead of spinning.
    """
    threading.Thread(target=Loadown, args=(req,)).start()
    while threading.active_count() > MAX_ACTIVE_THREADS:
        time.sleep(0.1)


def ForImg(papg):
    """Fetch the result page at offset *papg* and download its images.

    Uses the module-level ``name`` as the search keyword.
    """
    html = get_html(name, papg)
    res = req(html)
    if res:  # None (page failed) or empty (no matches): nothing to download
        three(res)


def _read_line(prompt):
    """Prompt on stdin with the builtin available on this Python version."""
    try:
        ask = raw_input  # Python 2
    except NameError:
        ask = input  # Python 3
    return ask(prompt)


def _normalise_keyword(raw):
    """Return the keyword in the form the rest of the script expects.

    On Python 2 the terminal's byte string is re-encoded as UTF-8; on
    Python 3 the text is returned unchanged.
    """
    if isinstance(raw, bytes):
        # stdin.encoding is None when input is piped; fall back to UTF-8.
        raw = raw.decode(sys.stdin.encoding or 'utf-8').encode('utf-8')
    return raw


if __name__ == '__main__':
    print('----------------------------------------------------------------------------')
    name = _normalise_keyword(_read_line('请输入搜索的目标:'))
    mkdir(name)
    s = _read_line('请输入需要几页数据').strip()
    if not s.isdigit():
        print('请输入数字')
        # Stop here: continuing would call range() with a string.
        sys.exit(1)
    for page_index in range(int(s)):
        ForImg(page_index * PAGE_STEP)