百度图片的部分返回结果存在编码问题,暂时无法爬取;遇到这种情况可以多试几次。
# -*- coding:utf-8 -*-
"""Download Baidu image-search results into a local directory.

Approach: query Baidu's ``acjson`` search endpoint, read the image URLs out of
the JSON it returns, and store each image under a file name derived from the
title of the page it came from.  The awkward part is the encoding of the
response, which is why decoding and JSON parsing are done defensively below.
"""
import json
import os
import re
from urllib import error, request

# Search endpoint for the keyword encoded in queryWord/word.  ``{pn}`` is the
# offset of the first result to return; ``rn=30`` asks for 30 results per page.
_SEARCH_URL = (
    'http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj'
    '&ct=201326592&is=&fp=result'
    '&queryWord=%E5%93%88%E5%A3%AB%E5%A5%87&cl=2&lm=-1&ie=utf-8&oe=utf-8'
    '&adpicid=&st=-1&z=&ic=0&word=%E5%93%88%E5%A3%AB%E5%A5%87&s=&se=&tab='
    '&width=&height=&face=0&istype=2&qc=&nc=1&fr=&pn={pn}&rn=30&gsm=1e'
    '&1520997014923='
)

_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/64.0.3282.140 Safari/537.36',
}

# Number of results per page; also the step by which the offset advances.
_PAGE_SIZE = 30

# Characters removed from page titles before they are used as file names.
_ILLEGAL_FILENAME_CHARS = re.compile(r'[?\\*|"<>:/!]')

# Result fields that may hold an image address, in order of preference.
_URL_FIELDS = ('middleURL', 'thumbURL', 'hoverURL')


def _load_json(content):
    """Parse *content* as JSON, retrying leniently if strict parsing fails.

    NOTE(review): the retry (unescaping ``\\'`` and allowing raw control
    characters) is presumably what the "encoding problem" mentioned alongside
    this script needs -- confirm against a failing response.

    Raises:
        ValueError: if the text cannot be parsed even leniently.
    """
    try:
        return json.loads(content)
    except ValueError:
        # Only rewrite the text after a strict parse has failed, so that a
        # valid payload containing an escaped backslash is never corrupted.
        return json.loads(content.replace("\\'", "'"), strict=False)


def _pick_url(image):
    """Return the first non-empty image address in *image*, or ``''``."""
    for field in _URL_FIELDS:
        url = image.get(field)
        if url:
            return url
    return ''


class BaiduImg(object):
    """Fetch pages of Baidu image-search results and save the images.

    Args:
        max_page: largest result offset (``pn``) to request.  The default of
            30 fetches exactly one page, starting at offset 30.
        save_dir: directory the images are written to; created on demand.
        timeout: timeout in seconds for the requests made with ``urlopen``.
    """

    def __init__(self, max_page=30, save_dir='images', timeout=10):
        super(BaiduImg, self).__init__()
        print('开始采集图片')
        self.page = _PAGE_SIZE
        self.max_page = max_page
        self.save_dir = save_dir
        self.timeout = timeout

    def request(self):
        """Fetch result pages up to ``max_page`` and download their images.

        A page that cannot be parsed is reported and skipped so that the
        remaining pages are still processed.
        """
        while self.page <= self.max_page:
            # The offset is taken from self.page so that advancing the page
            # actually requests a different slice of results.
            request_url = _SEARCH_URL.format(pn=self.page)
            req = request.Request(request_url, headers=_HEADERS)
            with request.urlopen(req, timeout=self.timeout) as f:
                if f.status == 200:
                    # errors='replace' keeps one bad byte from discarding the
                    # whole page; at worst a title gains a replacement char.
                    content = f.read().decode('utf-8', errors='replace')
                    try:
                        content_dict = _load_json(content)
                    except ValueError as exc:
                        print('skip page {}: {}'.format(self.page, exc))
                    else:
                        self.download2(content_dict.get('data') or [])
            self.page += _PAGE_SIZE

    def _iter_targets(self, data):
        """Yield ``(url, file_path)`` for every entry of *data* with a URL.

        Entries without a usable address are skipped, as are entries that
        are not dicts.
        """
        os.makedirs(self.save_dir, exist_ok=True)
        for index, image in enumerate(data or []):
            if not isinstance(image, dict):
                continue
            url = _pick_url(image)
            if not url:
                continue
            name = strip(image.get('fromPageTitleEnc', ''))
            if not name:
                # Without a title every such image would be written to the
                # same ".jpg" file; fall back to a positional name instead.
                name = 'image_{}_{}'.format(self.page, index)
            yield url, os.path.join(self.save_dir, name + '.jpg')

    # Download the images by reading the response and writing the bytes out.
    def dowload(self, data):
        """Save every image in *data* using ``urlopen`` and a manual write.

        The misspelled method name is kept for backward compatibility.
        A failure on one image is reported and does not stop the others.
        """
        for url, file_path in self._iter_targets(data):
            try:
                with request.urlopen(url, timeout=self.timeout) as resp:
                    payload = resp.read()
                with open(file_path, 'wb') as f:
                    f.write(payload)
            except (error.URLError, OSError) as exc:
                print('skip {}: {}'.format(url, exc))

    # Save the images with urllib.request.urlretrieve().
    def download2(self, data):
        """Save every image in *data* using ``urllib.request.urlretrieve``.

        ``urlretrieve`` accepts no timeout argument, so ``self.timeout`` does
        not apply here.  A failure on one image is reported and does not stop
        the others.
        """
        for url, file_path in self._iter_targets(data):
            try:
                request.urlretrieve(url, file_path)
            except (error.URLError, OSError) as exc:
                print('skip {}: {}'.format(url, exc))


# Filter function for file names.
def strip(path):
    """Return ``str(path)`` with the characters ``? \\ * | " < > : / !`` removed."""
    return _ILLEGAL_FILENAME_CHARS.sub('', str(path))


if __name__ == '__main__':
    bi = BaiduImg()
    bi.request()
原文地址:https://www.cnblogs.com/hellangels333/p/8591684.html
时间: 2024-11-06 16:35:34