from bs4 import BeautifulSoup
import requests
import os
from time import sleep


class get_img(object):
    def get_url(self):
        self.new_url = []
        url = "http://www.gumua.com/Manhua/28307.html"  # index page of the comic
        headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0"}
        res = requests.get(url=url, headers=headers)  # fetch the index page
        soup = BeautifulSoup(res.content, "lxml")  # parse it with bs4
        divs = soup.find(class_="d_menu")  # locate the chapter menu
        lis = divs.select("li > a")  # <a> tags nested under <li>
        for i in lis:
            link = i.get("href")
            self.new_url.append("http://www.gumua.com" + link)  # collect every chapter URL
            num = i.string  # chapter number (collected but unused)
        self.new_url.reverse()  # reverse so the chapters run first to last
        return self.new_url

    def img(self):
        self.get_url()
        imgs = []
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
        for page in self.new_url:
            res = requests.get(url=page, headers=headers)  # fetch each chapter page
            soup = BeautifulSoup(res.content, "lxml")  # parse it with bs4
            # print(soup)
            body = soup.find("div", class_="r_img ")  # image container (trailing space kept verbatim from the original, matching the site's class attribute)
            srcs = body.select("img")  # the comic images
            for tag in srcs:
                imgs.append(tag.get("src"))  # extract each image URL and append it to the list
        for i in range(len(imgs)):
            root = "/Users/caojialin/work2018/木乃伊新娘/"  # local target directory
            path = root + str(i) + ".jpg"  # path and filename
            try:
                if not os.path.exists(root):
                    os.makedirs(root)  # makedirs, so missing parent directories are created too
                if not os.path.exists(path):
                    r = requests.get(imgs[i])
                    r.raise_for_status()
                    sleep(0.1)
                    # "with" closes the file automatically, no manual close needed
                    with open(path, "wb") as f:  # "wb": write in binary mode
                        f.write(r.content)
                    print("download finished")
                else:
                    print("file already exists")
            except Exception as e:
                print("download failed: " + str(e))


if __name__ == "__main__":
    get_img().img()  # entry point; not in the original post, added so the script runs as-is
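
A minimal, self-contained sketch of the selection pattern get_url() relies on: find(class_="d_menu") grabs the chapter menu, and select("li > a") pulls the anchor under each list item. The HTML snippet here is invented for illustration; the real site's markup may differ.

from bs4 import BeautifulSoup

html = """
<div class="d_menu">
  <ul>
    <li><a href="/Manhua/28307/1.html">Chapter 1</a></li>
    <li><a href="/Manhua/28307/2.html">Chapter 2</a></li>
  </ul>
</div>
"""

soup = BeautifulSoup(html, "lxml")
menu = soup.find(class_="d_menu")  # same lookup as in get_url()
links = ["http://www.gumua.com" + a.get("href") for a in menu.select("li > a")]
print(links)
# ['http://www.gumua.com/Manhua/28307/1.html', 'http://www.gumua.com/Manhua/28307/2.html']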
Original article: https://www.cnblogs.com/paoye/p/9767194.html