闲来无事，看到知乎上关于 Python 的种种优点，按捺不住，便装上了 Python 3.4。
在网上找了一段爬取图片的代码，修改后兼容 Python 3.4，成功爬取了指定 URL 页面中的所有 JPG 图片，代码如下：
import os
import re
import urllib
import urllib.request

# Crawl images: download every .jpg referenced by an <img src="..."> on a page.

# Directory containing this script; images are saved in a "test" folder below it.
download_path = os.path.dirname(os.path.abspath(__file__))


class spider(object):
    """Fetch one HTML page and download every JPG image it references.

    Typical use is ``spider(url).run()``, which fetches the page, extracts
    the image URLs with :meth:`parse` and saves them with :meth:`downloads`.
    """

    # Absolute http(s) URL ending in ".jpg" inside a double-quoted src
    # attribute.  ``[^"]*?`` keeps the match inside a single attribute, so
    # several images on the same HTML line yield separate URLs instead of
    # one greedy match running from the first "http://" to the last ".jpg".
    # Trailing whitespace before the closing quote is tolerated and dropped.
    _JPG_SRC = re.compile(r'src="(https?://[^"]*?\.jpg)\s*"')

    def __init__(self, url):
        """Remember the URL of the page to crawl."""
        self.url = url

    def parse(self, content):
        """Return the list of JPG URLs found in ``src="..."`` attributes.

        ``content`` is the page's HTML as ``str``.  URLs are returned in
        document order; duplicates are kept.  An empty list is returned
        when nothing matches.
        """
        return self._JPG_SRC.findall(content)

    def downloads(self, urls):
        """Download each URL in ``urls`` into ``<download_path>/test``.

        The target directory is created if it does not exist.  Each file is
        named after the last path segment of its URL, so two URLs sharing
        that segment overwrite each other.  Errors raised by
        ``urllib.request.urlretrieve`` propagate to the caller.
        """
        d_path = os.path.join(download_path, "test")
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(d_path, exist_ok=True)
        for url in urls:
            filename = url.split("/")[-1]
            print(url)
            print("Downloads %s" % (filename))
            output = os.path.join(d_path, filename)
            urllib.request.urlretrieve(url, output)

    def run(self):
        """Fetch ``self.url``, extract its JPG URLs and download them."""
        # Read everything up front and close the page connection before the
        # (potentially long) image downloads start.
        with urllib.request.urlopen(self.url) as fd:
            raw = fd.read()
            charset = fd.headers.get_content_charset() or "UTF-8"
        # Prefer the charset declared by the server (many pages are GBK, not
        # UTF-8).  Undecodable bytes are replaced rather than aborting the
        # crawl, since only the ASCII image URLs matter here.
        try:
            content = raw.decode(charset, errors="replace")
        except LookupError:
            # The server announced a charset name Python does not know.
            content = raw.decode("UTF-8", errors="replace")
        self.downloads(self.parse(content))


if __name__ == "__main__":
    sp = spider("http://news.cnfol.com/img/20150814/17638.shtml")
    sp.run()
时间: 2024-10-24 18:45:37