以慕课网http://www.imooc.com/首页为例
源代码:
#coding:utf-8 class Outputer: def __init__(self,directory): self.directory=directory def output(self,file,content): try: f=open(self.directory+file,‘wb‘) f.write(content) except Exception as err: print(err) finally: f.close() print(file+"has writed!") class SpiderMan: def __init__(self,url,directory): self.url=url self.outputer=Outputer(directory) def crawl(self): resp=request.urlopen(self.url) buff=resp.read() pattern=r‘http:.+\.jpg‘ jpgurls=re.findall(pattern,buff.decode(‘utf-8‘),flags=0) count=1 for jurl in jpgurls: try: content=request.urlopen(jurl).read() print(count) count=count+1 self.outputer.output(‘imooc_‘+str(count)+‘.jpg‘,content) except UnicodeEncodeError as err: print(err) if __name__ == ‘__main__‘: from urllib import request import re #http://www.imooc.com/ url=‘http://www.imooc.com/‘ spider=SpiderMan(url,‘./spider_imooc/‘) spider.crawl()
运行图:
时间: 2024-11-13 00:30:24