# coding=utf-8
"""Mirror the Spring XSD schema files listed under <host>/schema.

The script reads the directory listing at ``SCHEMA_ROOT``, creates one local
folder per schema sub-directory inside ``basedir`` and downloads every
``*.xsd`` file found in each sub-directory with ``urlretrieve``.

The code is written so that it runs on Python 2 (the version the script was
originally written for) and on Python 3: ``print`` is always called with a
single parenthesised argument and the urllib names are imported through a
small compatibility shim.
"""
import contextlib
import os
import re
import socket

try:  # Python 2
    from urllib import urlretrieve
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen, urlretrieve

from bs4 import BeautifulSoup

# Network timeout, in seconds, applied to every request.
TIMEOUT_SECONDS = 5

# Local target directory; it must already exist (ideally empty).
basedir = u"E:\\spring"

host = "http://www.springframework.org"

# Directory listing that enumerates the individual schema folders.
SCHEMA_ROOT = host + "/schema"

# Captures everything after the last "/" of a URL.
_LAST_SEGMENT = re.compile(r".*/(.*)")


def getFolderName(url):
    """Return the part of *url* after its last "/".

    Returns ``None`` when *url* contains no "/" at all.
    """
    match = _LAST_SEGMENT.match(url)
    return match.group(1) if match else None


def getFilesByUrl(url):
    """Return the relative ``href`` values of interest found on the page *url*.

    An ``href`` is kept when it does not start with "/" and ends with either
    "xsd" (a schema file) or "/" (a sub-directory).

    If the page cannot be fetched, the error is printed and an empty list is
    returned, so callers can simply iterate over the result.
    """
    try:
        # closing() releases the connection on both Python 2 and Python 3.
        with contextlib.closing(urlopen(url, timeout=TIMEOUT_SECONDS)) as response:
            markup = response.read()
    except Exception as err:  # best effort: report the failure and carry on
        print(err)
        return []

    soup = BeautifulSoup(markup, "lxml")
    # href=True skips anchors that have no href attribute at all.
    return [
        link["href"]
        for link in soup.find_all("a", href=True)
        if not link["href"].startswith("/")
        and link["href"].endswith(("xsd", "/"))
    ]


def main():
    """Download every schema folder listed under ``SCHEMA_ROOT`` into ``basedir``."""
    # urlretrieve() takes no timeout argument, so set the process-wide default.
    socket.setdefaulttimeout(TIMEOUT_SECONDS)
    os.chdir(basedir)

    entries = getFilesByUrl(SCHEMA_ROOT)
    print(entries)

    for entry in entries:
        # Only directory entries ("beans/", "aop/", ...) name a schema folder;
        # stripping the last character of anything else would corrupt the name.
        if not entry.endswith("/"):
            continue
        name = entry[:-1]
        print(name)

        url = SCHEMA_ROOT + "/" + name
        folder = getFolderName(url)
        print(folder)
        if not os.path.exists(folder):
            os.mkdir(folder)

        files = getFilesByUrl(url)
        print(files)
        for fileName in files:
            # Nested directories cannot be saved as a file; skip them.
            if fileName.endswith("/"):
                continue
            source = url + "/" + fileName
            try:
                print(source)
                urlretrieve(source, os.path.join(folder, fileName))
            except Exception as err:  # keep going with the remaining files
                print(err)


if __name__ == "__main__":
    main()
# Time: 2024-12-21 18:38:33