"""Crawl the CSDN blog front page and save the linked article pages to disk.

The script fetches ``url``, extracts the section links from the front-page
menu, fetches every section page, and downloads each article link found there
to ``save_dir`` as ``1.html``, ``2.html``, ... in discovery order.
"""
import re
import urllib.error
import urllib.parse
import urllib.request

url = "http://blog.csdn.net"

# The header value must be the bare UA string; the original value repeated the
# "User-Agent:" field name inside the value, producing a malformed header.
header = (
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
)

# Opener that sends the browser-like User-Agent with every request it makes.
opn = urllib.request.build_opener()
opn.addheaders = [header]

# Menu entries on the front page: captures the href of each section link.
pat = r'<li class=""><a href="(.*?)">'
# Article links on a section page: absolute blog URLs containing a digit.
pat1 = r'<a href="(http://blog.csdn.net/.*[0-9].*?)" target=.*'

# Patterns are compiled once instead of on every loop iteration.
menu_re = re.compile(pat)
article_re = re.compile(pat1)

# Target directory for downloaded pages. urlretrieve() only opens the target
# file for writing, so the directory is expected to exist already.
save_dir = "D:\\data\\"


def extract_menu_links(html):
    """Return the href values of the front-page menu entries found in *html*."""
    return menu_re.findall(html)


def extract_article_links(html):
    """Return the article URLs found in the section page *html*."""
    return article_re.findall(html)


def build_menu_url(link):
    """Return the absolute URL for a menu href.

    Relative hrefs (with or without a leading slash) are resolved against
    ``url``; hrefs that are already absolute are returned unchanged.
    """
    return urllib.parse.urljoin(url + "/", link)


def fetch_text(address):
    """Fetch *address* through ``opn`` and return the body as text.

    The body is decoded as UTF-8; undecodable bytes are replaced rather than
    aborting the crawl. Raises ``urllib.error.URLError`` on network failure.
    """
    # The context manager closes the HTTP response even if read() fails.
    with opn.open(address) as response:
        return response.read().decode("utf-8", errors="replace")


def report_url_error(err):
    """Print the HTTP status code and/or reason carried by *err*, if any."""
    if hasattr(err, "code"):
        print(err.code)
    if hasattr(err, "reason"):
        print(err.reason)


def main():
    """Run the crawl: front page -> section pages -> article downloads."""
    # urlretrieve() uses the globally installed opener, not ``opn``; install
    # it so article downloads carry the same User-Agent header.
    urllib.request.install_opener(opn)

    # A failure here is fatal: without the front page there is nothing to do.
    data = fetch_text(url)

    file_num = 0
    for all_link in extract_menu_links(data):
        try:
            data1 = fetch_text(build_menu_url(all_link))
        except urllib.error.URLError as err:
            # One unreachable section must not abort the remaining sections.
            report_url_error(err)
            continue

        for link in extract_article_links(data1):
            file_num += 1
            target = save_dir + str(file_num) + ".html"
            try:
                urllib.request.urlretrieve(link, target)
            except urllib.error.URLError as err:
                # Handle errors per link so one failed download does not
                # skip the rest of this section's articles.
                report_url_error(err)


if __name__ == "__main__":
    main()
# Time: 2024-10-08 12:09:08