The code is as follows (written for Python 3.5):
————————————————————————————————————————
import urllib.request
import re
import os

# Open a web page and return the raw HTML bytes
def url_open(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')
    # Pass the Request object, not the bare URL, otherwise the
    # User-Agent header set above is never actually sent
    response = urllib.request.urlopen(req)
    html = response.read()
    return html

# Download and save the images found in each thread
def download(urls):
    for link in urls:
        html = url_open(link).decode('utf-8')
        p = r'<img class="BDE_Image" src="([^"]+\.jpg)'
        result = re.findall(p, html)
        for i in result:
            filename = i.split('/')[-1]
            urllib.request.urlretrieve(i, filename, None)

# Collect the links of all threads on the current page
def findlink(url):
    html = url_open(url).decode('utf-8')
    p = r'<a href="/p/\d+"'
    result = re.findall(p, html)
    urls = []
    for i in result:
        # i looks like '<a href="/p/12345"', so splitting on '"'
        # leaves the thread path '/p/12345' in the [-2] slot
        newurl = 'http://tieba.baidu.com' + i.split('"')[-2]
        urls.append(newurl)
    download(urls)

# Create the output folder, switch into it, and start crawling
def start(folder='picture'):
    os.makedirs(folder, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(folder)
    findlink(url)

#name = urllib.parse.quote(input('Enter the tieba (forum) name: '))
#num = input('Enter the page offset (0 = page 1, 50 = page 2, 100 = page 3, and so on): ')

# The forum URL -- replace it as needed. Note that the URL here has been
# preprocessed so the page can be selected via the pn parameter.
#url = 'http://tieba.baidu.com/f?kw=' + name + '&ie=utf-8&pn=' + num
url = 'http://tieba.baidu.com/f?kw=%E6%9D%A8%E4%B8%9E%E7%90%B3&ie=utf-8&pn=100'

# Run the script
start()
————————————————————————————————————————
There are still plenty of problems; I'll keep studying and come back to improve it later. If you happen to pass by, please don't hesitate to share your advice. Many thanks.
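A side note on the commented-out prompt lines: even once they are enabled, start() still reads the module-level url global, which is easy to get wrong. Below is a minimal sketch of how the interactive version could look with the URL passed in explicitly. This is my own variant, not the original script; it assumes the findlink function defined above.
————————————————————————————————————————
import os
import urllib.parse

def start(url, folder='picture'):
    # take the URL as a parameter instead of relying on a global
    os.makedirs(folder, exist_ok=True)
    os.chdir(folder)
    findlink(url)  # findlink/download as defined above

name = urllib.parse.quote(input('Enter the tieba (forum) name: '))
num = input('Page offset (0 = page 1, 50 = page 2, 100 = page 3, ...): ')
start('http://tieba.baidu.com/f?kw=' + name + '&ie=utf-8&pn=' + num)
————————————————————————————————————————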
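One of those problems worth flagging concretely: a single unreachable image or thread currently raises an exception and kills the whole crawl. Here is a hedged sketch of a more forgiving download() that skips failures and keeps going; again my own variant, reusing url_open from above.
————————————————————————————————————————
import re
import urllib.request
import urllib.error

def download(urls):
    for link in urls:
        try:
            html = url_open(link).decode('utf-8')
        except (urllib.error.URLError, UnicodeDecodeError) as e:
            print('skipping thread %s: %s' % (link, e))
            continue
        for src in re.findall(r'<img class="BDE_Image" src="([^"]+\.jpg)', html):
            filename = src.split('/')[-1]
            try:
                urllib.request.urlretrieve(src, filename)
            except urllib.error.URLError as e:
                print('skipping image %s: %s' % (src, e))
————————————————————————————————————————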