# -*- coding:utf-8 -*-
"""Download every picture posted in a Baidu Tieba thread.

Run as a script: it asks for the thread id, walks each page of the thread
and saves every ``BDE_Image`` picture it finds as a ``.jpg`` file.

The code is written to run on both Python 2 and Python 3.
"""
import os
import re
from contextlib import closing

try:  # Python 2
    import urllib2
except ImportError:  # Python 3: urllib.request offers Request/urlopen/URLError
    import urllib.request as urllib2

try:  # Python 2
    read_input = raw_input
except NameError:  # Python 3 renamed raw_input() to input()
    read_input = input


class TP(object):
    """Crawler for one thread.

    Attributes:
        baseURL: thread address without a query string,
            e.g. ``http://tieba.baidu.com/p/5307547413``.
        saveDir: directory the downloaded pictures are written to.
    """

    # Headers sent with every picture request.
    HEADERS = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
    }

    # Captures the text of the second <span> inside the "l_reply_num" item;
    # start() treats that text as the number of pages.
    PAGE_COUNT_RE = re.compile(
        '<li class="l_reply_num" .*?</span>.*?<span.*?>(.*?)</span>', re.S)

    # Captures the picture address without its ".jpg" suffix.  [^"] keeps the
    # match inside the src attribute and the dot is escaped on purpose.
    IMAGE_RE = re.compile(r'<img class="BDE_Image" src="([^"]*?)\.jpg"')

    def __init__(self, baseUrl, saveDir='.'):
        """Remember the thread address and where pictures are stored.

        Args:
            baseUrl: thread address without the ``?pn=`` query.
            saveDir: target directory for pictures; defaults to the current
                directory, which is where the files were always written.
        """
        self.baseURL = baseUrl
        self.saveDir = saveDir

    def getPage(self, pageNum):
        """Fetch page ``pageNum`` of the thread.

        Returns:
            The page decoded as UTF-8 text, or ``None`` when the request
            fails with ``URLError`` (the reason is printed).
        """
        url = self.baseURL + '?pn=' + str(pageNum)
        try:
            # closing() releases the connection on Python 2 as well, where
            # the response object is not a context manager.
            with closing(urllib2.urlopen(urllib2.Request(url))) as response:
                return response.read().decode('UTF-8')
        except urllib2.URLError as e:
            print(u'错误 {0}'.format(getattr(e, 'reason', e)))
            return None

    def getPageNum(self, page):
        """Extract the page count from the HTML of a thread page.

        Args:
            page: HTML text of a thread page.  When ``None``, page 1 is
                fetched first, so callers that relied on this method doing
                the download itself keep working.

        Returns:
            The page count as a stripped string, or ``None`` when the page
            could not be fetched or holds no page counter.
        """
        if page is None:
            page = self.getPage(1)
        if page is None:
            return None
        found = self.PAGE_COUNT_RE.search(page)
        if found is None:
            return None
        return found.group(1).strip()

    def getContent(self, html):
        """Download every ``BDE_Image`` picture referenced in ``html``.

        Args:
            html: HTML text of a thread page; ``None`` or empty text is
                accepted and yields no downloads.

        Returns:
            List with the path of each file written, in page order.

        Raises:
            IOError: when a download or a file write fails (the exceptions
                raised by ``requests`` derive from it).
        """
        if not html:
            return []
        saved = []
        for imgUrl in self.IMAGE_RE.findall(html):
            # The file is named after the last segment of the address.
            name = imgUrl.split('/')[-1]
            path = os.path.join(self.saveDir, name + '.jpg')
            self._saveImage(imgUrl + '.jpg', path)
            saved.append(path)
        return saved

    def _saveImage(self, imgUrl, path):
        """Download ``imgUrl`` and write its bytes to ``path``."""
        # Imported here so that fetching and parsing pages does not require
        # the third-party package.
        import requests

        response = requests.get(imgUrl, headers=self.HEADERS)
        # Do not store an HTTP error body under a .jpg name.
        response.raise_for_status()
        folder = os.path.dirname(path)
        if folder and not os.path.isdir(folder):
            os.makedirs(folder)
        with open(path, 'wb') as f:
            f.write(response.content)

    def start(self):
        """Crawl the whole thread, printing progress as it goes."""
        indexPage = self.getPage(1)
        pageN = self.getPageNum(indexPage)
        if pageN is None:
            print("URL error")
            return
        try:
            total = int(pageN)
        except ValueError:
            # The counter matched something that is not a number.
            print("URL error")
            return
        try:
            print(u'该帖子有{0}页!'.format(pageN))
            for i in range(1, total + 1):
                print(u'正在读入第{0}页数据'.format(i))
                # Page 1 is already in hand; do not download it twice.
                if i == 1 and indexPage is not None:
                    page = indexPage
                else:
                    page = self.getPage(i)
                try:
                    self.getContent(page)
                except IOError as e:
                    # One broken page must not abort the remaining ones.
                    print(u'第{0}页数据写入失败: {1}'.format(i, e))
        finally:
            print(u'爬取任务完成^_^')


def main():
    """Ask for the thread id on stdin and crawl that thread."""
    print(u'请写入帖子号码')
    threadId = str(read_input(u'http://tieba.baidu.com/p/')).strip()
    TP('http://tieba.baidu.com/p/' + threadId).start()


if __name__ == '__main__':
    main()
问题尚未解决:无法翻页,图片也下载不下来。明天再检查语法,仔细梳理逻辑。
时间: 2024-10-13 07:57:58