很久以前，很喜欢泡贴吧，因为那里有自己牵挂的人和事。
一转眼过去好多年了......
一个简单的爬虫学习例子：爬取百度贴吧帖子的内容。
代码如下:
# -*- coding:utf-8 -*-
# Learning example: crawl a Baidu Tieba thread and print its text.
#
# The script runs on both Python 2.7 and Python 3: every print call takes a
# single argument (so it means the same thing as a statement or a function)
# and the urllib names are imported from whichever layout is available.
import re
from contextlib import closing

try:  # Python 2 layout
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3 layout
    from urllib.error import URLError
    from urllib.request import Request, urlopen


class Tool:
    """Helper that converts a fragment of post HTML into plain text."""

    # <img> tags and runs of seven consecutive spaces are dropped.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Opening and closing hyperlink tags are dropped (the link text stays).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that end a visual line become "\n".
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells become "\t".
    replaceTD = re.compile(r'<td>')
    # A paragraph start becomes "\n" plus a two-space indent.
    replacePara = re.compile(r'<p.*?>')
    # A single or doubled <br> becomes one "\n".
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever tags are left are removed.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with its HTML tags removed or replaced by whitespace.

        The substitutions are order-sensitive: the specific tags must be
        rewritten before ``removeExtraTag`` deletes every remaining tag.
        Leading and trailing whitespace is stripped from the result.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        # Two spaces, as the original comment on replacePara specifies; the
        # pasted listing had its whitespace collapsed to a single space.
        x = self.replacePara.sub("\n  ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()


class BDTB:
    """Crawler for a single Baidu Tieba thread."""

    # Compiled once here instead of on every method call.
    _titlePattern = re.compile(
        r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
    _pageNumPattern = re.compile(
        r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    _postPattern = re.compile(
        r'<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ):
        """Store the thread URL and the ``see_lz`` query value.

        baseUrl -- URL of the thread, without a query string.
        seeLZ   -- value sent as ``see_lz``; per the original author's note,
                   passing 1 restricts the thread to the original poster.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()

    def getPage(self, pageNum):
        """Download page *pageNum* and return it decoded as UTF-8 text.

        Returns None when the request fails with ``URLError``; the reason is
        printed when the exception carries one.
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            # closing() releases the connection on both Python 2 and 3.
            with closing(urlopen(Request(url))) as response:
                return response.read().decode('utf-8')
        except URLError as e:
            if hasattr(e, "reason"):
                print(u"%s %s" % (u"连接百度贴吧失败.错误原因", e.reason))
            return None

    @staticmethod
    def _firstGroup(pattern, page):
        """Return the stripped first group of *pattern* in *page*, or None.

        None is returned both when *page* is None (download failed) and when
        the pattern does not match.
        """
        if page is None:
            return None
        match = pattern.search(page)
        if match is None:
            return None
        return match.group(1).strip()

    def getTile(self):
        """Print the thread title and return it (None when unavailable)."""
        print('获取帖子标题开始')
        title = self._firstGroup(self._titlePattern, self.getPage(1))
        if title is not None:
            print(title)
        print('获取帖子标题结束')
        return title

    # Correctly spelled alias; getTile is kept for existing callers.
    getTitle = getTile

    def getPageNum(self):
        """Print and return the page count text scraped from page 1.

        The value is returned as the stripped string found in the page, or
        None when the page could not be fetched or the marker is missing.
        """
        print('获取帖子页数一共有多少页开始')
        pageNum = self._firstGroup(self._pageNumPattern, self.getPage(1))
        if pageNum is not None:
            print(pageNum)
        print('获取帖子页数一共有多少页结束')
        return pageNum

    def getPageContent(self, page):
        """Print the cleaned text of every post in *page*.

        page -- HTML text of one thread page, or None if the download failed.
        Returns the list of cleaned post texts (empty when *page* is None).
        """
        if page is None:
            return []
        posts = [self.tool.replace(item)
                 for item in self._postPattern.findall(page)]
        for post in posts:
            print(post)
        return posts

    def getAllContent(self):
        """Print the posts of every page of the thread, in page order."""
        print('获取正文内容开始------>')
        try:
            pageCount = int(self.getPageNum())
        except (TypeError, ValueError):
            # Page count missing (None) or not numeric: nothing to iterate.
            pageCount = 0
        for pageNum in range(1, pageCount + 1):
            # Use self rather than a module-level instance, so the method
            # works for any BDTB object.
            self.getPageContent(self.getPage(pageNum))
        print('<------获取正文内容结束')


baseURL = 'http://tieba.baidu.com/p/4452150954'

# Guarded so importing this module does not trigger network requests.
if __name__ == '__main__':
    bdtb = BDTB(baseURL, 1)
    bdtb.getTile()
    bdtb.getAllContent()
时间: 2024-10-10 16:38:57