https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1
# -*- coding:utf-8 -*-
"""Fetch a Baidu Tieba thread page and print the thread title.

Example thread ("Purely original: my top 50 active NBA players of the
2014-2015 season"):

    https://tieba.baidu.com/p/3138733512?see_lz=1&pn=1

How that URL breaks down:

* ``https://``        -- the resource is transferred over HTTP(S).
* ``tieba.baidu.com`` -- a second-level domain of Baidu that points at the
  Tieba servers.
* ``/p/3138733512``   -- a resource on that server, i.e. the locator of this
  particular thread.
* ``see_lz``, ``pn``  -- two query parameters: "only show the original
  poster" (1 means the filter is on) and the page number.
"""
import re
import urllib  # noqa: F401 -- imported (unused) by the original script; kept

try:  # Python 2, which the script was originally written for
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3: same API, relocated modules
    from urllib.error import URLError
    from urllib.request import Request, urlopen


class BDTB:
    """Minimal client for a single Baidu Tieba thread."""

    # Captures the inner text of the <h3> element that carries the thread
    # title; re.S lets ``.`` span newlines inside the tag.
    _TITLE_PATTERN = re.compile(
        '<h3.*?class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>',
        re.S,
    )

    def __init__(self, baseUrl, seeLz):
        """Store the thread URL and build the request pieces.

        baseUrl -- thread URL without a query string.
        seeLz   -- value for the ``see_lz`` query parameter (converted with
                   ``str``); 1 asks for the original poster's posts only.
        """
        self.baseurl = baseUrl
        self.seelz = '?see_lz=' + str(seeLz)
        self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        # Headers sent with every request.
        self.headers = {'User-Agent': self.user_agent}

    def getPage(self, pageNum):
        """Return the UTF-8 decoded HTML of page ``pageNum``.

        Returns None (after printing the reason, when one is available) if
        the request fails with ``URLError``.
        """
        url = self.baseurl + self.seelz + '&pn=' + str(pageNum)
        try:
            response = urlopen(Request(url, headers=self.headers))
            try:
                return response.read().decode('utf-8')
            finally:
                # Always release the connection, even if read/decode raises.
                response.close()
        except URLError as e:
            if hasattr(e, "reason"):
                # Single-argument print so the output is the same
                # "<message> <reason>" line on both Python 2 and Python 3.
                message = u"百度贴吧链接失败,错误原因是:"
                print(u"%s %s" % (message, e.reason))
            return None

    def getTitle(self):
        """Print the thread title and return it.

        Fetches page 1 and extracts the title with a regular expression.
        Prints a failure message and returns None when the page could not be
        loaded; prints "None" and returns None when no title is found.
        """
        page = self.getPage(1)
        if not page:
            print("页面加载失败...")
            return None
        match = self._TITLE_PATTERN.search(page)
        if match is None:
            print("None")
            return None
        title = match.group(1).strip()
        print(title)
        return title


def main():
    """Print the title of the example thread."""
    baseURL = 'https://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseURL, 1)
    # getTitle() fetches page 1 itself, so no separate getPage(1) call is
    # needed beforehand (the original issued one and discarded the result).
    bdtb.getTitle()


# Guarded so that importing this module does not hit the network.
if __name__ == "__main__":
    main()
时间: 2024-10-19 21:23:58