爬贴吧小说。
爬取该链接中楼主发言的前10页，并另存为文本文件。
python2.7
# -*- coding: utf-8 -*-
"""Save the posts of a Baidu Tieba thread to a text file.

The script downloads the first ``PAGE_COUNT`` pages of the thread at
``BDTB.baseUrl``, strips links, images, ``.html`` URLs and ``<br>`` tags from
every post body, and writes the thread title followed by the cleaned posts to
``BDTB.outputPath``.

The code runs unchanged on Python 2.7 (the original target) and Python 3.
"""
from contextlib import closing
import io
import re

try:
    import urllib2  # Python 2.7
except ImportError:
    # Python 3 provides the same Request/urlopen API in urllib.request.
    import urllib.request as urllib2

# Number of thread pages to download, counted from page 1.
PAGE_COUNT = 10

# Thread title as it appears in a ``title="..."`` attribute.
_TITLE_RE = re.compile(u'title="【原创】(.*?)"')
# Body of one post; re.S lets a post span several lines.
_POST_RE = re.compile(
    r'd_post_content j_d_post_content "> (.*?)</div><br>', re.S)
# Markup removed from every post, applied in this order.
_ANCHOR_RE = re.compile(r'<a.*?>|</a>')
_IMAGE_RE = re.compile(r'<img.*?>')
_URL_RE = re.compile(r'http.*?\.html')  # dot escaped: only real ".html" URLs


def _cleanPost(post):
    """Return *post* without anchor tags, images, .html URLs and <br> tags."""
    for pattern in (_ANCHOR_RE, _IMAGE_RE, _URL_RE):
        post = pattern.sub('', post)
    return post.replace('<br>', '')


class BDTB(object):
    """Downloader for a single Baidu Tieba thread."""

    # Thread URL; the page number is appended to it.
    # NOTE(review): ``see_lz=`` carries no value here -- confirm whether the
    # "thread starter only" filter needs one (e.g. ``see_lz=1``).
    baseUrl = 'http://tieba.baidu.com/p/4896490947?see_lz=&pn='

    # File that receives the title and the cleaned posts.
    outputPath = 'text.txt'

    def getPage(self, pageNum):
        """Download page *pageNum* of the thread.

        Returns the page as text, or None when the download fails; the error
        is printed instead of raised so one bad page cannot stop a crawl.
        """
        url = self.baseUrl + str(pageNum)
        try:
            with closing(urllib2.urlopen(urllib2.Request(url))) as response:
                raw = response.read()
        except Exception as e:
            print(e)
            return None
        # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
        return raw.decode('utf-8', 'replace')

    def Title(self, pageNum):
        """Start the output file with the thread title found on *pageNum*.

        The file is truncated and rewritten, so call this once before any
        ``Text`` call. The file is left untouched when no title is found.

        Returns the list of title matches (empty if the page is unavailable).
        """
        html = self.getPage(pageNum)
        if html is None:
            return []
        items = _TITLE_RE.findall(html)
        if items:
            # The original rewrote the file once per match, so the last
            # match is the one that ended up in the file.
            with io.open(self.outputPath, 'w', encoding='utf-8') as f:
                f.write(u'标题' + u'\t' + items[-1])
        return items

    def Text(self, pageNum):
        """Append the cleaned posts of page *pageNum* to the output file.

        Returns the list of cleaned posts (empty if the page is unavailable).
        """
        html = self.getPage(pageNum)
        if html is None:
            return []
        posts = _POST_RE.findall(html)
        if pageNum == 1:
            # The first two posts of page 1 are skipped -- presumably they
            # are not part of the story text.
            posts = posts[2:]
        cleaned = [_cleanPost(post) for post in posts]
        if cleaned:
            with io.open(self.outputPath, 'a', encoding='utf-8') as f:
                for post in cleaned:
                    f.write(u'\n\n' + post)
        return cleaned


def crawl(bdtb, pageCount=PAGE_COUNT):
    """Write the thread title, then the posts of pages 1..pageCount.

    The title is written once, up front: ``Title`` truncates the output
    file, so calling it for every page would discard the earlier pages.
    """
    print('Crawl is starting....')
    bdtb.Title(1)
    for pageNum in range(1, pageCount + 1):
        print('Crawling Page %s...' % pageNum)
        bdtb.Text(pageNum)


if __name__ == '__main__':
    try:
        crawl(BDTB())
    except Exception as e:
        print(e)
时间: 2024-09-29 15:38:31