# -*- coding:utf-8 -*-
# 1. Extract all jokes from a single page
"""Fetch one page of the Qiushibaike "hot" listing and print every joke on it.

For each entry matched on the page the script prints the author, the joke
text, the like count and the comment count.  The code runs unchanged on
Python 2 (urllib2) and Python 3 (urllib.request / urllib.error).
"""
import contextlib
import re
import urllib  # imported by the original script; not used below

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3: urllib2 was split into two modules
    from urllib.error import URLError
    from urllib.request import Request, urlopen

page = 1
url = 'http://www.qiushibaike.com/hot/page/' + str(page)
user_agent = 'haha/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Four capture groups, in order: the <h2> text, the text of the first <span>
# that follows it, and the text right after each of the next two `number">`
# occurrences.  re.S lets `.` cross newlines, because one entry spans many
# lines of HTML.
pattern = re.compile(
    'h2>(.*?)</h2.*?<span>(.*?)</.*?number">(.*?)</.*?number">(.*?)<', re.S)


def _fetch_content():
    """Download ``url`` with the custom headers and return it decoded as UTF-8."""
    request = Request(url, headers=headers)
    # closing() releases the connection on both Python 2 and 3; the Python 2
    # response object is not a context manager by itself.
    with contextlib.closing(urlopen(request)) as response:
        return response.read().decode('utf-8')


def _print_items(content):
    """Print author, text, like count and comment count for each match."""
    for item in re.findall(pattern, content):
        # Single parenthesised argument: valid as a Python 2 print statement
        # and as a Python 3 print() call.
        print(u"----------------------------------------\n发布人:%s内容:%s赞:%s\t评论数:%s\n" % (item[0], item[1], item[2], item[3]))


def main():
    """Fetch the configured page and print its jokes; report URL errors."""
    try:
        content = _fetch_content()
    except URLError as e:
        # HTTPError carries a status code; a plain URLError only has a reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return
    _print_items(content)


if __name__ == "__main__":
    main()
# Time: 2024-11-09 04:53:44