原文地址:运用 Python 抓取博客园首页的所有数据,并定时持续抓取新发布的内容存入 MongoDB 中
# -*- coding: utf-8 -*-
"""Fetch new posts from the cnblogs.com site home and store them in MongoDB.

Dependencies: 1. jieba  2. pymongo  3. HTMLParser (standard library).

Each run walks http://www.cnblogs.com/sitehome/p/<page>, keeps the posts whose
numeric id is greater than the ``maxId`` saved by the previous run (document
``{"type": "cnblogs"}`` in ``blog.record``) and inserts them into
``blog.fetch_blog``.

The script makes a single pass and exits.  On Linux or macOS, schedule it with
``crontab -e``; on Windows, add your own timer around the program.

@author: jiangfuqiang
"""
import re
import sys
import time
import traceback
from datetime import date

try:
    # Python 2 module names, as used by the original script.
    from HTMLParser import HTMLParser
    import urllib2
except ImportError:
    # Python 3 locations of the same APIs.
    from html.parser import HTMLParser
    import urllib.request as urllib2

default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    # Only reachable on Python 2, where the default is ASCII and `reload` is
    # a builtin; Python 3 already reports utf-8 so this branch is skipped.
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Module-level stop flag: set once a post that is already stored is seen.
isExist = False

_NUMBER_RE = re.compile(r'\d+')
_PAGE_URL = "http://www.cnblogs.com/sitehome/p/%d"
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                  'rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
_PAGE_DELAY = 5       # seconds to wait between requests
_HTTP_TIMEOUT = 30    # seconds before a request is abandoned
_MAX_FAILURES = 3     # consecutive fetch/parse errors tolerated before giving up


class FetchCnblog(HTMLParser):
    """Collect the posts of one listing page that are newer than a given id.

    Args:
        id: Highest post id that is already stored; posts whose id is less
            than or equal to it are discarded.
        tokenizer: Optional callable mapping the title to an iterable of
            keyword strings.  Defaults to ``jieba.cut``, imported on first
            use so the parser can be loaded without jieba installed.

    After ``feed()``, ``getResult()`` returns the collected post dicts in page
    order and ``isExist`` tells whether an already-stored post was seen.
    """

    def __init__(self, id, tokenizer=None):
        HTMLParser.__init__(self)
        self.result = []
        self.data = {}            # fields of the post currently being parsed
        self.id = id
        self.tokenizer = tokenizer
        self.isExist = False      # True once a post with id <= self.id is seen
        self.isTitleLink = False
        self.isSummary = False
        self.isPostItem = False
        self.isArticleView = False

    def handle_data(self, data):
        """Capture the title text and the summary text of the current post."""
        if self.isTitleLink and self.isPostItem:
            self.data['title'] = data
            self.isTitleLink = False
        elif self.isSummary and self.isPostItem:
            data = data.strip()
            if data:
                self.data['desc'] = data

    def handle_starttag(self, tag, attrs):
        """Track which part of a post item the parser is currently inside."""
        attributes = dict(attrs)
        css_class = attributes.get('class')
        if tag == 'a':
            if css_class == 'titlelnk':
                self.isTitleLink = True
            elif css_class == 'gray' and self.isArticleView:
                # The "view" link is the last element of a post item, so the
                # post is finalized here.
                self.isArticleView = False
                if 'href' in attributes:
                    self._finishPost(attributes['href'] or '')
        elif tag == 'p':
            if css_class == 'post_item_summary':
                self.isSummary = True
        elif tag == 'img':
            if css_class == 'pfs' and 'src' in attributes:
                self.data['imgSrc'] = attributes['src']
        elif tag == 'div':
            if css_class == 'post_item_foot':
                self.isSummary = False
            elif css_class == 'post_item':
                self.isPostItem = True
        elif tag == 'span':
            if css_class == 'article_view':
                self.isArticleView = True

    def _finishPost(self, href):
        """Complete the current post from its link and keep it if it is new."""
        global isExist
        self.data['readmoreLink'] = href
        self.isPostItem = False
        # NOTE(review): assumes the post id is the last number in the link;
        # taking the first one would pick up digits in a user name or an
        # archive date that precede it - confirm against live markup.
        numbers = _NUMBER_RE.findall(href)
        if not numbers:
            self.data = {}
            return
        post_id = int(numbers[-1])
        if post_id <= self.id:
            # Already stored: drop it and signal that crawling can stop.
            self.data = {}
            self.isExist = True
            isExist = True
            return
        self.data['id'] = post_id
        # 'srouce' (sic) is the key existing documents use; left unchanged.
        self.data['srouce'] = "www.cnblogs.com"
        self.data['source_key'] = 'cnblogs'
        self.data['fetchTime'] = str(date.today())
        self.data['keyword'] = ",".join(
            self._tokenize(self.data.get('title', '')))
        self.result.append(self.data)
        self.data = {}

    def _tokenize(self, text):
        """Split *text* into keywords with the configured tokenizer."""
        if self.tokenizer is None:
            import jieba
            self.tokenizer = jieba.cut
        return self.tokenizer(text)

    def getResult(self):
        """Return the list of post dicts collected so far."""
        return self.result


def _fetchPage(page):
    """Download listing page number *page* and return it as text."""
    req = urllib2.Request(_PAGE_URL % page, headers=_HEADERS)
    response = urllib2.urlopen(req, timeout=_HTTP_TIMEOUT)
    try:
        return response.read().decode(default_encoding, 'replace')
    finally:
        response.close()


def main():
    """Crawl listing pages until an already-stored post or an empty page."""
    global isExist
    import pymongo

    con = pymongo.MongoClient('localhost', 27017)
    db = con.blog
    fetchblog = db.fetch_blog
    record = db.record

    reco = record.find_one({"type": 'cnblogs'})
    # Threshold for the whole run.  It must not change while paging, or
    # every page after the first would look entirely old.
    last_id = reco['maxId'] if reco else 0

    count = 1
    failures = 0
    flag = False  # becomes True once maxId has been saved for this run
    while not isExist:
        try:
            fj = FetchCnblog(last_id)
            fj.feed(_fetchPage(count))
            result = fj.getResult()
        except Exception as e:
            traceback.print_exc()
            print("parse error %s" % (e,))
            failures += 1
            if failures >= _MAX_FAILURES:
                # Give up instead of retrying the same page forever.
                break
            time.sleep(_PAGE_DELAY)
            continue
        failures = 0

        if not result:
            isExist = True
            continue

        if not flag:
            flag = True
            newest = max(int(doc['id']) for doc in result)
            record.update_one({"type": 'cnblogs'},
                              {"$set": {'maxId': newest}}, upsert=True)
        # Pages list newest first; insert oldest first.
        for doc in reversed(result):
            fetchblog.insert_one(doc)
        print("page is %d" % count)
        count += 1
        if not isExist:
            time.sleep(_PAGE_DELAY)


if __name__ == "__main__":
    main()
时间: 2024-10-12 17:51:39