用 Python 爬取博客
by 伍雪颖
以爬王垠的博客为例:
# Crawl a blog index page: collect the link targets and link texts it
# contains, download every linked post and save it as "<title>.html" in the
# current working directory.
import re
from contextlib import closing

try:  # Python 2, which this script was originally written for
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

INDEX_URL = 'http://www.yinwang.org/'

# Number of leading regex matches to skip before titles and URLs line up with
# actual posts. These are the offsets the original script hard-coded.
# NOTE(review): presumably they skip navigation/stylesheet entries of the
# index page; they depend on the page layout -- confirm against the live page.
TITLE_SKIP = 3
URL_SKIP = 8

# Compiled once at import time; the patterns are unchanged from the original.
HREF_RE = re.compile(r'href="(.+?)"')
ANCHOR_TEXT_RE = re.compile(r'">(.+?)</a>')


def getHtmlCode(url):
    """Return the raw response body fetched from *url*.

    The response object is closed before returning. Any error raised by
    ``urlopen`` propagates to the caller.
    """
    # closing() is used because the Python 2 response object is not a
    # context manager on its own.
    with closing(urlopen(url)) as response:
        return response.read()


def toText(raw):
    """Return *raw* as a native ``str`` so the ``str`` regexes can search it.

    On Python 3 the response body is ``bytes`` and must be decoded; on
    Python 2 it already is ``str`` and is returned untouched.
    """
    if isinstance(raw, str):
        return raw
    # NOTE(review): assumes the page is UTF-8; undecodable bytes are replaced
    # rather than aborting the crawl.
    return raw.decode('utf-8', 'replace')


def findTitleUrl(htmlString):
    """Return every ``href="..."`` attribute value found in *htmlString*."""
    return HREF_RE.findall(htmlString)


def findTitleContent(htmlString):
    """Return the text of every ``">...</a>`` span found in *htmlString*."""
    return ANCHOR_TEXT_RE.findall(htmlString)


def pairTitlesWithUrls(titles, urls, titleSkip=TITLE_SKIP, urlSkip=URL_SKIP):
    """Return ``(title, url)`` pairs after dropping the leading non-post matches.

    Pairing stops at the shorter of the two remaining lists. The original
    loop indexed ``titles[i + 3]`` / ``urls[i + 8]`` for every ``i`` in
    ``range(len(urls))`` and therefore always ended with an ``IndexError``.
    """
    return list(zip(titles[titleSkip:], urls[urlSkip:]))


def main(indexUrl=INDEX_URL):
    """Download every post linked from *indexUrl* into the current directory."""
    indexHtml = toText(getHtmlCode(indexUrl))
    titles = findTitleContent(indexHtml)
    urls = findTitleUrl(indexHtml)
    for title, url in pairTitlesWithUrls(titles, urls):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(title)
        print(url)
        page = getHtmlCode(url)
        # A "/" in the title would be interpreted as a directory separator
        # and make open() fail, so it is replaced.
        fileName = '%s.html' % title.replace('/', '_')
        # "with" guarantees the file is closed; the original wrote `f.close`
        # without parentheses, which never called the method.
        with open(fileName, 'wb') as outFile:
            outFile.write(page)


if __name__ == '__main__':
    main()
时间: 2024-08-05 08:40:28