废话不多说，直接贴代码，主要使用 BeautifulSoup 编写。
# -*- coding: utf-8 -*-
"""
Created on Mon May 18 19:12:06 2015
@author: Administrator
"""
import urllib
import os
from bs4 import BeautifulSoup
import sys
# Python 2 only: reloading sys makes sys.setdefaultencoding reachable again
# (it is normally removed at interpreter start-up), and the next line makes
# UTF-8 the default encoding for implicit str/unicode conversions.
reload(sys)
sys.setdefaultencoding("utf-8")
# Count of list entries handled so far by gethref(); used to number titles.
i = 0
# Index into list_a of the article whose body gettext() writes next.
j = 0
# Numbered article titles collected by gethref(); gettext() uses each one as
# the base name of an output text file.
list_a = []
def gettext(href):
    """Download one article page and append its paragraphs to a text file.

    The page at ``href`` is fetched and parsed as GB18030. Every ``<p>``
    inside the first ``<div class="content">`` is appended to
    ``"<title>.txt"``, where the title is ``list_a[j]``. The module-level
    counter ``j`` is then advanced by one so the next call uses the next
    title.

    Args:
        href: URL of the article page.

    Raises:
        IndexError: if the page has no ``div.content`` element, or if
            ``list_a`` has no entry at index ``j``.
        IOError: if the page cannot be fetched or the file cannot be written.
    """
    global j
    page = urllib.urlopen(href).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    content_divs = soup.find_all("div", class_="content")
    paragraphs = content_divs[0].find_all("p")
    if paragraphs:
        # Open the output file once and close it deterministically instead
        # of reopening (and leaking) a handle for every paragraph.
        # NOTE(review): the title is used verbatim as a file name; titles
        # containing path separators would need sanitising.
        with open("%s.txt" % list_a[j], "a") as fp:
            for p in paragraphs:
                # Writing the unicode text to a byte file presumably relies
                # on the UTF-8 default encoding set at module level (Python 2).
                fp.write(' ')
                fp.write(p.get_text())
                fp.write(" \n")
    j += 1
def gethref(url):
    """Collect every article link on the index page and save the articles.

    The page at ``url`` is fetched and parsed as GB18030. For each ``<li>``
    in the first ``<ul class="row1">`` the function:

    * appends the numbered title to the module-level ``list_a``,
    * writes the number, title, summary and link to ``AllTitle.txt``,
    * increments the module-level counter ``i``,
    * calls ``gettext`` with the link to save the article body.

    Args:
        url: URL of the index page listing the articles.

    Raises:
        IndexError: if the page has no ``ul.row1`` element.
        AttributeError: if a list item lacks an ``<h3>``, ``<p>`` or ``<a>``.
        IOError: if the page cannot be fetched or the file cannot be written.
    """
    global i
    page = urllib.urlopen(url).read()
    soup = BeautifulSoup(page, from_encoding="gb18030")
    row_lists = soup.find_all("ul", class_="row1")
    items = row_lists[0].find_all("li")
    # The index file is opened only after the page was fetched and parsed, so
    # a failed download does not truncate an existing AllTitle.txt; the
    # with-block guarantees the handle is closed even if an article fails.
    with open("AllTitle.txt", "w+") as fp:
        for lia in items:
            title = lia.h3.get_text()
            href = lia.a.get('href')
            number = "%s、" % (i + 1)
            list_a.append(number + title)
            # Write the title, summary and link to the file in a fixed layout.
            fp.write(number)
            i += 1
            fp.write("标题:")
            fp.write(title)
            fp.write("\n 简介:")
            fp.write(lia.p.get_text())
            fp.write("\n 链接:")
            fp.write(href)
            fp.write("\n")
            gettext(href)
# Script entry point: crawl the fixed index page, then report completion.
if __name__ == "__main__":
    url = "http://re.qq.com/biznext/zkht.htm"
    gethref(url)
    print("All Is OK!")