# 爬取网页的部分链接 (Crawl a subset of the links on a web page)
#!/usr/bin/python
# -*- coding: utf-8 -*-
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import random
# Hrefs discovered so far. Shared by every getlink() call so that a page which
# has already been seen is never fetched a second time.
pages = set()

# Matches hrefs that start with "/m/"; compiled once instead of on every page.
_LINK_PATTERN = re.compile(r"^(/m/)")


def _fetch_links(pageurl):
    """Download one page and return the hrefs of its ``/m/`` anchors.

    Args:
        pageurl: Path appended to ``http://www.ftchinese.com``.

    Returns:
        A list of href strings, in the order ``find_all`` reports them.

    Raises:
        Whatever ``urlopen`` raises when the request fails; nothing is
        caught here.
    """
    # The context manager closes the HTTP response once parsing is done
    # instead of leaving the connection open for the rest of the crawl.
    with urlopen('http://www.ftchinese.com' + pageurl) as html:
        bs_data = BeautifulSoup(html, 'lxml')
    return [
        link.attrs['href']
        for link in bs_data.find_all('a', href=_LINK_PATTERN)
        if 'href' in link.attrs
    ]


def getlink(pageurl):
    """Crawl depth-first from *pageurl*, printing each newly found link.

    Every href beginning with ``/m/`` that is not yet in the module-level
    ``pages`` set is printed, added to ``pages``, and then crawled itself
    before the remaining links of the current page are examined.

    Args:
        pageurl: Path appended to ``http://www.ftchinese.com``; the empty
            string requests the base URL itself.

    Raises:
        Whatever ``urlopen`` raises when a request fails; the crawl stops at
        the first failing page.
    """
    # An explicit stack of per-page link iterators replaces the recursive
    # call. The visiting and printing order is the same as the recursive
    # version, but a long chain of new pages can no longer exhaust the
    # interpreter's recursion limit.
    pending = [iter(_fetch_links(pageurl))]
    while pending:
        for newpage in pending[-1]:
            if newpage not in pages:
                # We have found a new page: descend into it right away and
                # resume the current page's remaining links afterwards.
                print(newpage)
                pages.add(newpage)
                pending.append(iter(_fetch_links(newpage)))
                break
        else:
            # Every link on the innermost page has been handled.
            pending.pop()
# Start the crawl with an empty path, which getlink appends to its base URL.
getlink("")
# 时间: 2024-11-22 19:52:17