Fetching China's higher-weight (more authoritative) websites
# encoding=utf-8
import re
import requests
from bs4 import BeautifulSoup


class getUrl(object):
    def __init__(self, num):
        self.totle = num  # kept from the original script; not used below
        # request headers sent with every page fetch
        self.myheader = {
            'Host': 'top.chinaz.com',
            'Connection': 'keep-alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
            'Accept': '*/*',
            'Referer': 'http://www.chinaz.com/',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }

    def beginer(self):
        print('get start')
        page = 2
        urlliset = []
        # walk the paginated ranking: index_2.html ... index_1679.html
        while page < 1680:
            url = 'http://top.chinaz.com/all/index_' + str(page) + '.html'
            r = requests.get(url, headers=self.myheader)
            soup = BeautifulSoup(r.text, 'html.parser')
            spans = soup.select('.col-gray')
            # pull the inner text of each matched span (the listed domain)
            site = re.findall('<span.*?>(.*?)</span>', str(spans))
            del site[0]  # drop the first match, which is not a domain entry
            for elem in site:
                urlliset.append(elem)
            page += 1
        self.writeQQ(text=urlliset, file_dir='site.text', mode='w')

    def writeQQ(self, text, file_dir, mode):
        # write one domain per line to the output file
        with open(file_dir, mode) as f:
            for site in text:
                f.write(site)
                f.write("\n")


spidre = getUrl(44)
spidre.beginer()
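If the page structure has not changed, the regex over the stringified tag list can be avoided: BeautifulSoup already exposes the text of each matched element. The helper below is a minimal sketch, not part of the original script; it assumes the domains still sit inside span elements with class col-gray on top.chinaz.com, and the function name fetch_page_domains is hypothetical.

import requests
from bs4 import BeautifulSoup

def fetch_page_domains(page, headers):
    """Return the domain strings listed on one ranking page (assumed layout)."""
    url = 'http://top.chinaz.com/all/index_' + str(page) + '.html'
    r = requests.get(url, headers=headers, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    # get_text() yields the text inside each matched span, e.g. "www.baidu.com"
    return [span.get_text(strip=True) for span in soup.select('.col-gray')]

# Example usage, reusing the header dict from the class above:
# domains = fetch_page_domains(2, spidre.myheader)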