# encoding=utf-8
"""Collect link targets from the WooYun ``corps`` listing pages.

Run as a script, this fetches ``http://www.wooyun.org/corps/page/<n>`` for
each page number, pulls the target of every absolute ``href`` out of the
returned HTML and writes the collected values to ``site.text``, one per line.
"""
import re

# Capture everything after the scheme in absolute http:// and https:// links.
_HTTP_LINK = re.compile(r'href="http://(.*?)"')
_HTTPS_LINK = re.compile(r'href="https://(.*?)"')


class getUrl(object):
    """Crawler that gathers link targets from the ``corps`` listing pages."""

    # Seconds to wait for a single page; without a timeout a stalled
    # connection would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, num):
        """Store the page count and build the request headers.

        :param num: number of listing pages to fetch, starting at page 1.
        """
        self.totle = num
        # Request header information sent with every page fetch.
        # 'Connection' must not carry leading whitespace: requests rejects
        # such header values as invalid.
        self.myheader = {
            'Host': 'www.wooyun.org',
            'Connection': 'keep-alive',
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36'
            ),
            'Accept': '*/*',
            'Referer': 'http://www.wooyun.com/',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
        }

    @staticmethod
    def _extract_sites(html):
        """Return the href targets found in *html*, scheme removed.

        All ``http://`` targets come first, followed by all ``https://``
        targets, each group in document order.

        NOTE: an earlier follow-up pattern, ``(!www.)(.*?)``, overwrote the
        http results with a list of tuples, which lost those links and made
        the file write fail. It is intentionally not applied here; targets
        are kept exactly as captured (including any ``www.`` prefix).

        :param html: page source as text.
        :return: list of strings, possibly empty.
        """
        return _HTTP_LINK.findall(html) + _HTTPS_LINK.findall(html)

    def beginer(self):
        """Fetch pages 1..``self.totle`` and write all links to ``site.text``.

        :raises requests.RequestException: if a page cannot be fetched
            (including when the timeout expires).
        """
        # Third-party dependency, imported here so the parsing and
        # file-writing helpers stay importable without it.
        import requests

        print('get start')
        urlliset = []
        for page in range(1, self.totle + 1):
            url = 'http://www.wooyun.org/corps/page/' + str(page)
            r = requests.get(
                url, headers=self.myheader, timeout=self.REQUEST_TIMEOUT
            )
            urlliset.extend(self._extract_sites(r.text))
        self.writeQQ(text=urlliset, file_dir='site.text', mode='w')

    def writeQQ(self, text, file_dir, mode):
        """Write each item of *text* to *file_dir* on its own line.

        :param text: iterable of strings to write.
        :param file_dir: path of the output file.
        :param mode: mode string passed to ``open`` (e.g. ``'w'`` or ``'a'``).
        """
        with open(file_dir, mode) as f:
            f.writelines(site + "\n" for site in text)


if __name__ == '__main__':
    spidre = getUrl(44)
    spidre.beginer()
# Time: 2024-12-20 17:53:01