使用 CSS 选择器的时候还有点生疏。
# Scraper that walks a biquge.info listing page, follows each linked book
# page, and appends every linked chapter page's text to one file per book.
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

INDEX_URL = 'http://www.biquge.info/list/1_1.html'
REQUEST_TIMEOUT = 10  # seconds; avoids hanging forever on a stalled connection

# Characters that cannot appear in a Windows file name.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def _fetch_soup(url):
    """Download *url* and return the page parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # Pass raw bytes so BeautifulSoup detects the page encoding itself.
    return BeautifulSoup(response.content, 'lxml')


def _safe_filename(name):
    """Return *name* with characters illegal in file names replaced by '_'."""
    return _UNSAFE_FILENAME_CHARS.sub('_', name).strip()


def get_url_list(url):
    """Return the absolute URLs of all links matched by '#list dl dd a'.

    Args:
        url: Address of the page to scan (presumably a book's table of
            contents -- confirm against the site layout).

    Returns:
        A list of absolute URLs, in document order. Anchors without an
        ``href`` attribute are skipped.
    """
    soup = _fetch_soup(url)
    return [
        # urljoin copes with relative, root-relative and absolute hrefs,
        # unlike plain string concatenation.
        urljoin(url, link['href'])
        for link in soup.select('#list dl dd a')
        if link.get('href')
    ]


def get_date(url, title1, output_dir='F:\\'):
    """Download one page and append its heading and body text to a file.

    The text is appended to ``<output_dir>/<title1>.txt`` (UTF-8), so calling
    this repeatedly with the same *title1* accumulates pages in one file.

    Args:
        url: Address of the page to download.
        title1: Base name of the output file (the book title in ``main``).
        output_dir: Directory that receives the file. Defaults to the
            ``F:\\`` drive root used by the original script.

    Pages lacking the expected heading or ``#content`` element are reported
    on stdout and skipped instead of raising.
    """
    soup = _fetch_soup(url)
    heading = soup.select_one('.content_read .box_con .bookname h1')
    body = soup.select_one('#content')
    if heading is None or body is None:
        print('skipped (unexpected page layout): ' + url)
        return

    # Turn <br> tags into real newlines before extracting the text so that
    # paragraph breaks survive.
    for line_break in body.find_all('br'):
        line_break.replace_with('\n')

    title = heading.get_text()
    text = body.get_text()

    path = os.path.join(output_dir, _safe_filename(title1) + '.txt')
    with open(path, 'a', encoding='utf-8') as out_file:
        out_file.write(title + '\n\n' + text + '\n')
    print(title)


def main():
    """Walk the listing page and save every linked book chapter by chapter."""
    index = _fetch_soup(INDEX_URL)
    for book_link in index.select('#newscontent .l ul li .s2 a'):
        href = book_link.get('href')
        if not href:
            continue
        book_url = urljoin(INDEX_URL, href)
        book_title = book_link.get_text()

        chapter_urls = get_url_list(book_url)
        if not chapter_urls:
            # Nothing to download for this book.
            continue
        print(chapter_urls[0])
        for chapter_url in chapter_urls:
            get_date(chapter_url, book_title)


if __name__ == "__main__":
    main()
原文地址:https://www.cnblogs.com/kangdong/p/8629774.html
时间: 2024-11-02 07:08:24