# python3.6
"""Scrape poems from gushiwen (so.gushiwen.org) into a single Markdown file.

The script asks for a category page URL and a file name, collects the poem
links found on that page, downloads every poem, tidies the text and writes
title/poem pairs to ``<OUTPUT_DIR><file name>.md``.
"""
import os  # noqa: F401  (kept from the original import list)
import re
import sys
from urllib.parse import urljoin

# The third-party packages (requests, bs4, html5lib) are imported inside the
# functions that actually touch the network, so the pure text helpers in this
# module can be imported and tested without them being installed.

BASE_URL = "https://so.gushiwen.org/"
# Scanning a category page stops at the first link to one of these pages
# (Tang poems 300 / primary-school classical texts).
STOP_HREFS = ("/gushi/tangshi.aspx", "/gushi/xiaowen.aspx")
# The first anchors of a category page are skipped.
# NOTE(review): presumably site navigation links -- confirm against the page.
LEADING_LINKS_TO_SKIP = 8
OUTPUT_DIR = 'D:\\pythonPROJECT\\'  # storage directory
REQUEST_TIMEOUT = 10  # seconds; without a timeout a stalled server hangs the run

# ASCII and full-width variants are both accepted.
_PARENTHESISED = re.compile(r"\(.*?\)|（.*?）")
_WHITESPACE = re.compile(r"\s+")
_LINE_END_PUNCT = re.compile(r"([。!?:;！？：；])")
_QUOTE_MARKS = re.compile(r"[「」《》]")


def _strip_parenthesised(text):
    """Remove every parenthesised span (brackets included) from *text*."""
    return _PARENTHESISED.sub("", text)


def _fetch_soup(url):
    """Download *url* and return it parsed with the ``html5lib`` parser.

    Raises:
        requests.RequestException: on connection errors, timeouts or an
            HTTP error status.
    """
    import requests
    from bs4 import BeautifulSoup
    import html5lib  # noqa: F401  (parser backend requested by name below)

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')


def _extract_poem(soup):
    """Pull the poem text out of a parsed poem page and format it.

    The poem is located purely by position (10th ``div`` of the page, then
    its 5th nested ``div``).
    NOTE(review): these indexes are tied to the site layout and are brittle.

    Raises:
        IndexError: if the page does not have the expected ``div`` layout.
    """
    container = soup.find_all('div')[9]
    raw = str(container.find_all('div')[4].text)
    compact = _WHITESPACE.sub('', raw).strip()  # drop spaces and line breaks
    # Parenthesised annotations are removed together with their brackets.
    return formattext(_strip_parenthesised(compact))


def content(soup):
    """Collect poem titles and links from a parsed category page.

    Args:
        soup: Parsed page. Only ``soup.find_all('a')`` and, for each anchor,
            ``get('href')`` and ``text`` are used.

    Returns:
        A dict mapping each title (parenthesised parts removed) to the poem's
        absolute URL. Anchors without an ``href`` keep their falsy value so
        that ``fulltext`` skips them. A repeated title overwrites the earlier
        entry.
    """
    poetrydict = {}
    anchors = soup.find_all('a')[LEADING_LINKS_TO_SKIP:]
    for index, anchor in enumerate(anchors, start=1):
        href = anchor.get('href')
        if href in STOP_HREFS:
            break
        # urljoin avoids the doubled "//" of plain concatenation and leaves
        # hrefs that are already absolute untouched.
        url = urljoin(BASE_URL, href) if href else href
        title = _strip_parenthesised(str(anchor.text))
        print(index, title, url)
        poetrydict[title] = url
    return poetrydict


def fulltext(pdict):
    """Download and format the poem behind every link in *pdict*.

    Args:
        pdict: Mapping of title to poem URL, as returned by ``content``.

    Returns:
        A dict mapping each title to its formatted poem text, or to ``None``
        when the entry has no URL or the page could not be fetched or parsed.
    """
    import requests

    poetrydict = {}
    for title, url in pdict.items():
        if not url:
            poetrydict[title] = None
            continue
        try:
            poetrydict[title] = _extract_poem(_fetch_soup(url))
        except (requests.RequestException, IndexError) as err:
            # One bad page must not discard everything fetched so far.
            print(f"skipped {title!r} ({url}): {err}", file=sys.stderr)
            poetrydict[title] = None
    return poetrydict


def formattext(s):
    """Put each sentence or clause of a poem on its own line.

    A line break is inserted after ``。 ! ? : ;`` (ASCII or full-width), the
    marks ``「 」 《 》`` are removed, and surrounding whitespace is stripped.
    Every punctuation mark is kept as it is; a semicolon is no longer
    rewritten as a colon.

    Args:
        s: Poem text with whitespace already removed.

    Returns:
        The reformatted text.
    """
    broken = _LINE_END_PUNCT.sub(r"\1\n", s)
    return _QUOTE_MARKS.sub("", broken).strip()


def output(dict, text):  # noqa: A002  (parameter names kept for callers)
    """Write every title/poem pair to *text*, echo it to stdout, close *text*.

    Args:
        dict: Mapping of title to poem text (or ``None``).
        text: Writable text file object. It is closed before returning, even
            if writing fails part-way.
    """
    try:
        for title, poem in dict.items():
            print("####", title, "\n", poem, file=text)
            print(title, "\n", poem)
    finally:
        text.close()


def main():
    """Prompt for a category URL and a file name, then crawl and save."""
    # e.g. https://so.gushiwen.org/gushi/tangshi.aspx (right sidebar of the page)
    category_url = input("请输入(类别)链接:")
    file_stem = input("请输入文件名:")
    soup = _fetch_soup(category_url)
    target = OUTPUT_DIR + file_stem + ".md"
    with open(target, "w", encoding='utf-8') as handle:
        output(fulltext(content(soup)), handle)


if __name__ == '__main__':
    main()
# Original article: https://www.cnblogs.com/loeFairy/p/12244110.html
# Timestamp: 2024-10-11 05:29:35