学会了怎么使用os模块
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Scrape articles from a magazine index page and save each one as a UTF-8 .txt file.

Fixes over the original paste: curly quotes (U+2018) replaced with real
quotes so the file actually parses; Python 2 `print` statements and
`.encode("utf-8")` byte-writes replaced with Python 3 text I/O; the output
file is opened with `with` so it is closed even on error.
"""

import os

import requests
from bs4 import BeautifulSoup

# NOTE(review): `baseurl` and `time` are used below but are not defined in
# this chunk -- they must be supplied by the surrounding module. Confirm
# against the full source.


def urlBS(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup document.

    requests auto-detects the encoding incorrectly for this site, so UTF-8
    is forced before the text is handed to the parser.
    """
    response = requests.get(url)
    response.encoding = 'utf-8'  # override requests' (wrong) detected encoding
    soup = BeautifulSoup(response.text, 'lxml')
    return soup


def get_article(url):
    """Download every article linked from the index page at *url*.

    For each link matched by the ``.booklist a`` selector, fetches the
    article page and writes ``<title>.txt`` (title, author line, then the
    paragraph bodies) into a per-run directory under the current working
    directory.
    """
    booklist_soup = urlBS(url)
    herf_list = booklist_soup.select('.booklist a')

    # Output directory under the current working directory; create it once.
    path = os.getcwd() + u'/读者文章保存' + time + u'/'
    if not os.path.isdir(path):
        os.mkdir(path)

    for herf in herf_list:
        newurl = baseurl + herf['href']
        result = urlBS(newurl)
        title = result.find("h1").string                      # article title
        filename = path + title + '.txt'
        author = result.find(id='pub_date').string.strip()    # strip surrounding whitespace
        print(filename + ' ' + author)

        # Text-mode UTF-8 write replaces the original's manual
        # `.encode("utf-8")` calls; `with` guarantees the handle is closed.
        with open(filename, "w", encoding="utf-8") as new:
            new.write("<<" + title + ">>\n\n")
            new.write(author + "\n\n")
            # Use p.text rather than p.string.strip(): .text preserves the
            # paragraph's internal line breaks, which .strip() collapsed
            # (noted in the original author's own comment).
            text = result.select(".blkContainerSblkCon p")
            for p in text:
                new.write(p.text)
时间: 2024-10-08 01:13:46