bilibili弹幕词云
美国历史词云
结巴分词
import jieba
txt=" **** "
精确模式:
res = jieba.cut(txt)
for i in res:
    print(i)
全模式:
res = jieba.cut(txt, cut_all=True)
for i in res:
    print(i)
搜索模式:
res = jieba.cut_for_search(txt)
for i in res:
    print(i)
或者
res = jieba.lcut(txt)  # 精确模式
res = jieba.lcut(txt, cut_all=True)  # 全模式
res = jieba.lcut_for_search(txt)  # 搜索模式
print(res)
词云
1.小段文字的词云
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from imageio import imread
txt="小段文字"
color_mask=imread('图片路径')
wc=WordCloud(
    width= ** ,
    height= ** ,
    background_color= ** ,
    font_path=r'c:\windows\Fonts\***',
    mask=color_mask
)
wc.generate( txt )
wc.to_file('cy.png')
plt.imshow(wc)
plt.show()
2.文本文件的词云
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from imageio import imread
f=open('文本文件名.txt',encoding='utf8')
data=f.read()
result=" ".join(jieba.lcut(data))
color_mask=imread('图片名.jpg')
wc=WordCloud(
    height=**,
    width=**,
    font_path=r'c:\windows\Fonts\***',
    mask=color_mask
)
wc.generate(result)
wc.to_file('图片名.png')
plt.imshow(wc)
plt.show()
案例()
1.爬取bilibili弹幕
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import re
header={
    'User-Agent':'*********'
}
url='http://comment.bilibili.com/codecid.xml'
#向对方服务器发送请求
response=requests.get(url=url,headers=header)
#设置字符码
response.encoding = response.apparent_encoding
#获取文本
data=response.text
#解析
soup=BeautifulSoup(data,'lxml')
#获取所有的d标签
d_list=soup.find_all('d')
dlist=[]
#循环所有的d标签
for d in d_list:
    danmu={}
    danmu['弹幕']=d.text #获取文本信息
    danmu['时间']=datetime.datetime.now()
    danmu['地址']=url
    dlist.append(danmu)
#转换为二维数组,类似于excel表格
df=pd.DataFrame(dlist)
f=open('sign.txt','w',encoding='utf8')#打开文件
#循环所有的文件信息
for i in df['弹幕'].values:
    pat=re.compile(r'[一-龥]+')#定义过滤的规则(所有的汉字)
    filter_data=re.findall(pattern=pat,string=i)#执行过滤操作
    f.write("".join(filter_data))#写入文本
f.close()
2.bilibili弹幕分析
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from imageio import imread
f=open('sign.txt','r',encoding='utf8')
data=f.read()
result=" ".join(jieba.lcut(data))
f.close()
color_mask=imread('图片名称.jpg')
wc=WordCloud(
    height=***,
    width=***,
    background_color='**',
    font_path=r'c:\windows\Fonts\***',
    mask=color_mask
)
wc.generate(result)
wc.to_file('bilibili.png')
plt.imshow(wc)
plt.show()
————————godlover
原文地址:https://www.cnblogs.com/godlover/p/11754311.html