最近在做词语相似度的比较,于是选用了 gensim。
首先要安装gensim库,此处省略,参看官网http://radimrehurek.com/gensim/install.html
在网上下载了一些语料(每行一条 JSON 记录),示例如下:
{"date": "2016-05-01", "content": "京东家电 沸腾五一\n买家电 上京东4.28-5.7\n邯郸京东帮联动会\n馆陶站 大型文艺汇演\n五一会员狂欢会,凭券入场更实惠\n买家电 ,上京东,突破底线 限时抢购!\n京东帮\n馆陶文卫街服务店\n大型文艺汇演现场\n活动日期2}
"""Rank the documents of a JSON-lines corpus by TF-IDF similarity to a query.

Each line of ``xaa.json`` is parsed as a JSON object and its ``"content"``
field is tokenised with jieba. A gensim dictionary and TF-IDF model are built
from those tokens at import time. The interactive loop then reads a query,
projects it into the same TF-IDF space and prints every document index with
its similarity score, best match first.
"""
import io
import json
import sys

import jieba
from gensim import corpora, models, similarities

if sys.version_info[0] == 2:
    # Python 2 only: the original script forced the process-wide default
    # encoding to UTF-8; kept under a version guard for backward compatibility.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding("utf-8")
    _read_line = raw_input  # noqa: F821 -- builtin on Python 2
else:
    _read_line = input

# One entry per corpus document: its tokens joined by single spaces.
alist = []


def fenci(path="xaa.json"):
    """Tokenise every record of the JSON-lines file and append it to ``alist``.

    Args:
        path: File with one JSON object per line; each object must have a
            ``"content"`` key holding the text to tokenise.

    Raises:
        IOError / OSError: if ``path`` cannot be opened.
        ValueError: if a non-blank line is not valid JSON.
        KeyError: if a record has no ``"content"`` key.
    """
    # ``io.open`` gives the same decoded-text behaviour on Python 2 and 3, and
    # the ``with`` block closes the handle the original left open.
    with io.open(path, encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                # A blank (e.g. trailing) line would make json.loads raise.
                continue
            record = json.loads(line)
            # Full mode (cut_all=True) emits every dictionary word jieba finds.
            tokens = jieba.cut(record["content"], cut_all=True)
            alist.append(" ".join(tokens))


fenci()


class MyCorpus(object):
    """Iterable view over ``alist`` that yields each document as a token list."""

    def __iter__(self):
        for joined in alist:
            yield joined.split(" ")


Corp = MyCorpus()
dictionary = corpora.Dictionary(Corp)
corpus = [dictionary.doc2bow(text) for text in Corp]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Dense similarity index over ``corpus_tfidf``; built lazily on first query.
_index = None


def _similarity_index():
    """Return the similarity index, building it once instead of per query."""
    global _index
    if _index is None:
        _index = similarities.MatrixSimilarity(
            corpus_tfidf, num_features=len(dictionary)
        )
    return _index


def test_kk(test):
    """Print ``(document_index, similarity)`` pairs for ``test``, best first.

    Args:
        test: Query text to compare against every corpus document.
    """
    # NOTE(review): the corpus is tokenised with cut_all=True while the query
    # uses jieba's default mode -- confirm this mismatch is intended.
    # Joining and re-splitting drops whitespace-only tokens from the query.
    query_tokens = " ".join(jieba.cut(test)).split()
    query_tfidf = tfidf[dictionary.doc2bow(query_tokens)]
    sims = _similarity_index()[query_tfidf]
    ranked = sorted(enumerate(sims), key=lambda item: -item[1])
    print(ranked)


def buss_mian():
    """Prompt for queries until EOF or Ctrl-C, printing the ranking for each."""
    while True:
        try:
            test = _read_line("please input test:")
        except (EOFError, KeyboardInterrupt):
            # Leave the loop quietly instead of dumping a traceback.
            break
        test_kk(test)


if __name__ == "__main__":
    buss_mian()

# Author's closing note: feedback and discussion are welcome.
时间: 2024-10-29 10:48:17