# -*- coding: utf-8 -*-
# author: huihui
# date: 2020/1/31 7:58 PM
"""
Train word vectors from a corpus and save the resulting vector files.
"""
import os
import sys
import gensim
# Python 2 only: force the interpreter-wide default encoding to UTF-8.
# The original called `os.reload(sys)`, which does not exist (the `os`
# module has no `reload`); the intended idiom is the Python 2 builtin
# `reload(sys)`. Neither `reload` nor `sys.setdefaultencoding` exists on
# Python 3, so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2 only
    sys.setdefaultencoding('utf-8')

# The corpus must already be word-segmented before it is read here.
input_file = "corp_seg.txt"
sentences = gensim.models.word2vec.Text8Corpus(input_file)

# Train the word vectors.
# NOTE(review): `size` is the gensim 3.x keyword for the vector
# dimensionality; gensim 4.x renamed it to `vector_size` -- confirm against
# the installed gensim version.
model = gensim.models.word2vec.Word2Vec(
    sentences,
    sg=1,
    size=100,
    window=5,
    min_count=1,
    negative=3,
    sample=0.001,
    hs=1,
    workers=40,
)

# Save the trained model, then export the vectors in word2vec text format.
model.save("corp_word2vec.model")
model.wv.save_word2vec_format("corp_word2vec.txt")

# Load the saved files back. Both load styles are shown; the second
# assignment rebinds `model`, so afterwards it refers to the object loaded
# from the word2vec-format text file rather than the full model.
model = gensim.models.word2vec.Word2Vec.load("corp_word2vec.model")
model = gensim.models.KeyedVectors.load_word2vec_format("corp_word2vec.txt")
# Original article: https://www.cnblogs.com/xuehuiping/p/12246510.html
# Retrieved: 2024-09-29 23:11:18