Key points
""" 1) from gensim.model import Word2Vec import jieba 2) opencc :将繁体字转换为简体字 转换命令:opencc -i texts.txt -o test.txt -c t2s.json 3) 自然语言处理: 1、拼写检查、关键字检索 2、文本挖掘 3、文本分类 (二分类) 4、机器翻译 5、客服系统 6、复杂对话系统 4) p(S)=p(w1,w2,w3,w4,w5,…,wn) =p(w1)p(w2|w1)p(w3|w1,w2)...p(wn|w1,w2,...,wn-1) 计算方式:p(wi|w1,w2,...,wi-1) = p(w1,w2,...,wi-1,wi) / p(w1,w2,...,wi-1) 5) 语言模型问题:(1)数据过于稀疏 (2)参数空间太大 针对上述两个问题两种优化方法: 1)假设下一个词的出现依赖它前面的一个词: p(S)=p(w1)p(w2|w1)p(w3|w1,w2)...p(wn|w1,w2,...,wn-1)=p(w1)p(w2|w1)p(w3|w2)...p(wn|wn-1) 2)假设下一个词的出现依赖它前面的两个词: p(S)=p(w1)p(w2|w1)p(w3|w1,w2)...p(wn|w1,w2,...,wn-1)=p(w1)p(w2|w1)p(w3|w1,w2)...p(wn|wn-1,wn-2) 6) word2vec过程: 1、数据获取 2、分词 3、gensim的word2vec建模 4、model.similarity()进行预测 """
1、Processing the wiki corpus: convert traditional Chinese to simplified Chinese
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Extract plain-text articles from the zhwiki dump with gensim's WikiCorpus.
import logging
import os.path
import sys

from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = b' '
    i = 0

    output = open(outp, 'w', encoding='utf-8')
    # Note: the lemmatize argument was removed in newer gensim versions.
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        s = space.join(text)
        s = s.decode('utf8') + "\n"
        output.write(s)
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished Saved " + str(i) + " articles")

# python process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
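The traditional-to-simplified conversion itself is done with the opencc command shown in the notes above. As an alternative, a minimal sketch of the same step in Python, assuming the opencc Python package is installed; the file names follow the post's convention but are placeholders, and depending on the package version the config may need to be 't2s.json' instead of 't2s':

from opencc import OpenCC

cc = OpenCC('t2s')  # t2s: traditional Chinese -> simplified Chinese

with open('wiki.zh.text', 'r', encoding='utf-8') as fin, \
     open('wiki.zh.jian.text', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(cc.convert(line))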
2、Segment the simplified-Chinese corpus with jieba
import codecs
import sys

import jieba
import jieba.analyse
import jieba.posseg as pseg


def cut_words(sentence):
    # print(sentence)
    return " ".join(jieba.cut(sentence)).encode('utf-8')


f = codecs.open('wiki.zh.jian.text', 'r', encoding="utf8")
target = codecs.open("zh.jian.wiki.seg-1.3g.txt", 'w', encoding="utf8")
print('open files')

line_num = 1
line = f.readline()
while line:
    print('---- processing ', line_num, ' article----------------')
    line_seg = " ".join(jieba.cut(line))   # segment one article per line
    target.writelines(line_seg)
    line_num = line_num + 1
    line = f.readline()

f.close()
target.close()
exit()

# python Testjieba.py
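To make the segmentation step concrete, here is a quick illustration of what jieba.cut returns; the example sentence is just for demonstration and is not taken from the corpus:

import jieba

# jieba.cut returns a generator of tokens; joining them with spaces produces the
# space-separated format that gensim's LineSentence expects in the next step.
sentence = "自然语言处理很有趣"
print(" ".join(jieba.cut(sentence)))
# e.g. "自然语言 处理 很 有趣" (exact segmentation depends on the jieba dictionary)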
3、Build the word2vec model
import logging
import multiprocessing
import os.path
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    # check and process input arguments
    if len(sys.argv) < 4:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # LineSentence streams the segmented corpus one line (= one article) at a time.
    # Note: size= is the pre-gensim-4.0 parameter name; newer versions use vector_size=.
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())

    model.save(outp1)                                    # full model (can be trained further)
    model.wv.save_word2vec_format(outp2, binary=False)   # plain-text word vectors

# python word2vec_model.py zh.jian.wiki.seg.txt wiki.zh.text.model wiki.zh.text.vector
# opencc -i wiki_texts.txt -o test.txt -c t2s.json
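Because the vectors are also saved in plain-text word2vec format, they can be reloaded without the full model. A minimal sketch, assuming the output file name from the command above:

from gensim.models import KeyedVectors

# Load only the word vectors written by save_word2vec_format (no training state).
wv = KeyedVectors.load_word2vec_format('wiki.zh.text.vector', binary=False)
print(wv.most_similar('数学'))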
4、Load the model and predict
from gensim.models import Word2Vec

en_wiki_word2vec_model = Word2Vec.load('wiki.zh.text.model')

testwords = ['苹果', '数学', '学术', '白痴', '篮球']
for i in range(5):
    # most_similar returns the words closest to the query word in the vector space
    res = en_wiki_word2vec_model.most_similar(testwords[i])
    print(testwords[i])
    print(res)
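The notes at the top also mention model.similarity() for prediction; a minimal sketch of comparing two specific words (the word pair is illustrative only):

# Cosine similarity between two words from the trained model.
# (On gensim >= 4.0 the call would be en_wiki_word2vec_model.wv.similarity(...).)
print(en_wiki_word2vec_model.similarity('苹果', '篮球'))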
Original article: https://www.cnblogs.com/ywjfx/p/11002561.html