第一部分 引言
关于隐马尔柯夫模型的详细内容在此就不详细介绍了,介绍HMM模型的文章很多,请读者自行去学习。二阶隐马尔柯夫模型解决问题有两个假设:其一是当前的状态仅与它前面相邻的状态有关;其二是状态转换和从某个状态发射某个观察符号的概率与时间t无关(即不动性假设)。HMM是在这两个假设的前提下解决各种各样的问题的。
对于第二个假设,我们不去讨论它。现在来看第一个假设,二阶马尔柯夫过程假设当前状态仅与前面相邻的一个状态有关,那么对于分词来说,有些词可能会满足这样的情况,但也有可能会有些词并不这么简单的处理,他们可能和前面的数个状态有关。基于以上考虑,本文就二阶马尔柯夫过程和三阶马尔柯夫过程对中文分词做了一个实验,验证二者哪个更适合于中文的分词。
第二部分 试验结果
语料来源:1998年1月《人民日报》
预处理:首先将原始语料去掉段首的编号,然后将每个句子作为一行,编码转换为utf-8格式。最后按照9:1的比例将语料分为训练语料train.txt和测试语料test.txt。
分别用二阶马尔柯夫过程和三阶马尔柯夫过程进行分词。试验结果如下图所示:
从上图中可以看出,
(1)三阶HMM的分词准确率和召回率均优于二阶HMM。
(2)三阶HMM的切分词的总数要少于二阶HMM,但是切分正确的数量要大于二阶HMM。换句话说,二阶HMM倾向于将句子切的更细,而三阶HMM会比较保守,将句子切分出来的词少一些。
(3)另外,对于交集型的歧义,三阶HMM明显要少于二阶HMM。说明马尔柯夫链增长之后,对于减少交集型歧义是很有作用的。
第三部分 源代码
以下为本次实验过程的源代码,好多代码和之前讲述HMM分词的帖子里面的代码是一样的,只是一少部分作了修改,为了本文内容的完整性,我还是把代码全都在粘一遍把。注意之前的代码处理的是gb2312格式的文本,本文的代码文本本的格式要转换为utf-8哦,要不然,你把gb2312读进去出现了很多乱码,别说我没告诉你哦····
(1)文件名:util.h。下面好几个文件都要用到该文件,如将测试文件中的/去掉
#ifndef UTIL_H #define UTIL_H #include <string> using namespace std; /* * 函数功能:将字符串中的所有特定子串置换为新的字符串 * 函数输入:str 需要进行操作的字符串 * old_str 旧的字符串 * new_str 新的字符串 * 函数输出:置换完毕的字符串 */ string& replace_all(string &str, string old_str, string new_str){ while(1){ string::size_type pos(0); if((pos = str.find(old_str)) != string::npos){ str.replace(pos, old_str.length(), new_str); }else{ break; } } return str; } #endif
(2)文件名:prehmm.cpp。对文件进行预处理工作,函数的功能请参见代码中的注释。
#include <iostream> #include <fstream> #include <sstream> #include <string> #include <cstdlib> #include <map> #include "util.h" using namespace std; /* * 函数功能:将训练语料和测试语料中出现的汉字进行编码,将他们的对应关系存入文件 * 格式为:汉字-编码,编码从0开始 * 函数输入:infile_1 训练语料文件名 * infile_2 测试语料文件名 * outfile 指定的输出文件名 * 函数输出:名为outfile的文件 */ void makeDB(string infile_1, string infile_2, string outfile){ //读取输入文件 ifstream fin_1(infile_1.c_str()); ifstream fin_2(infile_2.c_str()); if(!(fin_1 && fin_2)){ cerr << "makeDB : Open input file fail !" << endl; exit(-1); } //打开输出文件 ofstream fout(outfile.c_str()); if(!fout){ cerr << "makeDB : Open output file fail !" << endl; exit(-1); } map<string, int> map_cchar; int id = -1; string line = ""; string cchar = ""; //读取输入文件内容 while(getline(fin_1, line)){ line = replace_all(line, "/", ""); if(line.size() >= 3){ //逐字读取 for(int i = 0; i < line.size() - 2; i += 3){ cchar = line.substr(i, 3); if(map_cchar.find(cchar) == map_cchar.end()){ ++id; map_cchar[cchar] = id; } } } } while(getline(fin_2, line)){ line = replace_all(line, "/", ""); if(line.size() >= 3){ //逐字读取 for(int i = 0; i < line.size() - 2; i += 3){ cchar = line.substr(i, 3); if(map_cchar.find(cchar) == map_cchar.end()){ ++id; map_cchar[cchar] = id; } } } } //输出到文件 map<string, int>::iterator iter; for(iter = map_cchar.begin(); iter != map_cchar.end(); ++iter){ //cout << iter -> first << " " << iter -> second << endl; fout << iter -> first << " " << iter -> second << endl; } fin_1.close(); fin_2.close(); fout.close(); } /* * 函数功能:将训练语料每个汉字后面加入对应的BMES状态 * 函数输入:infile 训练语料文件名 * outfile 指定的输出文件名 * 函数输出:名为outfile的文件 */ void makeBMES(string infile, string outfile){ ifstream fin(infile.c_str()); ofstream fout(outfile.c_str()); if(!(fin && fout)){ cerr << "makeBMES : Open file failed !" << endl; exit(-1); } string word_in = ""; string word_out = ""; string line_in = ""; string line_out = ""; while(getline(fin, line_in)){ if(line_in.size() >= 3){ line_out.clear(); line_in = replace_all(line_in, "/", " "); istringstream strstm(line_in); while(strstm >> word_in){ word_out.clear(); if(word_in.size()%3 != 0){ cout << "单词不符合要求:" << word_in << endl; continue; } int num = word_in.size()/3; //单词中包含多少个汉字 if(num == 0){ continue; } if(num == 1){ word_out = word_in; word_out += "/S"; }else{ //复制单词中的第一个字 word_out.insert(word_out.size(), word_in, 0, 3); word_out += "/B"; //逐个复制单词中间的字 for(int i = 1; i < num - 1; i++){ word_out.insert(word_out.size(), word_in, 3*i, 3); word_out += "/M"; } //复制单词中最后的汉字 word_out.insert(word_out.size(), word_in, 3*num - 3, 3); word_out += "/E"; } line_out += word_out; } fout << line_out << endl; } } } /* * 主函数 */ int main(int argc, char *argv[]){ if(argc < 5){ cout << "Usage: " << argv[0] << " train_file test_file db_file bmes_file" << endl; exit(-1); } //构造DB文件,输入训练语料、测试语料、输出文件名 makeDB(argv[1], argv[2], argv[3]); //构造BMES文件,输入训练语料、输出文件名 makeBMES(argv[1], argv[4]); }
(3)文件名:db.h。将汉字和编码的映射文件内存,构造为map,供其他程序使用
#ifndef DB_H #define DB_H #include <iostream> #include <fstream> #include <map> #include <vector> #include <cstdlib> #include "util.h" using namespace std; /* * 转换类,获取编号 */ class DB{ private: map<string, int> cchar_map; //汉字-编码映射 map<int, string> index_map; //编码-汉字映射 public: DB(); DB(string file); string getCchar(int id); //根据编码获得汉字 int getObservIndex(string cchar); //根据汉字获得编码 int getStateIndex(char state); //根据状态获得状态编号 vector<int> makeObservs(string line); //将输入的句子构造为发射符号序列 }; //无参构造函数 DB::DB(){ } //有参构造函数 DB::DB(string file){ ifstream fin(file.c_str()); if(!fin){ cout << "Open input file fail ! Can't init Trans !" << endl; exit(-1); } string line = ""; string word = ""; string cchar = ""; int id = 0; while(getline(fin, line)){ istringstream strstm(line); strstm >> word; cchar = word; strstm >> word; id = atoi(word.c_str()); //加入map cchar_map[cchar] = id; index_map[id] = cchar; } cout << "cchar_map大小: " << cchar_map.size() << endl; cout << "index_map大小: " << index_map.size() << endl; } //将状态转换为数字编号 int DB::getStateIndex(char state){ switch(state){ case 'B' : return 0; break; case 'M' : return 1; break; case 'E' : return 2; break; case 'S' : return 3; break; default : return -1; break; } } //将汉字转换为数字编号 int DB::getObservIndex(string cchar){ map<string, int>::iterator iter = cchar_map.find(cchar); if(iter != cchar_map.end()){ return iter -> second; }else{ return -1; } } //将数字编号转换为汉字 string DB::getCchar(int id){ map<int, string>::iterator iter = index_map.find(id); if(iter != index_map.end()){ return iter -> second; }else{ return NULL; } } //将输入的句子构造为发射符号序列 vector<int> DB::makeObservs(string line){ vector<int> vec_observ; //输出符号的集合 string cchar = ""; //存放每个汉字 string word = ""; //存放一个单词 int num = 0; //单词的字数 int index = -1; //单词对应的编号 line = replace_all(line, "/", " "); cout << line << endl; istringstream strstm(line); while(strstm >> word){ if(word.size()%3 != 0){ cout << "单词不符合要求:" << word << endl; continue; } num = word.size()/3; if(num == 0){ continue; }else{ for(int i = 0; i < num; i++){ cchar = word.substr(3*i, 3); index = getObservIndex(cchar); vec_observ.push_back(index); //cout << "cchar = " << cchar << " index = " << index << endl; } } } return vec_observ; } #endif
(4)文件名:matrix.cpp。用最大似然估计的方法建立HMM的模型参数。
#include <iostream> #include <fstream> #include <sstream> #include <string> #include <iomanip> #include <cmath> #include <list> #include "db.h" using namespace std; const int N = 4; //隐藏状态的数目 const int M = 4677; //汉字的个数 const double VALUE = 1.0; //平滑算法增加的值 //定义字典对象 DB db("db.txt"); /* * 模型训练,将频数转换为频率(加1平滑) */ void turingAdd(const int count[], double prob[], int len){ double sum = 0.0; for(int i = 0; i < len; ++i){ sum += count[i]; } sum = sum + VALUE * len; for(int i = 0; i < len; ++i){ prob[i] = -log((count[i] + VALUE) / sum);//取对数 } } /* * 模型训练,将发射频数转换为频率(古德-图灵平滑) */ void turingGood(const int count[], double prob[], int len){ map<int, list<int> > freq_map; //key为词频,value为该词频对应的汉字列表 map<int, list<int> >::iterator iter; //迭代器 int sum = 0; //词频总和 //初始化freq_map for(int i = 0; i < len; i++){ int freq = count[i]; //词频 sum += freq; iter = freq_map.find(freq); if(iter != freq_map.end()){ //该词频已经存在,把当前词加入相应的list freq_map[freq].push_back(i); }else{ //该词频不存在,建立对应的汉字list list<int> lst; lst.push_back(i); freq_map[freq] = lst; } } //若sum=0,则结果初始化为0.0即可 if(sum == 0){ for(int i = 0; i < len; i++){ prob[i] = 0.0; } return; } //数据平滑处理 iter = freq_map.begin(); while(iter != freq_map.end()){ double pr; //频率 int freq = iter -> first; int freqsize = iter -> second.size(); if(++iter != freq_map.end()){ int freq_2 = iter -> first; if(freq_2 = freq + 1){ int freqsize_2 = iter -> second.size(); pr = ((1.0 + freq) * freqsize_2) / (sum * freqsize); }else{ pr = 1.0 * freq / sum; } }else{ pr = 1.0 * freq / sum; } //计算结果 list<int> lst = (--iter) -> second; list<int>::iterator iter_in = lst.begin(); while(iter_in != lst.end()){ int index = *iter_in; prob[index] = pr; ++iter_in; } //准备下次迭代 ++iter; } //概率归一化 double total = 0.0; for(int i = 0; i < len; i++){ total += prob[i]; } for(int i = 0; i < len; i++){ prob[i] = -log((double)prob[i] / total);//取对数 } } /* * 主函数,生成HMM模型的参数 * 状态转移概率矩阵、初始状态概率矩阵、符号发射概率矩阵 */ int main(int argc, char *argv[]){ if(argc < 2){ cout << "Usage: " << argv[0] << " bmes_file !" << endl; exit(-1); } ifstream fin(argv[1]); if(!fin){ cerr << "Open input file " << argv[1] << "filed !" << endl; exit(-1); } int Pi[N] = {0}; //初始状态出现次数 int A1[N][N] = {0}; //二阶状态转移次数 int A2[N][N][N] = {0}; //三阶状态转移次数 int B1[N][M] = {0}; //二阶符号发射次数 int B2[N][N][M] = {0}; //三阶符号发射次数 //抽取文件中的状态和观察值 string line = ""; //存放每一行的内容 int line_num = 0; //句子编号 int count = 0; while(getline(fin, line)){ line_num++; char state; //状态 string cchar = ""; //一个汉字 int i, j, k, m; string::size_type pos = 0; //当前处理位置 if((pos = line.find("/", pos + 1)) != string::npos){ //抽取句子的第一个状态 state = line.at(pos + 1); i = db.getStateIndex(state); Pi[i]++; //抽取句子的第一个观察值 cchar = line.substr(pos - 3, 3); m = db.getObservIndex(cchar); B1[i][m]++; if((pos = line.find("/", pos + 1)) != string::npos){ //抽取句子的第二个状态 state = line.at(pos + 1); j = db.getStateIndex(state); A1[i][j]++; //抽取句子的第二个观察值 cchar = line.substr(pos - 3, 3); m = db.getObservIndex(cchar); B1[j][m]++; B2[i][j][m]++; while((pos = line.find("/", pos + 1)) != string::npos){ //抽取句子的其他状态 state = line.at(pos + 1); k = db.getStateIndex(state); A1[j][k]++; A2[i][j][k]++; //抽取句子的其他观察值 cchar = line.substr(pos - 3, 3); m = db.getObservIndex(cchar); B1[k][m]++; B2[j][k][m]++; //准备下次迭代 i = j; j = k; } } } } fin.close(); //打开输出流 ofstream fout_1("Pi.mat"); //初始概率矩阵 ofstream fout_2("A1.mat"); //二阶状态转移矩阵 ofstream fout_3("A2.mat"); //三阶状态转移矩阵 ofstream fout_4("B1.mat"); //二阶发射概率矩阵 ofstream fout_5("B2.mat"); //三阶发射概率矩阵 if(!(fout_1 && fout_2 && fout_3 && fout_4 && fout_5)){ cerr << "Create Matrix file failed !" << endl; exit(-1); } fout_1 << setprecision(8); fout_2 << setprecision(8); fout_3 << setprecision(8); fout_4 << setprecision(8); fout_5 << setprecision(8); //初始状态矩阵写入文件 double arr_pi[N] = {0.0}; //turingGood(Pi, arr_pi, N); turingAdd(Pi, arr_pi, N); for(int i = 0; i < N; i++){ fout_1 << arr_pi[i] << "\t"; } fout_1 << endl; //二阶状态转移矩阵写入文件 double arr_a_1[N] = {0.0}; for(int i = 0; i < N; i++){ //turingGood(A1[i], arr_a_1, N); turingAdd(A1[i], arr_a_1, N); for(int j = 0; j < N; j++){ fout_2 << arr_a_1[j] << "\t"; } fout_2 << endl; } //三阶状态转移矩阵写入文件 double arr_a_2[N] = {0.0}; for(int i = 0; i < N; i++){ for(int j = 0; j < N; j++){ //turingGood(A2[i][j], arr_a_2, N); turingAdd(A2[i][j], arr_a_2, N); for(int k = 0; k < N; k++){ fout_3 << arr_a_2[k] << "\t"; } fout_3 << endl; } } //二阶发射概率矩阵写入文件 double arr_b_1[M] = {0.0}; for(int i = 0; i < N; i++){ //turingGood(B1[i], arr_b_1, M); turingAdd(B1[i], arr_b_1, M); for(int j = 0; j < M; j++){ fout_4 << arr_b_1[j] << "\t"; } fout_4 << endl; } //三阶发射概率矩阵写入文件 double arr_b_2[M] = {0.0}; for(int i = 0; i < N; i++){ for(int j = 0; j < N; j++){ //turingGood(B2[i][j], arr_b_2, M); turingAdd(B2[i][j], arr_b_2, M); for(int k = 0; k < M; k++){ fout_5 << arr_b_2[k] << "\t"; } fout_5 << endl; } } fout_1.close(); fout_2.close(); fout_3.close(); fout_4.close(); fout_5.close(); return 0; }
(5)文件名:hmm.h。将存储在文件中的HMM的模型参数读取到内存中,构造为一个HMM对象,供其他程序使用。
#ifndef HMM_H #define HMM_H #include <fstream> #include <sstream> #include <string> #include <cstdlib> const int N = 4; const int M = 4677; using namespace std; //定义HMM模型 class HMM{ public: int n; //状态数目 int m; //可能的观察符号数目 double Pi[N]; //初始状态概率 double A1[N][N]; //状态转移概率矩阵 double A2[N][N][N]; //状态转移概率矩阵 double B1[N][M]; //符号发射概率矩阵 double B2[N][N][M]; //符号发射概率矩阵 HMM(); HMM(string f_pi, string f_a1, string f_a2, string f_b1, string f_b2); }; //无参构造函数 HMM::HMM(){ } //有参构造函数 HMM::HMM(string f_pi, string f_a1, string f_a2, string f_b1, string f_b2){ ifstream fin_1(f_pi.c_str()); ifstream fin_2(f_a1.c_str()); ifstream fin_3(f_a2.c_str()); ifstream fin_4(f_b1.c_str()); ifstream fin_5(f_b2.c_str()); if(!(fin_1 && fin_2 && fin_3 && fin_4 && fin_5)){ exit(-1); } n = N; m = M; string line = ""; string word = ""; //读取Pi getline(fin_1, line); istringstream strstm_1(line); for(int i = 0; i < N; i++){ strstm_1 >> word; Pi[i] = atof(word.c_str()); } //读取A1 for(int i = 0; i < N; i++){ getline(fin_2, line); istringstream strstm_2(line); for(int j = 0; j < N; j++){ strstm_2 >> word; A1[i][j] = atof(word.c_str()); } } //读取A2 for(int i = 0; i < N; i++){ for(int j = 0; j < N; j++){ getline(fin_3, line); istringstream strstm_3(line); for(int k = 0; k < N; k++){ strstm_3 >> word; A2[i][j][k] = atof(word.c_str()); } } } //读取B1 for(int i = 0; i < N; i++){ getline(fin_4, line); istringstream strstm_4(line); for(int j = 0; j < M; j++){ strstm_4 >> word; B1[i][j] = atof(word.c_str()); } } //读取B2 for(int i = 0; i < N; i++){ for(int j = 0; j < N; j++){ getline(fin_5, line); istringstream strstm_5(line); for(int k = 0; k < M; k++){ strstm_5 >> word; B2[i][j][k] = atof(word.c_str()); } } } fin_1.close(); fin_2.close(); fin_3.close(); fin_4.close(); fin_5.close(); } #endif
(6)文件名:viterbi.cpp。维特比算法,用于分词。这是最核心的部分哦···
#include <iostream> #include <fstream> #include <sstream> #include <string> #include <stack> #include "hmm.h" #include "db.h" using namespace std; HMM hmm("Pi.mat", "A1.mat", "A2.mat", "B1.mat", "B2.mat"); //初始化HMM模型 DB db("db.txt"); //初始化字典 /* * Viterbi算法进行分词,二阶马尔柯夫过程 */ string viterbiTwo(string str_in){ //计算输入句子中的汉字个数 int row = str_in.size() / 3; string str_out = ""; //如果输入字符串为空,则直接返回空 if(row == 0){ return str_out; } //如果只有一个字的话,则直接输出即可 if(row < 2){ str_out = str_in + "/"; return str_out; } //分配矩阵空间 double **delta = new double *[row]; int **path = new int *[row]; for(int i = 0; i < row; i++){ delta[i] = new double[N](); path[i] = new int[N](); } //中间变量 string cchar = ""; //存放汉字 int min_path = -1; double val = 0.0; double min_val = 0.0; //初始化矩阵,给delta和path矩阵的第一行赋初值 cchar = str_in.substr(0, 3); int cchar_num = db.getObservIndex(cchar); for(int i = 0; i < N; i++){ delta[0][i] = hmm.Pi[i] + hmm.B1[i][cchar_num]; //对数 path[0][i] = -1; } //给delta和path的后续行赋值(对数) for(int t = 1; t < row; t++){ cchar = str_in.substr(3*t, 3); cchar_num = db.getObservIndex(cchar); for(int j = 0; j < N; j++){ min_val = 100000.0; min_path = -1; for(int i = 0; i < N; i++){ val = delta[t-1][i] + hmm.A1[i][j]; if(val < min_val){ min_val = val; min_path = i; } } delta[t][j] = min_val + hmm.B1[j][cchar_num]; path[t][j] = min_path; } } //找delta矩阵最后一行的最大值 min_val = 100000.0; min_path = -1; for(int i = 0; i < N; i++){ if(delta[row-1][i] < min_val){ min_val = delta[row-1][i]; min_path = i; } } //从min_path出发,回溯得到最可能的路径 stack<int> path_st; path_st.push(min_path); for(int i = row - 1; i > 0; i--){ min_path = path[i][min_path]; path_st.push(min_path); } //释放二维数组 for(int i = 0; i < row; i++){ delete []delta[i]; delete []path[i]; } delete []delta; delete []path; //根据标记好的状态序列分词 int pos = 0; int index = -1; while(!path_st.empty()){ index = path_st.top(); path_st.pop(); str_out.insert(str_out.size(), str_in, pos, 3); if(index == 2 || index == 3){ //状态为E或S str_out.append("/"); } pos += 3; } } /* * Viterbi算法进行分词:三阶马尔柯夫过程 */ string viterbiThree(string str_in){ //计算输入句子中的汉字个数 int row = str_in.size() / 3; string str_out = ""; //如果输入字符串为空,则直接返回空 if(row == 0){ return str_out; } //如果只有一个字的话,则直接输出即可 if(row < 2){ str_out = str_in + "/"; return str_out; } //分配矩阵空间 double ***delta = new double **[row]; int ***path = new int **[row]; for(int i = 0; i < row; i++){ delta[i] = new double *[N]; path[i] = new int *[N]; for(int j = 0; j < N; j++){ delta[i][j] = new double[N]; path[i][j] = new int[N]; for(int k = 0; k < N; k++){ delta[i][j][k] = 0.0; path[i][j][k] = 0; } } } //初始化矩阵,给delta和path矩阵的第1个面赋初值 //初始状态需要两个面,第0面不赋值,只给第1个面赋值 string cchar_1 = str_in.substr(0, 3); //第1个字 string cchar_2 = str_in.substr(3, 3); //第2个字 int num_1 = db.getObservIndex(cchar_1); //第1个字的编号 int num_2 = db.getObservIndex(cchar_2); //第2个字的编号 for(int i = 0; i < N; i++){ for(int j = 0; j < N; j++){ delta[1][i][j] = hmm.Pi[i] + hmm.B1[i][num_1] + hmm.A1[i][j] + hmm.B2[i][j][num_2]; //对数 path[1][i][j] = -1; } } //中间变量 string cchar_3 = ""; //存放汉字 int min_path = -1; double val = 0.0; double min_val = 0.0; //给delta和path的后续面赋值(对数) //第0、1面为初始面,后续面从2开始,到row-1为止 for(int t = 2; t < row; t++){ cchar_3 = str_in.substr(3*t, 3); int num_3 = db.getObservIndex(cchar_3); for(int j = 0; j < N; j++){ for(int k = 0; k < N; k++){ min_val = 100000.0; min_path = -1; for(int i = 0; i < N; i++){ val = delta[t-1][i][j] + hmm.A2[i][j][k]; if(val < min_val){ min_val = val; min_path = i; } } delta[t][j][k] = min_val + hmm.B2[j][k][num_3]; path[t][j][k] = min_path; } } } //找delta矩阵最后一个面的最大值,最后一个面为row-1 min_val = 100000.0; int min_path_i = -1; int min_path_j = -1; for(int i = 0; i < N; i++){ for(int j = 0; j < N; j++){ if(delta[row-1][i][j] < min_val){ min_val = delta[row-1][i][j]; min_path_i = i; min_path_j = j; } } } //从min_path_i和min_path_j出发,回溯得到最可能的路径 //回溯从row-1开始,到2为止 stack<int> path_st; path_st.push(min_path_j); path_st.push(min_path_i); for(int t = row - 1; t > 1; t--){ int min_path_k = path[t][min_path_i][min_path_j]; path_st.push(min_path_k); min_path_j = min_path_i; min_path_i = min_path_k; } //释放三维数组 for(int i = 0; i < row; i++){ for(int j = 0; j < N; j++){ delete []delta[i][j]; delete []path[i][j]; } delete []delta[i]; delete []path[i]; } delete []delta; delete []path; //根据标记好的状态序列分词 int pos = 0; int index = -1; while(!path_st.empty()){ index = path_st.top(); path_st.pop(); str_out.insert(str_out.size(), str_in, pos, 3); if(index == 2 || index == 3){ //状态为E或S str_out.append("/"); } pos += 3; } }
(7)文件名:main.cpp。主函数,调用维特比算法进行分词工作,并对分词结果进行比对,统计后输出结果。
#include <cstdlib> #include <vector> #include <iomanip> #include <map> #include <algorithm> #include <sys/time.h> #include <sys/stat.h> #include "util.h" #include "viterbi.cpp" const long MaxCount = 50000; //需要切分的最大句子数量,若该值大于文件中 //实际的句子数量,以实际句子数量为准。 //获取当前时间(ms) long getCurrentTime(){ struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec*1000 + tv.tv_usec/1000; } //获取文件大小 unsigned long getFileSize(string file_path){ unsigned long filesize = -1; struct stat statbuff; if(stat(file_path.c_str(), &statbuff) < 0){ return filesize; }else{ filesize = statbuff.st_size; } return filesize; } /* * 函数功能:计算切分标记的位置 * 函数输入:1.strline_in未进行切分的汉字字符串 2.strline_right进行切分后的汉字字符串 * 函数输出:vecetor,其中存放了strline_in中哪些位置放置了分词标记 * 注意:vector中不包含最后标记的位置,但是包含位置0。 */ vector<int> getPos(string strline_right, string strline_in){ int pos_1 = 0; int pos_2 = -1; int pos_3 = 0; string word = ""; vector<int> vec; int length = strline_right.length(); while(pos_2 < length){ //前面的分词标记 pos_1 = pos_2; //后面的分词标记 pos_2 = strline_right.find('/', pos_1 + 1); if(pos_2 > pos_1){ //将两个分词标记之间的单词取出 word = strline_right.substr(pos_1 + 1, pos_2 - pos_1 - 1); //根据单词去输入序列中查出出现的位置 pos_3 = strline_in.find(word, pos_3); //将位置存入数组 vec.push_back(pos_3); pos_3 = pos_3 + word.size(); }else{ break; } } return vec; } /* * 获取标准切分和程序切分的结果 */ string getString(string word, int pos, vector<int> vec_right){ char ss[1000]; int i = 0; int k = 0; if(vec_right.size() == 0){ return word; } while(vec_right[i] < pos){ i++; } for(int j = 0; j < word.size(); j++){ if(j == vec_right[i] - pos){ if(j != 0){ ss[k] = '/'; ++k; } ++i; } ss[k] = word[j]; ++k; } ss[k] = '\0'; string word_str = ss; return word_str; } /* * 函数功能:获取单个句子切分的结果统计 * 函数输入:1.vec_right 正确的分词标记位置集合 * 2.vec_out 函数切分得到的分词标记位置集合 * 函数输出:返回一个veceor,含有4个元素,分别为: * 切分正确、组合型歧义、未登录词、交集型歧义的数量 * */ vector<int> getCount_2(string strline, vector<int> vec_right, vector<int> vec_out, vector<string> &vec_err){ vector<int> vec(4, 0); //存放计算结果 //建立map map<int, int> map_result; for(int i = 0; i < vec_right.size(); i++){ map_result[vec_right[i]] += 1; } for(int i = 0; i < vec_out.size(); i++){ map_result[vec_out[i]] += 2; } //统计map中的信息 //若value=1,只在vec_right中 //若value=2,只在vec_out中 //若value=3,在vec_right和vec_out中都有 map<int, int>::iterator p_pre, p_cur; int count_value_1 = 0; int count_value_2 = 0; int count_value_3 = 0; p_pre = map_result.begin(); p_cur = map_result.begin(); while(p_cur != map_result.end()){ while(p_cur != map_result.end() && p_cur -> second == 3){ p_pre = p_cur; ++count_value_3; //切分正确的数目 ++p_cur; //迭代器后移 } while(p_cur != map_result.end() && p_cur -> second != 3){ if(p_cur -> second == 1){ ++count_value_1; }else if(p_cur -> second == 2){ ++count_value_2; } ++p_cur; } //确定切分错误的字符串 if(p_cur == map_result.end() && p_cur == (++p_pre)){ continue; } int pos_1 = p_pre -> first; int pos_2 = p_cur -> first; string word = strline.substr(pos_1, pos_2 - pos_1); //切分错误的单词 string word_right = getString(word, pos_1, vec_right); //正确的切分方式 string word_out = getString(word, pos_1, vec_out); //得到的切分方式 string str_err = ""; //不同的错误类型 if(count_value_1 > 0 && count_value_2 == 0){ str_err = " 组合型歧义: " + word + " 正确切分: " + word_right + " 错误切分: " + word_out; vec_err.push_back(str_err); cout << str_err << endl; vec[1] += count_value_1; }else if(count_value_1 == 0 && count_value_2 > 0){ str_err = " 未登录词语: " + word + " 正确切分: " + word_right + " 错误切分: " + word_out; vec_err.push_back(str_err); cout << str_err << endl; vec[2] += count_value_2; }else if(count_value_1 > 0 && count_value_2 > 0){ str_err = " 交集型歧义: " + word + " 正确切分: " + word_right + " 错误切分: " + word_out; vec_err.push_back(str_err); cout << str_err << endl; vec[3] += count_value_2; } //计数器复位 count_value_1 = 0; count_value_2 = 0; } vec[0] += count_value_3; return vec; } /* * 主函数:进行分词并统计分词结果 * */ int main(int argc, char *argv[]){ if(argc < 3){ cout << "Usage: " << argv[0] << " test_file result_file" << endl; exit(-1); } long time_1 = getCurrentTime(); string strline_right; //输入语料:用作标准分词结果 string strline_in; //去掉分词标记的语料(用作分词的输入) string strline_out_1; //隐马尔科夫模型(二阶)分词完毕的语料 string strline_out_2; //隐马尔科夫模型(三阶)分词完毕的语料 ifstream fin(argv[1]); //打开输入文件 if(!fin){ cout << "Unable to open input file !" << argv[1] << endl; exit(-1); } ofstream fout(argv[2]); //确定输出文件 if(!fout){ cout << "Unable to open output file !" << endl; exit(-1); } long count = 0; //句子编号 long count_1 = 0; //隐马尔科夫模型(二阶)切分完全正确的句子总数 long count_2 = 0; //隐马尔科夫模型(三阶)切分完全正确的句子总数 long count_right_all = 0; //准确的切分总数 //二阶 long count_out_1_all = 0; //隐马尔科夫模型切分总数 long count_out_1_right_all = 0; //隐马尔科夫模型切分正确总数 long count_out_1_fail_1_all = 0;//隐马尔科夫模型(组合型歧义) long count_out_1_fail_2_all = 0;//隐马尔科夫模型(未登录词语) long count_out_1_fail_3_all = 0;//隐马尔科夫模型(交集型歧义) //三阶 long count_out_2_all = 0; //隐马尔科夫模型切分总数 long count_out_2_right_all = 0; //隐马尔科夫模型切分正确总数 long count_out_2_fail_1_all = 0;//隐马尔科夫模型(组合型歧义) long count_out_2_fail_2_all = 0;//隐马尔科夫模型(未登录词语) long count_out_2_fail_3_all = 0;//隐马尔科夫模型(交集型歧义) vector<string> vec_err_1; //隐马尔科夫模型(二阶)切分错误的词 vector<string> vec_err_2; //隐马尔科夫模型(三阶)切分错误的词 while(getline(fin, strline_right, '\n') && count < MaxCount){ if(strline_right.length() > 1){ //去掉分词标记 strline_in = strline_right; strline_in = replace_all(strline_in, "/", ""); //隐马尔科夫模型分词 strline_out_1 = strline_right; istringstream strstm(strline_in); string sentence; string result_1; string result_2; string line_out_1; string line_out_2; while(strstm >> sentence){ //二阶切分 result_1 = viterbiTwo(sentence); line_out_1 += result_1; //三阶切分 result_2 = viterbiThree(sentence); line_out_2 += result_2; } strline_out_1 = line_out_1; strline_out_2 = line_out_2; //输出分词结果 count++; cout << "----------------------------------------------" << endl; cout << "句子编号:" << count << endl; cout << endl; cout << "待分词的句子长度: " << strline_in.length() << " 句子:" << endl; cout << strline_in << endl; cout << endl; cout << "标准比对结果长度: " << strline_right.length() << " 句子:" << endl; cout << strline_right << endl; cout << endl; cout << "隐马尔科夫模型(二阶)分词长度: " << strline_out_1.length() << " 句子:" << endl; cout << strline_out_1 << endl; cout << endl; cout << "隐马尔科夫模型(三阶)分词长度: " << strline_out_2.length() << " 句子:" << endl; cout << strline_out_2 << endl; cout << endl; //输出分词结果的数字序列表示 vector<int> vec_right = getPos(strline_right, strline_in); vector<int> vec_out_1 = getPos(strline_out_1, strline_in); vector<int> vec_out_2 = getPos(strline_out_2, strline_in); cout << "标准结果:" << endl; for(int i = 0; i < vec_right.size(); i++){ cout << setw(4) << vec_right[i]; } cout << endl; cout << "隐马尔科夫模型(二阶)分词结果:" << endl; for(int i = 0; i < vec_out_1.size(); i++){ cout << setw(4) << vec_out_1[i]; } cout << endl; cout << "隐马尔科夫模型(三阶)分词结果:" << endl; for(int i = 0; i < vec_out_2.size(); i++){ cout << setw(4) << vec_out_2[i]; } cout << endl; //输出匹配的错误列表 cout << endl; if(vec_right == vec_out_1){ cout << "隐马尔科夫模型(二阶)分词完全正确!" << endl; count_1++; }else{ cout << "隐马尔科夫模型(二阶)分词错误列表:" << endl; } vector<int> vec_count_1 = getCount_2(strline_in, vec_right, vec_out_1, vec_err_1); cout << endl; if(vec_right == vec_out_2){ cout << "隐马尔科夫模型(三阶)分词完全正确!" << endl; count_2++; }else{ cout << "隐马尔科夫模型(三阶)分词错误列表:" << endl; } vector<int> vec_count_2 = getCount_2(strline_in, vec_right, vec_out_2, vec_err_2); //准确的切分数量 int count_right = vec_right.size(); //切分得到的数量 int count_out_1 = vec_out_1.size(); int count_out_2 = vec_out_2.size(); //切分正确的数量 int count_out_1_right = vec_count_1[0]; cout << "切分得到:" << count_out_1 << endl; cout << "切分正确:" << count_out_1_right << endl; cout << "隐马尔科夫模型(二阶):" << endl; cout << " 组合型歧义:" << vec_count_1[1] << endl; cout << " 未登录词语:" << vec_count_1[2] << endl; cout << " 交集型歧义:" << vec_count_1[3] << endl; int count_out_2_right = vec_count_2[0]; cout << "切分得到:" << count_out_2 << endl; cout << "切分正确:" << count_out_2_right << endl; cout << "隐马尔科夫模型(三阶):" << endl; cout << " 组合型歧义:" << vec_count_2[1] << endl; cout << " 未登录词语:" << vec_count_2[2] << endl; cout << " 交集型歧义:" << vec_count_2[3] << endl; count_right_all += count_right; count_out_1_all += count_out_1; count_out_1_right_all += count_out_1_right; count_out_1_fail_1_all += vec_count_1[1]; count_out_1_fail_2_all += vec_count_1[2]; count_out_1_fail_3_all += vec_count_1[3]; count_out_2_all += count_out_2; count_out_2_right_all += count_out_2_right; count_out_2_fail_1_all += vec_count_2[1]; count_out_2_fail_2_all += vec_count_2[2]; count_out_2_fail_3_all += vec_count_2[3]; } } long time_2 = getCurrentTime(); unsigned long file_size = getFileSize("test.txt"); //打印错误的切分内容 cout << endl; cout << "---------------------------------" << endl; cout << "错误样例(已排序):" << endl; //对错误切分内容进行排序并掉重复的 sort(vec_err_1.begin(), vec_err_1.end()); sort(vec_err_2.begin(), vec_err_2.end()); vector<string>::iterator end_unique_1 = unique(vec_err_1.begin(), vec_err_1.end()); vector<string>::iterator end_unique_2 = unique(vec_err_2.begin(), vec_err_2.end()); int num_1 = end_unique_1 - vec_err_1.begin(); int num_2 = end_unique_2 - vec_err_2.begin(); cout << "----------------------------------" << endl; cout << "隐马尔科夫模型(二阶)切分错误数量:" << num_1 << endl; for(int i = 0; i < num_1; i++){ cout << vec_err_1[i] << endl; } cout << endl; cout << "----------------------------------" << endl; cout << "隐马尔科夫模型(三阶)切分错误数量:" << num_2 << endl; for(int i = 0; i < num_2; i++){ cout << vec_err_2[i] << endl; } cout << endl; //计算准确率和召回率 double kk_1 = (double)count_out_1_right_all / count_out_1_all; //隐马尔科夫模型(二阶)准确率 double kk_2 = (double)count_out_1_right_all / count_right_all; //隐马尔科夫模型(二阶)召回率 double kk_3 = (double)count_out_2_right_all / count_out_2_all; //隐马尔科夫模型(三阶)准确率 double kk_4 = (double)count_out_2_right_all / count_right_all; //隐马尔科夫模型(三阶)召回率 //集中输出结果 cout << endl; cout << "---------------------------------" << endl; cout << "分词消耗时间:" << time_2 - time_1 << "ms" << endl; cout << "测试文件大小:" << file_size/1024 << " KB" << endl; cout << "分词速度为: " << (double)file_size*1000/((time_2 - time_1)*1024) << " KB/s" << endl; cout << endl; cout << "句子总数:" << count << endl; cout << "隐马尔科夫模型(二阶)切分完全正确的句子数目: " << count_1 << "\t ( " << (double)count_1*100/count << " % )" << endl; cout << "隐马尔科夫模型(三阶)切分完全正确的句子数目: " << count_2 << "\t ( " << (double)count_2*100/count << " % )" << endl; cout << endl; cout << "准确的切分总数:" << count_right_all << endl; //准确的切分总数 cout << "隐马尔科夫模型(二阶)切分总数:" << count_out_1_all << endl; //隐马尔科夫模型切分总数 cout << "隐马尔科夫模型(三阶)切分总数:" << count_out_2_all << endl; //隐马尔科夫模型切分总数 cout << "隐马尔科夫模型(二阶)切分正确总数:" << count_out_1_right_all << endl; //隐马尔科夫模型切分正确总数 cout << "隐马尔科夫模型(三阶)切分正确总数:" << count_out_2_right_all << endl; //隐马尔科夫模型切分正确总数 cout << endl; cout << "隐马尔科夫模型(二阶):" << endl; long count_out_1_fail_all = count_out_1_fail_1_all + count_out_1_fail_2_all + count_out_1_fail_3_all; cout << " 组合型歧义:" << count_out_1_fail_1_all << "\t ( " << (double)count_out_1_fail_1_all*100/count_out_1_fail_all << " % )" << endl; cout << " 未登录词语:" << count_out_1_fail_2_all << "\t ( " << (double)count_out_1_fail_2_all*100/count_out_1_fail_all << " % )" << endl; cout << " 交集型歧义:" << count_out_1_fail_3_all << "\t ( " << (double)count_out_1_fail_3_all*100/count_out_1_fail_all << " % )" << endl; cout << endl; cout << "隐马尔科夫模型(三阶):" << endl; long count_out_2_fail_all = count_out_2_fail_1_all + count_out_2_fail_2_all + count_out_2_fail_3_all; cout << " 组合型歧义:" << count_out_2_fail_1_all << "\t ( " << (double)count_out_2_fail_1_all*100/count_out_2_fail_all << " % )" << endl; cout << " 未登录词语:" << count_out_2_fail_2_all << "\t ( " << (double)count_out_2_fail_2_all*100/count_out_2_fail_all << " % )" << endl; cout << " 交集型歧义:" << count_out_2_fail_3_all << "\t ( " << (double)count_out_2_fail_3_all*100/count_out_2_fail_all << " % )" << endl; cout << endl; cout << "统计结果:" << endl; cout << "隐马尔科夫模型(二阶) 准确率:" << kk_1*100 << "% \t召回率:" << kk_2*100 << "%" << endl; cout << "隐马尔科夫模型(三阶) 准确率:" << kk_3*100 << "% \t召回率:" << kk_4*100 << "%" << endl; return 0; }
二阶和三阶隐马尔柯夫过程(HMM)进行中文分词的效果对比