统计词频,中文字符编码格式:GB2312。
// Word-frequency statistics over Chinese text stored as two-byte characters,
// followed by a toy poem generator that fills a template with frequent
// one-, two- and three-character sequences.
//
// NOTE(review): all byte-level logic below assumes a two-byte encoding whose
// lead bytes lie in 0xB0..0xF7 (the header of this file says GB2312). That
// applies to the input file AND to the string literals in this source, so the
// program presumably has to be built with a GB2312/GBK execution character
// set -- confirm the build settings.

#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <functional>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

// clock_t is supplied by <ctime>; it is deliberately not re-typedef'd here,
// because a conflicting redeclaration is ill-formed on platforms where the
// library type is not `long`.
typedef pair<string, int> Pair_StrInt;
typedef string::iterator StrItr;
typedef vector<Pair_StrInt>::iterator Vec_Pair_StrInt_Itr;

// Both macros expand to TWO statements (print, then exit), so they must only
// be used inside a braced block.
#define ERROR0 cerr << "Open error !!!" << endl; exit(1);
#define ERROR1 cerr << "无法识别 !!!" << endl; exit(1);

// Number of entries loaded from each frequency file, and the modulus used
// when picking a random entry.
#define Lim 100

string infile = "Ci.txt";                    // input corpus
string outfile1 = "out1.txt";                // one-character frequencies
string outfile2 = "out2.txt";                // two-character frequencies
string outfile3 = "out3.txt";                // three-character frequencies
string project_time = "project_time.txt";    // elapsed-time report
string One_strArr[Lim];                      // first Lim entries of outfile1
string Two_strArr[Lim];                      // first Lim entries of outfile2
string Three_strArr[Lim];                    // first Lim entries of outfile3
ifstream fin;                                // shared input stream (InitText)
ofstream fout;                               // shared output stream (myOutput)
string Text;                                 // filtered corpus, two bytes per character

struct myNode {
    string Chant;  // tune (cipai) name
    string Rules;  // format template interpreted by Poetry()
};

// Orders pairs by descending count. Pairs with equal counts compare as
// equivalent, so their relative order after std::sort is unspecified.
bool Pair_StrInt_Cmp(const Pair_StrInt& p0, const Pair_StrInt& p1) { return (p0.second > p1.second); }

// Scratch table used by the counting passes; each pass leaves it empty.
unordered_map<string, int> StrInt_Hash;

// Reads the whole file `_infile` and appends to the global `Text` every
// two-byte sequence whose first byte is in 0xB0..0xF7; all other bytes are
// skipped one at a time. Prints an error and exits if the file cannot be
// opened. `Text` is appended to, not reset.
void InitText(string _infile) {
    fin.open(_infile);
    if (!fin) { ERROR0; }

    // Slurp the complete file into memory through the stream buffer.
    std::ostringstream buffer;
    buffer << fin.rdbuf();
    const string raw = buffer.str();

    const size_t len = raw.size();
    for (size_t i = 0; i < len;) {
        const unsigned char lead = static_cast<unsigned char>(raw[i]);
        if (lead >= 0xB0 && lead <= 0xF7) {
            // A lead byte at the very end has no trail byte. Drop it rather
            // than reading past the buffer, which also keeps Text.size() even.
            if (i + 1 >= len) { break; }
            Text.append(raw, i, 2);
            i += 2;
        } else {
            ++i;
        }
    }

    fin.close();
    fin.clear();
}

// Writes each (string, count) pair to the file `out` as "string<TAB>count",
// one pair per line, in the order given. Exits if the file cannot be opened.
void myOutput(const vector<Pair_StrInt> &StrInt_Vec, string out) {
    fout.open(out);
    if (!fout) { ERROR0; }

    for (const Pair_StrInt& entry : StrInt_Vec) {
        // '\n' instead of endl: same bytes, without a flush per line.
        fout << entry.first << "\t" << entry.second << '\n';
    }

    fout.close();
    fout.clear();
}

// Shared implementation of the three counting passes: slides a window of
// `chars` two-byte characters over `Text` one character at a time, counts
// each distinct window in StrInt_Hash, sorts by descending count and writes
// the result to `out`. Windows that would run past the end are skipped.
static void countNGrams(size_t chars, const string& out) {
    const size_t width = 2 * chars;
    const size_t len = Text.size();
    for (size_t i = 0; i + width <= len; i += 2) {
        StrInt_Hash[Text.substr(i, width)] += 1;
    }

    vector<Pair_StrInt> ranked(StrInt_Hash.begin(), StrInt_Hash.end());
    StrInt_Hash.clear();
    std::sort(ranked.begin(), ranked.end(), Pair_StrInt_Cmp);

    myOutput(ranked, out);
}

// Frequency of single characters, written to `out1`.
void getOneWord(string out1) {
    countNGrams(1, out1);
}

// Frequency of two-character sequences, written to `out2`.
void getTwoWord(string out2) {
    countNGrams(2, out2);
}

// Frequency of three-character sequences, written to `out3`.
void getThreeWord(string out3) {
    countNGrams(3, out3);
}

// Expands the template `_strTmp` to standard output:
//   '1' / '2' / '3'  -> a random entry of One_/Two_/Three_strArr
//   '0'              -> newline
//   '-'              -> a space
//   anything else    -> copied through unchanged
// A byte with the high bit set is copied together with the byte after it
// (one two-byte character); a plain ASCII byte is copied on its own so that
// ASCII punctuation does not swallow the placeholder that follows it.
void Poetry(string _strTmp) {
    const size_t len = _strTmp.size();
    srand(static_cast<unsigned>(time(nullptr)));
    for (size_t i = 0; i < len; ++i) {
        switch (_strTmp[i]) {
        case '2':
            cout << Two_strArr[rand() % Lim];
            break;
        case '1':
            cout << One_strArr[rand() % Lim];
            break;
        case '3':
            cout << Three_strArr[rand() % Lim];
            break;
        case '0':
            cout << '\n';
            break;
        case '-':
            cout << " ";
            break;
        default: {
            const bool highBit = static_cast<unsigned char>(_strTmp[i]) >= 0x80;
            if (highBit && i + 1 < len) {
                cout << _strTmp.substr(i, 2);
                ++i;  // skip the trail byte just written
            } else {
                cout << _strTmp[i];
            }
            break;
        }
        }
    }
    cout << endl;
}

// Fills dest[0..Lim) with the first `bytes` bytes of each of the first Lim
// lines of `in`. Once the stream runs out of lines the remaining slots are
// set to empty strings.
static void loadTopWords(ifstream& in, string* dest, size_t bytes) {
    string line;
    for (int i = 0; i < Lim; ++i) {
        if (!getline(in, line)) { line.clear(); }
        dest[i] = line.substr(0, bytes);
    }
}

// Loads the leading entries of the three frequency files into the global
// word arrays, then prints one poem built from a fixed template.
// Exits if any of the three files cannot be opened.
void makePoetry(string out1, string out2, string out3) {
    ifstream fin1, fin2, fin3;
    fin1.open(out1);
    if (!fin1) { ERROR0; }
    fin2.open(out2);
    if (!fin2) { ERROR0; }
    fin3.open(out3);
    if (!fin3) { ERROR0; }

    // Each line is "<word>\t<count>"; the word occupies 2, 4 or 6 bytes.
    loadTopWords(fin1, One_strArr, 2);
    loadTopWords(fin2, Two_strArr, 4);
    loadTopWords(fin3, Three_strArr, 6);

    myNode node0;
    node0.Chant = "念奴娇";
    node0.Rules = "·220-22,12,222。22,21:222。22,22,23。22,222。0-222,23,22。22,3222。22,23,22。22,222。0";

    const string strTmp0 = "---" + node0.Chant + node0.Rules;
    Poetry(strTmp0);
    // NOTE(review): "pause" is a Windows shell command; on other platforms
    // this call presumably just fails -- confirm the target platform.
    system("pause");
}

// Runs the whole pipeline: load the corpus, run the three counting passes,
// write their combined CPU time to `project_time`, then print a poem.
void Solve() {
    InitText(infile);

    // Local stream, named so it does not shadow the global `fout`.
    ofstream timeOut;
    timeOut.open(project_time);

    // Only the three counting passes are timed; loading the corpus is not.
    const clock_t myStart = clock();
    getOneWord(outfile1);
    getTwoWord(outfile2);
    getThreeWord(outfile3);
    const clock_t myFinish = clock();

    const double totaltime = static_cast<double>(myFinish - myStart) / CLOCKS_PER_SEC;

    timeOut << "运行时间为: " << totaltime << " 秒。" << endl;
    timeOut.close();
    timeOut.clear();

    makePoetry(outfile1, outfile2, outfile3);
}

int main() {
    Solve();
    return 0;
}
时间: 2024-12-10 04:05:45