本博文我们通过三个程序比较统计词频问题的时间复杂度问题;
问题描述;
1)、找一篇文章,将所有单词输入至程序;(The Bible Holy为例)
2)、统计出每个单词的数量,即词频问题;
3)、增加停用词功能;(遇到此类词,直接略过)(网上搜)
4)、分别统计出读取文件并计算词频时间、排序所用时间;
5)、用 类 实现各函数(处统计时间的函数除外)。
vector、map、hash_map 都要处理字符串的 去除标点符号、将大写字母转换成小写字母、不对数字进行统计 问题。因此,我们可以将处理这些函数进行封装。
StringUtil.h
1 #ifndef STRING_UTIL_H_ 2 #define STRING_UTIL_H_ 3 4 #include <string> 5 6 namespace stringutil 7 { 8 void erasePunct(std::string &s); 9 void stringToLower(std::string &s); 10 bool isAllDigit(const std::string &s); 11 } 12 13 #endif /* STRING_UTIL_H_ */
StringUtil.cpp
1 #include "StringUtil.h" 2 #include <ctype.h> 3 4 using namespace std; 5 6 namespace stringutil 7 { 8 9 void erasePunct(string &s) 10 { 11 string::iterator it = s.begin(); 12 while(it != s.end()) 13 { 14 if(ispunct(*it)) 15 it = s.erase(it); 16 else 17 ++it; 18 } 19 } 20 void stringToLower(string &s) 21 { 22 for(string::iterator it = s.begin(); 23 it != s.end(); 24 ++it) 25 { 26 if(isupper(*it)) 27 *it = tolower(*it); 28 } 29 } 30 31 bool isAllDigit(const std::string &s) 32 { 33 for(string::const_iterator it = s.begin(); 34 it != s.end(); 35 ++it) 36 { 37 if(!isdigit(*it)) 38 return false; 39 } 40 41 return true; 42 } 43 44 }
一、采用vector、pair;
采用的数据结构:
1)、vector<string> stopList用于存放停用词;
2)、vector<pair<string, int> > words 用于存放 除禁用词之外的其他The Bible Holy 的单词;
思路:
1)、用ReadStopFile函数 读取停用词到 stopList中;
2)、用ReadWordFile函数读取The Bible Holy到 words中;在我们读取的过程中首先要 去除标点符号、将word转化为小写、 跳过数字;
3)、然后判断该 单词 是否在 stoplist 中,若不在,最后添加单词 至 words中。
4)、对词频排序,这里我们采用algorithm库中的sort函数;
5)、对words vector进行遍历,打印之;
这里注意一点:除了在 main.cpp中显式调用的函数,我们将其他函数声明为 类 的私有函数。
函数声明如下:
1 #ifndef WORDFREQUENCY_H_ 2 #define WORDFREQUENCY_H_ 3 4 #include <vector> 5 #include <string> 6 #include <utility> 7 8 class WordFrequency 9 { 10 public: 11 WordFrequency(const std::string &filename, const std::string &stopFile);//初始化 12 void ReadStopFile(); 13 void ReadWordFile(); 14 void sortWordByFrequency(); 15 void printWordFrequency()const; 16 private: 17 void addWordToDict(const std::string &word); 18 bool isStopWord(const std::string &word)const; 19 20 typedef std::vector<std::pair<std::string, int> >::iterator Wordit;//为了方便 21 typedef std::vector<std::pair<std::string, int> >::const_iterator Wordkit; 22 23 std::string filename_; 24 std::string stopFile_; 25 26 std::vector<std::string> stopList_; 27 std::vector<std::pair<std::string, int> > words_; 28 29 }; 30 31 #endif
函数实现如下:
1 //WordFrequency.cpp 2 #include "WordFrequency.h" 3 #include "StringUtil.h" 4 #include <algorithm> 5 #include <stdexcept> 6 #include <stdlib.h> 7 #include <fstream> 8 9 using namespace std; 10 using namespace stringutil; 11 12 WordFrequency::WordFrequency(const std::string &filename, const std::string &stopFile) 13 :filename_(filename),stopFile_(stopFile) 14 { } 15 16 void WordFrequency::ReadStopFile() 17 { 18 ifstream in(stopFile_.c_str()); 19 if( !in ) 20 throw runtime_error("open file failure"); 21 string word; 22 while(in >> word) 23 { 24 stopList_.push_back(word); 25 } 26 in.close(); 27 } 28 29 void WordFrequency::ReadWordFile() 30 { 31 ifstream infile(filename_.c_str()); 32 if( !infile ) 33 throw runtime_error("open file failure!"); 34 string word; 35 while(infile >> word) 36 { 37 erasePunct(word); 38 if( isAllDigit(word)) 39 continue; 40 stringToLower(word); 41 if( !isStopWord(word)) 42 addWordToDict(word); 43 } 44 infile.close(); 45 } 46 47 bool WordFrequency::isStopWord(const string &word)const 48 { 49 vector<string>::const_iterator it = stopList_.begin(); 50 while( it != stopList_.end()) 51 { 52 if( *it == word) 53 break; 54 it ++; 55 } 56 return (it != stopList_.end()); 57 } 58 59 void WordFrequency::addWordToDict(const string &word) 60 { 61 Wordit it = words_.begin(); 62 while( it != words_.end()) 63 { 64 if( it->first == word) 65 { 66 ++ it->second ; 67 break; 68 } 69 it ++; 70 } 71 if(it == words_.end()) 72 words_.push_back(make_pair(word, 1)) ; 73 } 74 75 bool cmp(const pair<string, int> &a, const pair<string, int>&b) 76 { 77 return a.second > b.second; 78 } 79 80 void WordFrequency::sortWordByFrequency() 81 { 82 sort(words_.begin(), words_.end(), cmp); 83 } 84 85 void WordFrequency::printWordFrequency()const 86 { 87 Wordkit it = words_.begin(); 88 while(it != words_.end()) 89 { 90 printf("words: %s, frequency: %d\n",it->first.c_str(),it->second); 91 it ++; 92 } 93 }
main.cpp
1 //main.cpp 2 #ifndef _STDC_FORMAT_MACROS 3 #define _STDC_FORMAT_MACROS 4 #endif /* _STDC_FORMAT_MACROS */ 5 #include <inttypes.h> 6 7 #include "WordFrequency.h" 8 #include <iostream> 9 #include <vector> 10 #include <stdexcept> 11 #include <sys/time.h> 12 #include <stdio.h> 13 #include <string.h> 14 using namespace std; 15 16 int64_t gettime(); 17 18 int main(int argc, const char *argv[]) 19 { 20 if(argc < 3) 21 throw runtime_error("exe,filename, stopfile"); 22 int64_t starttime =gettime(); 23 WordFrequency wf(argv[1], argv[2]); 24 wf.ReadStopFile(); 25 wf.ReadWordFile(); 26 27 int64_t readtime = gettime(); 28 wf.sortWordByFrequency(); 29 30 int64_t sorttime = gettime(); 31 32 printf("readfile time: %lld ms\n",(readtime-starttime)/1000); 33 printf("sortfile time: %lld ms\n",(sorttime-readtime)/1000); 34 35 wf.printWordFrequency(); 36 return 0; 37 } 38 39 int64_t gettime() 40 { 41 struct timeval tv; 42 ::memset(&tv, 0, sizeof tv); 43 if(::gettimeofday(&tv, NULL) == -1) 44 { 45 throw runtime_error("gettimeofday"); 46 } 47 int64_t t = tv.tv_usec; 48 t += tv.tv_sec * 1000 * 1000; 49 return t; 50 }
时间: 2024-10-13 02:19:45