// File: WordNode.h
#ifndef __TOOLS_WORDNODE_H_INCLUDE__
#define __TOOLS_WORDNODE_H_INCLUDE__
#include <map>
// One node of the sensitive-word tree.
//
// A node stores the text of one character (m_cWord), an end-of-word marker
// (m_nEndTag) and its children keyed by the child's character text.
//
// Ownership: a node owns every CWordNode* stored in m_mapWordNodes and
// deletes them in its destructor and in Reset().
// NOTE(review): the implicitly generated copy operations copy the raw child
// pointers, so copying a node that has children would lead to a double
// delete. Do not copy nodes.
class CWordNode
{
    typedef std::map<std::string, CWordNode*> umap;

public:
    // Creates a childless node for `word` with the end marker cleared.
    CWordNode(const std::string& word) : m_cWord(word), m_nEndTag(0) {}

    // Releases the whole subtree below this node.
    ~CWordNode()
    {
        DeleteChildren();
        m_nEndTag = 0;
    }

    // Re-initialises the node: stores `word`, clears the end marker and
    // destroys all children.
    void Reset(const std::string& word)
    {
        // Copy the word first: `word` may refer to a string owned by one of
        // the children that are about to be deleted.
        m_cWord = word;
        m_nEndTag = 0;
        // Children are owned by this node; clearing the map without deleting
        // them would leak the entire subtree.
        DeleteChildren();
    }

public:
    std::string m_cWord;  // text of the character this node represents
    int m_nEndTag;        // 1 when a complete word ends at this node, otherwise 0
    umap m_mapWordNodes;  // owned child nodes, keyed by their character text

private:
    // Deletes every child node and empties the child map.
    void DeleteChildren()
    {
        for (umap::iterator it = m_mapWordNodes.begin(); it != m_mapWordNodes.end(); ++it)
        {
            delete it->second;
        }
        m_mapWordNodes.clear();
    }
};
#endif // __TOOLS_WORDNODE_H_INCLUDE__
// File: WordsFilter.h
#ifndef __TOOLS_WORDSFILTER_H_INCLUDE__
#define __TOOLS_WORDSFILTER_H_INCLUDE__
#include <list>
#include "WordNode.h"
// Sensitive-word filter built on a tree of CWordNode objects.
class CWordsFilter
{
    typedef std::map<std::string, CWordNode*> umap;

public:
    CWordsFilter();
    ~CWordsFilter();

    // Returns the shared instance.
    static CWordsFilter& GetInstance();

    // Initialises the sensitive-word set from one comma-separated string.
    void InitSensitiveWords(std::string strWord);

    // Initialises the sensitive-word set from a list of words.
    void InitSensitiveWords(std::list<std::string> lsAllSensitiveWords);

    // Filters the sensitive words out of strContent and returns the result.
    std::string FilterSensitiveWords(const std::string& strContent);

private:
    // Builds the sensitive-word tree from m_lsAllSensitiveWords.
    void BuildWordTree();

    // Inserts the characters of strContent below pNode.
    void InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex);

    // Looks up the child of pNode keyed by `word`; NULL when there is none.
    CWordNode* FindNode(CWordNode* pNode, const std::string& word);

    // Returns the byte length of the first character in str.
    int GetFirstBytes(const std::string& str);

private:
    std::list<std::string> m_lsAllSensitiveWords; // list of all sensitive words
    CWordNode* m_rootWordNode;                    // root of the word tree
    bool m_bIsInit;                               // true once InitSensitiveWords has run
};
#endif // __TOOLS_WORDSFILTER_H_INCLUDE__
// File: WordsFilter.cpp
#include <cmath>
#include <fstream>
#include <iostream>
#include <list>
#include <sstream>
#include <string>
#include <vector>
#include "WordsFilter.h"
// File-scope scratch variable for the byte length of a character (see
// GetFirstBytes); starts at 2.
// NOTE(review): this is mutable global state with external linkage. Any code
// that writes to it is neither reentrant nor safe to run concurrently;
// prefer a local variable in new code.
int nStep = 2;
// Sequence of tokens produced by StrSplit.
typedef std::vector<std::string> Tokens;

// Splits `src` into tokens.
//
// Every character contained in `sep` acts as a delimiter. Consecutive,
// leading and trailing delimiters never produce empty tokens. With an empty
// `sep`, a non-empty `src` is returned as a single token.
Tokens StrSplit(const std::string &src, const std::string &sep)
{
    Tokens pieces;

    // Jump to the first character that is not a delimiter.
    std::string::size_type start = src.find_first_not_of(sep);
    while (start != std::string::npos)
    {
        const std::string::size_type stop = src.find_first_of(sep, start);
        if (stop == std::string::npos)
        {
            // No further delimiter: the rest of the string is the last token.
            pieces.push_back(src.substr(start));
            break;
        }
        pieces.push_back(src.substr(start, stop - start));
        start = src.find_first_not_of(sep, stop);
    }

    return pieces;
}
// Returns the byte length of the first UTF-8 encoded character of `str`.
//
// Only the first byte is inspected:
//   0xxxxxxx -> 1      110xxxxx -> 2      1110xxxx -> 3
//   11110xxx -> 4      111110xx -> 5      1111110x -> 6
// A byte that cannot start a character (a 10xxxxxx continuation byte, 0xFE
// or 0xFF) is reported as a 1-byte unit so that callers always advance and
// re-synchronise on the next byte. Scanning ahead to a later lead byte, as
// this function used to do, returned the length of a different character.
//
// The result is never larger than str.size() for a non-empty string, so a
// sequence truncated at the end of the input cannot make callers index past
// the end (substr with an out-of-range position throws). For an empty string
// the function returns 1, as it always has.
int CWordsFilter::GetFirstBytes(const std::string& str)
{
    if (str.empty())
    {
        return 1;
    }

    const unsigned char lead = static_cast<unsigned char>(str[0]);

    int nLength = 1; // ASCII, stray continuation byte, 0xFE and 0xFF
    if ((lead >> 5) == 0x06)        // 110x xxxx
    {
        nLength = 2;
    }
    else if ((lead >> 4) == 0x0E)   // 1110 xxxx
    {
        nLength = 3;
    }
    else if ((lead >> 3) == 0x1E)   // 1111 0xxx
    {
        nLength = 4;
    }
    else if ((lead >> 2) == 0x3E)   // 1111 10xx
    {
        nLength = 5;
    }
    else if ((lead >> 1) == 0x7E)   // 1111 110x
    {
        nLength = 6;
    }

    // Clamp to the bytes that are really there (truncated sequence).
    if (static_cast<std::string::size_type>(nLength) > str.size())
    {
        nLength = static_cast<int>(str.size());
    }
    return nLength;
}
// Creates an uninitialised filter: no word tree, no words, not ready to
// filter. Initialisers are listed in member declaration order; the word list
// is default-constructed empty.
CWordsFilter::CWordsFilter()
    : m_rootWordNode(NULL)
    , m_bIsInit(false)
{
}
// Destroys the word tree (the root's destructor deletes its children) and
// empties the word list.
CWordsFilter::~CWordsFilter()
{
    delete m_rootWordNode;
    m_rootWordNode = NULL;
    m_lsAllSensitiveWords.clear();
}
void CWordsFilter::InitSensitiveWords(std::string strWord)
{
Tokens token = StrSplit(strWord, ",");
std::list<std::string> lsAllSensitiveWords;
Tokens::iterator Ite = token.begin();
while (Ite != token.end())
{
lsAllSensitiveWords.push_back(*Ite);
++Ite;
}
InitSensitiveWords(lsAllSensitiveWords);
}
void CWordsFilter::InitSensitiveWords(std::list<std::string> lsAllSensitiveWords)
{
std::cout << "start init sensitive words" << std::endl;
this->m_lsAllSensitiveWords.clear();
this->m_lsAllSensitiveWords = lsAllSensitiveWords;
BuildWordTree();
this->m_bIsInit = true;
}
// Returns a copy of `strContent` in which every sensitive word is masked.
//
// The content is scanned character by character (UTF-8 aware). From each
// start position the word tree is followed as far as the text allows; a
// match ends at the first node whose m_nEndTag is 1. A matched word is
// replaced by one '*' per character (not per byte). When no word starts at
// the current position, that single character is copied unchanged and the
// scan restarts at the next character.
//
// Returns "" (and logs to stdout) when the filter has not been initialised.
//
// Why this is written as "try every start position" with local state only:
//  - A failed partial match must rewind to the character after its start.
//    Doing that with shared cursor/length variables previously led to a
//    substr() call with a position past the end of a temporary, which throws
//    std::out_of_range on valid input (e.g. a multi-byte partial match
//    followed by one final ASCII character).
//  - A partial match that reaches the end of the content is handled like any
//    other failed match, so a shorter word inside that tail is still found.
//  - Character lengths are clamped to the bytes remaining, so a truncated
//    multi-byte sequence at the end of the content is copied, not indexed
//    past.
std::string CWordsFilter::FilterSensitiveWords(const std::string& strContent)
{
    if (!this->m_bIsInit || NULL == this->m_rootWordNode)
    {
        std::cout << "the sensitive words is not init" << std::endl;
        return "";
    }

    typedef std::string::size_type size_type;

    // GetFirstBytes never reports more than 6 bytes, so a 6-byte window is
    // enough to classify a character without copying the rest of the content.
    const size_type kMaxCharBytes = 6;
    const size_type nTotal = strContent.size();

    std::string strBuffer;
    strBuffer.reserve(nTotal); // output is never longer than the input

    size_type nStart = 0;
    while (nStart < nTotal)
    {
        CWordNode* pNode = this->m_rootWordNode;
        size_type nCursor = nStart;   // next byte to examine
        size_type nFirstLen = 0;      // byte length of the character at nStart
        size_type nChars = 0;         // characters matched so far
        bool bMatched = false;

        while (nCursor < nTotal)
        {
            const int nRaw = GetFirstBytes(strContent.substr(nCursor, kMaxCharBytes));
            size_type nLen = (nRaw < 1) ? 1 : static_cast<size_type>(nRaw);
            if (nLen > nTotal - nCursor)
            {
                nLen = nTotal - nCursor; // truncated trailing sequence
            }
            if (nCursor == nStart)
            {
                nFirstLen = nLen;
            }

            pNode = FindNode(pNode, strContent.substr(nCursor, nLen));
            if (NULL == pNode)
            {
                break; // the text leaves the tree: no word starts at nStart
            }

            ++nChars;
            nCursor += nLen;
            if (pNode->m_nEndTag == 1)
            {
                bMatched = true;
                break;
            }
        }

        if (bMatched)
        {
            strBuffer.append(nChars, '*');
            nStart = nCursor;
        }
        else
        {
            // Emit exactly one character and retry from the next one.
            strBuffer.append(strContent, nStart, nFirstLen);
            nStart += nFirstLen;
        }
    }

    return strBuffer;
}
void CWordsFilter::BuildWordTree()
{
if ( this->m_rootWordNode == NULL )
{
this->m_rootWordNode = new CWordNode("R");
if (NULL == this->m_rootWordNode)
{
return;
}
}
this->m_rootWordNode->Reset("R");
std::list<std::string>::const_iterator cIte = this->m_lsAllSensitiveWords.begin();
while (cIte != this->m_lsAllSensitiveWords.end())
{
std::string strTmp = (*cIte);
if (strTmp.size() > 0)
{
InsertNode(this->m_rootWordNode, strTmp, 0);
}
++cIte;
}
}
// Inserts the word `strContent` below `pNode`, one tree level per character.
//
// pNode      - node to insert under; nothing happens when it is NULL.
// strContent - the word, split into UTF-8 characters with GetFirstBytes.
// nIndex     - byte offset in strContent at which insertion starts (callers
//              in this file pass 0). Negative values are ignored.
//
// Existing child nodes are reused; missing ones are created and become owned
// by their parent. The node of the last character gets m_nEndTag = 1.
//
// Each child is keyed by the text of its character. Previously the extracted
// character was discarded, so every node was stored under the empty key and
// no lookup with a real character could ever succeed.
void CWordsFilter::InsertNode(CWordNode* pNode, const std::string& strContent, int nIndex)
{
    if (NULL == pNode || nIndex < 0)
    {
        return;
    }

    typedef std::string::size_type size_type;

    // GetFirstBytes never reports more than 6 bytes.
    const size_type kMaxCharBytes = 6;
    const size_type nTotal = strContent.size();

    CWordNode* pCurrent = pNode;
    size_type nPos = static_cast<size_type>(nIndex);
    while (nPos < nTotal)
    {
        const int nRaw = GetFirstBytes(strContent.substr(nPos, kMaxCharBytes));
        size_type nLen = (nRaw < 1) ? 1 : static_cast<size_type>(nRaw);
        if (nLen > nTotal - nPos)
        {
            nLen = nTotal - nPos; // truncated trailing sequence
        }

        const std::string strChar = strContent.substr(nPos, nLen);
        CWordNode* pChild = FindNode(pCurrent, strChar);
        if (NULL == pChild)
        {
            pChild = new CWordNode(strChar);
            pCurrent->m_mapWordNodes[strChar] = pChild;
        }

        pCurrent = pChild;
        nPos += nLen;
    }

    // Mark the end of the word, unless nothing was inserted at all.
    if (pCurrent != pNode)
    {
        pCurrent->m_nEndTag = 1;
    }
}
// Returns the child of `pNode` stored under the key `word`, or NULL when
// `pNode` is NULL or has no such child.
CWordNode* CWordsFilter::FindNode(CWordNode* pNode, const std::string& word)
{
    if (pNode != NULL)
    {
        const umap::iterator hit = pNode->m_mapWordNodes.find(word);
        if (hit != pNode->m_mapWordNodes.end())
        {
            return hit->second;
        }
    }
    return NULL;
}
// Returns the shared filter instance, a function-local static that is
// constructed the first time this function is called.
CWordsFilter& CWordsFilter::GetInstance()
{
    static CWordsFilter s_sharedFilter;
    return s_sharedFilter;
}