/**
* KMP algorithm is a famous way to find a substring from a text. To understand its‘ capacity, we should acquaint onself with the normal algorithm.
*/
/**
* simple algorithm
*
* workflow:
* (say, @ct means for currently position of search text)
*
* step 1: match text from index @ct with pattern.
* step 2: if success, step to next character. or,
*
* The most straightforward algorithm is to look for a character match at successive values of index @ct, the position in the string being searched, i.e. S[ct]. If the index
* if the index m reaches the end of the string, then there is no match. In which case the search is said to "fail". At each position m, the algorithm first check whether
* S[ct] and P[0] is equal. If success, check the rest of characters. if fail, check next position, ct+1;
*
* put all of detail into this code.
*/
int work( char *s, char *pat) { int ct = 0; int mi = 0; while( s[ct]!='\0' ) { while( s[ct + mi]==pat[mi]) { mi++; if( pat[mi]=='\0') return ct; } ct++; } return -1; }
/**
* It‘s simple and have a good performance usually. But thing will become terrible when we deal with some special string. Just like:
*
* S[m] = 00000000....00000001
* P[n] = 000001
* wow, the result is really awful. In conclusion, if we take a comparison operation as the basic element, we could get the time complexity.At ideal case,
* T(m,n) = O( m+n)
*
* that is reasonable.But at worst case,
* T(m,n) = O( m*n)
*
* Now , It‘s time to KMP show its excellent performance.
*/
/**
* KMP
* KMP is a algorithm for search a substring from a string. At case above, the situation become awful, because of some special situation: we always
* be told we are wrong when we almost success. There are so many traps.
*
* Q: Could I do something with those traps?
*/
/*
* Of course, we can. Image this, you encounter a trap when you check substring from position S[ct], and you realize this when you match S[ct+m] with P[m].
* At this moment ,except for fail, we also know another thing.
*
* S[ct]S[ct+1]....S[ct+m-1] = P[0]P[1]....p[m-1]
*
* Have you find anything ?
* All of traps must be the substring of the pattern and must be started by index 0. That‘s means we can do some funny thing , since we know what we will face. Now,
* pick a trap up to analysis. In the case above
*
* P[0]P[1]P[2]....P[m].
*
* what we are interested is whether a really substring is remain under this.
*
* Q: How can we know that?
*/
/*
* if a really substring is remain under this substring, It must have
*
* P[0]P[1]P[2]....P[n] = P[m-n]...P[m-1]P[m] n<m
* (this is the key point for the KMP algorithm.)
*
* Of course, some other false one may meet with this formula. But who care about it, we just want to find the doubtful one. So , by the help of this conclusion,
* we could know whether a doubtful substring is here and where is it. Then if we know where is the doubtful substring, we could skip some undoubtful one.That‘s all.
*/
/*
* Here is a examples for KMP.
*/
#include <stdio.h> #include <iostream> typedef int INDEX; class KMP { public: KMP( ); ~KMP( ); bool set( char *s, char *patt ); bool work( void); bool showPT( void); bool show( void); private: /* * initialize the table of partial match string. */ bool createPT( void); bool computePTV( INDEX mmi); /* * text string */ char *str; /* * pattern string */ char *pattern; /* * the length of pattern */ int p_len; /* * a pointer for the table of */ int *ptab; INDEX ip; //index position of the string }; KMP::KMP( ) { this->str = NULL; this->pattern = NULL; this->p_len = 0; this->ptab = NULL; this->ip = -1; } KMP::~KMP() { if( this->ptab!=NULL) { delete []this->ptab; this->ptab = NULL; } } bool KMP::set(char * s,char * patt) { this->str = s; this->pattern = patt; this->ip = -1; this->p_len = strlen( this->pattern); this->ptab = new int[this->p_len]; return true; } bool KMP::work(void) { /* * As the analysis above, To skip some traps, we need a table to record * some information about those traps. */ this->createPT( ); // current match index INDEX cmi = 0; //index of text string, this is a relative length INDEX si = 0 ; //index of pattern //INDEX pi = 0; /* * the process of search is almost same as the simple algorithm. The difference * of operation is occur when we encounter a trap. Based on our theory, we * could skip some traps. */ while( this->str[cmi]!='\0' ) { /* * match pattern with the text, this is same as the normal algorithm. */ while( this->str[cmi + si]==this->pattern[si] ) { si++; //pi++; if( this->pattern[si]=='\0') { this->ip = cmi; return true; } } /* * when we encounter a trap, we need to revise the start position * in next search. In this expression, @(cmi + si) is the last position * in the text where we compare with our pattern. @( this->ptab[si]) * is the length of possible string in this template. */ cmi = cmi + si -this->ptab[si]; /* * update the relative length to the corresponding value. */ if( si!=0) { si = this->ptab[si]; //pi = this->ptab[pi]; } } return false; } /** * */ bool KMP::createPT( void) { //mismatch index INDEX mmi = 1; /* * deal with every template string that is possible to become a trap. */ while( mmi<this->p_len) { this->computePTV( mmi); mmi++; } /* * Actually, the following is not a trap, but for make our program * more concise , we do a trick. */ this->ptab[0] = -1; return true; } /** * This function work for compute the max match string. That's comformation * will be used to skip some traps. */ bool KMP::computePTV( INDEX mmi) { //max match index INDEX max_mi = mmi -1; //initial position for search substring INDEX i = max_mi -1; //the length of substring INDEX ss_len = 0; while( i>=0 ) { while( this->pattern[ max_mi - ss_len]==this->pattern[ i-ss_len] ) { if( i==ss_len) { this->ptab[ mmi] = ss_len + 1; return true; } ss_len ++; } ss_len = 0; i--; } this->ptab[ mmi] = 0; return false; } bool KMP::showPT(void) { int i = 0; for( i=0; i<this->p_len; i++) { printf( "%2c", this->pattern[i] ); } printf("\n"); for( i=0; i<this->p_len; i++) { printf("%2d", this->ptab[i]); } printf("\n"); return true; } bool KMP::show(void) { printf("text: %s\n", this->str); printf("pattern: %s\n", this->pattern); printf(" index = %d\n", this->ip); return true; } #define TEXT "aababcaababcabcdabbabcdabbaababcabcdabbaababaababcabcdabbcaabaacaabaaabaabaababcabcdabbabcabcdaababcabcdabbabbcdabb" #define PATTERN "abaabcac" int main( ) { KMP kmp; kmp.set( TEXT, PATTERN); kmp.work( ); kmp.showPT( ); kmp.show( ); return 0; }
字符串查找算法-KMP