用过 strings.NewReplacer 和 replacer.Replace():把参数按(旧串,新串)成对传入后,它能依优先级替换,并能处理中文字符串参数.
觉得功能强大,特别好用.对它的查找和优先级怎么处理有点兴趣,花时间研究了下源码,在这记录一下个人理解.
package main // author: xcl; notes recorded 2014-01-20

import (
	"fmt"
	"strings"
)

// main runs strings.NewReplacer on a small pattern set and then replays the
// same patterns through the instrumented copy of the generic replacer below,
// so the trie construction and priority handling can be traced on stdout.
func main() {
	patterns := []string{
		"y", "25",
		"中", "国",
		"中工", "家伙",
	}
	// Alternative input kept from the original experiment: many numeric keys.
	/*
		patterns := make([]string,270 * 2)
		for i :=0;i< 270 *2;i++{
			patterns[i] = fmt.Sprintf("%d",i)
		}
	*/
	replacer := strings.NewReplacer(patterns...)
	format := "中(国)--中工(家伙)"
	strfmt := replacer.Replace(format)
	NewReplacer(patterns...)
	fmt.Println("\nmain() replacer.Replace old=", format)
	fmt.Println("main() replacer.Replace new=", strfmt)
}

// NewReplacer builds the instrumented generic replacer from old/new pairs and
// prints the lookup results for a few fixed probe strings. It returns nothing;
// it exists only to trace the behavior.
//
// It panics if oldnew has an odd number of elements (see makeGenericReplacer).
func NewReplacer(oldnew ...string) {
	r := makeGenericReplacer(oldnew)

	val, keylen, found := r.lookup("中", true)
	fmt.Println("\nNewReplacer() 中 val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("中工", true)
	fmt.Println("NewReplacer() 中工 val:", val, " keylen:", keylen, " found:", found)

	val, keylen, found = r.lookup("y", false)
	fmt.Println("NewReplacer() y val:", val, " keylen:", keylen, " found:", found)
	// Probes for the numeric-key experiment in main.
	/*
		val,keylen,found := r.lookup("2",true)
		fmt.Println("\nNewReplacer() 2 val:",val," keylen:",keylen," found:",found)

		val,keylen,found = r.lookup("3",true)
		fmt.Println("\nNewReplacer() 3 val:",val," keylen:",keylen," found:",found)
	*/
}

// genericReplacer holds a trie of keys together with the byte-to-index
// mapping used by the trie nodes' lookup tables.
type genericReplacer struct {
	root trieNode // root of the trie
	// tableSize is the size of a trie node's lookup table. It is the number
	// of unique key bytes.
	tableSize int
	// mapping maps from key bytes to a dense index for trieNode.table.
	mapping [256]byte
}

// makeGenericReplacer builds a genericReplacer from a flat list of old/new
// pairs (oldnew[0] is a key, oldnew[1] its replacement, and so on).
//
// Earlier pairs receive a larger priority (len(oldnew)-i), so when several
// keys match, the pair that appeared first in oldnew wins.
//
// It panics if len(oldnew) is odd: without this check the final key would be
// read without a value and fail with an index-out-of-range error only after
// part of the trie had already been built.
func makeGenericReplacer(oldnew []string) *genericReplacer {
	if len(oldnew)%2 == 1 {
		panic("makeGenericReplacer: odd argument count")
	}

	r := new(genericReplacer)
	// Find each byte used, then assign them each an index.
	for i := 0; i < len(oldnew); i += 2 { // step 2: even positions hold the keys
		key := oldnew[i]
		fmt.Println("\nmakeGenericReplacer() for key=", key)
		// Mark every byte of the key as used. For a Chinese character each
		// of its UTF-8 bytes is marked, e.g. r.mapping[228] = 1.
		for j := 0; j < len(key); j++ {
			r.mapping[key[j]] = 1
			fmt.Println("makeGenericReplacer() key[", j, "]=", key[j])
		}
	}

	// Every used byte contributed 1, so the sum is the number of unique bytes.
	for _, b := range r.mapping {
		r.tableSize += int(b)
	}
	fmt.Println("makeGenericReplacer() r.tableSize=", r.tableSize)

	var index byte
	for i, b := range r.mapping {
		if b == 0 {
			// Unused bytes map to tableSize, which lookup treats as "absent".
			r.mapping[i] = byte(r.tableSize)
		} else {
			// Used bytes get consecutive indexes in ascending byte order.
			r.mapping[i] = index
			fmt.Println("makeGenericReplacer() r.mapping[", i, "] =", r.mapping[i])
			index++
		}
	}

	// Ensure root node uses a lookup table (for performance).
	r.root.table = make([]*trieNode, r.tableSize)

	// Insert each key/value into the trie. priority = len(oldnew)-i, so the
	// earlier a pair appears in oldnew, the higher its priority.
	for i := 0; i < len(oldnew); i += 2 {
		r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
	}
	return r
}

// trieNode is a node of the key trie. A node either forwards through a
// single compressed prefix (prefix/next) or branches through table.
type trieNode struct {
	// value is the replacement stored when a key ends at this node.
	value string
	// priority is non-zero when a key ends at this node; larger wins.
	priority int
	// prefix is a run of key bytes that must match to continue to next.
	prefix string
	next   *trieNode
	// table is indexed by genericReplacer.mapping[b] for the next key byte b.
	table []*trieNode
}

// add inserts key with the given val and priority below t, splitting an
// existing prefix or creating a lookup table where the new key diverges.
// r supplies tableSize and the byte-to-index mapping.
func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
	fmt.Println("trieNode->add() val=", val, " key=", key)
	if key == "" {
		// The whole key has been consumed: this node terminates it. A node
		// that already has a priority keeps its existing value.
		if t.priority == 0 {
			t.value = val
			t.priority = priority
			fmt.Println("trieNode->add() t.priority==", priority)
		}
		return
	}

	if t.prefix != "" { // node already carries a prefix
		// Need to split the prefix among multiple nodes.
		var n int // length of the longest common prefix
		for ; n < len(t.prefix) && n < len(key); n++ {
			if t.prefix[n] != key[n] {
				break
			}
		}
		if n == len(t.prefix) {
			// The key starts with the entire prefix: continue below it.
			t.next.add(key[n:], val, priority, r)
		} else if n == 0 {
			// First byte differs, start a new lookup table here. Looking up
			// what is currently t.prefix[0] will lead to prefixNode, and
			// looking up key[0] will lead to keyNode.
			var prefixNode *trieNode
			if len(t.prefix) == 1 {
				// A one-byte prefix is fully consumed by the table lookup.
				prefixNode = t.next
			} else {
				// Otherwise keep the remaining prefix bytes in a new node.
				prefixNode = &trieNode{
					prefix: t.prefix[1:],
					next:   t.next,
				}
			}
			keyNode := new(trieNode)
			// lookup takes its table branch once t.table is non-nil.
			t.table = make([]*trieNode, r.tableSize)
			t.table[r.mapping[t.prefix[0]]] = prefixNode
			t.table[r.mapping[key[0]]] = keyNode
			t.prefix = ""
			t.next = nil
			keyNode.add(key[1:], val, priority, r)
		} else {
			// Insert new node after the common section of the prefix.
			next := &trieNode{
				prefix: t.prefix[n:],
				next:   t.next,
			}
			t.prefix = t.prefix[:n]
			t.next = next
			next.add(key[n:], val, priority, r)
		}
	} else if t.table != nil {
		// Insert into existing table.
		m := r.mapping[key[0]]
		if t.table[m] == nil {
			t.table[m] = new(trieNode)
		}
		t.table[m].add(key[1:], val, priority, r)
	} else {
		// Empty node: store the whole remaining key as its prefix and put
		// the value on a fresh child.
		t.prefix = key
		t.next = new(trieNode)
		t.next.add("", val, priority, r)
	}
}

// lookup walks the trie along s and returns the value, matched key length in
// bytes, and whether any key matched, choosing the match with the highest
// priority along the path. If ignoreRoot is true, a value stored on the root
// node (the empty key) is not considered.
func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
	// Iterate down the trie to the end, and grab the value and keylen with
	// the highest priority.
	bestPriority := 0
	node := &r.root
	n := 0
	for node != nil {
		if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
			bestPriority = node.priority
			val = node.value
			keylen = n
			found = true
		}

		if s == "" {
			break
		}
		if node.table != nil {
			index := r.mapping[s[0]]
			if int(index) == r.tableSize {
				// This byte occurs in no key, so nothing further can match.
				break
			}
			node = node.table[index]
			s = s[1:]
			n++
		} else if node.prefix != "" && HasPrefix(s, node.prefix) {
			// Non-branching nodes keep the remaining key bytes in prefix;
			// consume them all at once and move on.
			n += len(node.prefix)
			s = s[len(node.prefix):]
			node = node.next
		} else {
			break
		}
	}
	return val, keylen, found
}

// HasPrefix tests whether the string s begins with prefix.
func HasPrefix(s, prefix string) bool {
	return strings.HasPrefix(s, prefix)
}
记录:
ascii范围内的只占一个字节,如y(121)
utf8中每个汉字占三个字节.如中(228,184,173)
构建树:
如果是新的第一个单词或词组
先进 } else if t.table != nil {
然后再进 else,这中间会把 t.prefix = key,把key值存放在prefix,将""传给下一个node
最后执行 if key == "" && t.priority == 0 { ,将 t.value = val
即key的字符编码(第一个字节)对应的root.table位置开始,依次指向另外的字符编码node,中间node的prefix存下key值.
最末一个node,存下对应的val及priority.
如果是后传入的单词或词组,先从key字符编码首个字节对应的root.table位置开始,依次查找,
} else if t.table != nil {
如果已有前缀的,进行比较 if t.prefix != "" {
1, 如目前prefix与key完全一致,则继续构建树子节点
2. 如prefix与key完全不同,则另起炉灶,构建一条新的tree
prefixNode 承上,keyNode 启下
至于为什么t.table = make([]*trieNode, r.tableSize),是为了预留映射空间.
所以它是这么弄的,而不是t.table[0],t.table[1].
t.table[r.mapping[t.prefix[0]]] = prefixNode
t.table[r.mapping[key[0]]] = keyNode
3.有部份相同, 直接跳到t.prefix[n:],然后从key[n:]开始继续构建树子节点
priority:
在这的定义是数字越大,优先级别越高
if key == "" { //key的所有字节都已消耗完,即到达该key对应的末端节点
if t.priority == 0 { //如果有定义过priority的就略过,新加的,把现有的级别加上
//对应{中,中工}这种,虽然后面有"中工",但"中"的priority要高,所以"中工"对应的值虽找到但不会返回.
if node.priority > bestPriority { bestPriority = node.priority}
例如:中工(priority=4),中(priority=2)
patterns:
"中工","家伙",
"中","国",
则:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ??
lookup() bestPriority: 0 node.priority: 2 value: 国 prefix: 工
NewReplacer() 中 val: 国 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ??
lookup() bestPriority: 0 node.priority: 2 value: 国 prefix: 工
lookup() bestPriority: 2 node.priority: 4 value: 家伙 prefix:
NewReplacer() 中工 val: 家伙 keylen: 6 found: true
main() replacer.Replace old= 中(国)--中工(家伙)
main() replacer.Replace new= 国(国)--家伙(家伙)
如果调整下顺序,把中->国提前,则会发现,下面的结果:
patterns:
"中","国",
"中工","家伙",
则:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ??
lookup() bestPriority: 0 node.priority: 4 value: 国 prefix: 工
NewReplacer() 中 val: 国 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ??
lookup() bestPriority: 0 node.priority: 4 value: 国 prefix: 工
lookup() bestPriority: 4 node.priority: 2 value: 家伙 prefix:
NewReplacer() 中工 val: 国 keylen: 3 found: true
main() replacer.Replace old= 中(国)--中工(家伙)
main() replacer.Replace new= 国(国)--国工(家伙)
还有,刚发现 lookup(s string, ignoreRoot bool) (val string, keylen int,found bool) {}中
定义在返回值中的变量,原来可以直接在函数中使用,
至于返回,直接return就行了,都不用写全返回值的,好省事.
MAIL: [email protected]
BLOG:http://blog.csdn.net/xcl168