在文章 http://blog.csdn.net/watkinsong/article/details/37697451 里面提到的FM中文分词算法中, 最大的问题就是将用户的query切分得太碎, 切分太碎也会对检索结果造成一定的影响。
这里, 可以采用FMM算法进行切分: 首先切分出最大的正向匹配, 然后进行交集歧义检测; 如果检测到存在交集歧义, 那么对可能存在歧义的一段文字进行FM切分, 获取所有可能的切分结果; 然后对剩下的query子句重复进行FMM切分, 直到query为空 (长度为 0)。
例如, 用户查询 query = 123456789, 假设首先FMM切分到了“123”, 交集歧义检测长度为6, 这时候存在歧义, 那么对“123456”进行FM切分, 获取“123456”的所有的切分可能, 然后再对剩下的子句“789”循环进行FMM切分。
交集歧义检测算法描述:
假设 query = "互联网金宝", 首先进行FMM, 切分出“互联网”, 然后我们要检测是否存在交集歧义, 将“互联网”的长度作为输入, 这里我们使用变量word_len表示FMM切分结果的长度, 这里word_len = 3; 同时将query的子句“联网金宝”作为输入, 进行以下迭代:
如此循环下去, 循环继续的条件是 i < word_len && i < str.length (i 从 1 开始), 条件不成立时结束迭代:
1.
对输入的 str = "联网金宝" 进行FMM切分, 获取切分后长度, 假设为“联网”, len = 2
如果此时 word_len < i + len, 则 word_len = i + len
此时, i = 1, len = 2, word_len = 3
str = str.slice(1)
2.
对 str = "网金宝" 进行FMM切分, 获取切分后长度, 假设为“网金宝”, len = 3
如果此时 word_len < i + len, 则 word_len = i + len; 这里, i + len = 5, i + len > word_len, 设置 word_len = 5
此时, i = 2, len = 3, word_len = 5
str = str.slice(1)
3.
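此时 i = 3, str = “金宝”, str.length 为 2, 循环条件 i < str.length 不成立, 退出迭代, 最终得到的交集歧义检测长度为 word_len = 5。(注意: 下面的代码实现中 str 始终是原始 query, 每次通过 str.slice(i) 取子句, 因此会继续检查 i = 3 和 i = 4, 在本例中结果相同。)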
代码实现:
/*
 * Chinese word segmentation: FMM (forward maximum matching) combined with
 * cross-ambiguity detection.
 *
 * The dictionary is a trie of plain objects keyed by single characters; a node
 * whose `docs` object has at least one key marks the end of an indexed term.
 * The demo below takes that trie from `idx.tokenStore.root` of a loaded lunr
 * index.
 */

/*
 * Demo driver. It only runs when this file is executed directly under
 * CommonJS (e.g. `node thisfile.js`), so the functions below can be loaded
 * elsewhere without pulling in ./lunr.js and ./idx.json.
 */
if (typeof require !== "undefined" && typeof module !== "undefined" && require.main === module) {
  var lunr = require("./lunr.js");
  var idxdata = require("./idx.json");
  var idx = lunr.Index.load(idxdata);
  var ii = idx.tokenStore;

  var sampleQueries = [
    "中国人民银行指出我国最近经济不景气",
    "习近平今日出席了中央气象台的联欢晚会",
    "中国银行今日出台了最新的贷款政策",
    "习近平的中央气象台",
    "全部门",
    "互联网金宝",
    "上下级别",
    "互联网中国人民银行",
    "引领土完整"
  ];

  // Declared with `var`: the original assigned to an undeclared `query`,
  // which creates an implicit global (and throws in strict mode).
  var query = sampleQueries[7]; // "互联网中国人民银行"
  var result = tokenizer(ii.root, query);
  console.log(result);
}

/*
 * Returns true when `node` has an own child entry for character `ch`.
 * Uses hasOwnProperty instead of `in` so that inherited properties are never
 * mistaken for trie edges.
 */
function hasChildNode(node, ch) {
  return node != null && Object.prototype.hasOwnProperty.call(node, ch);
}

/*
 * tokenizer
 *
 * Runs FMM first, then checks the match for cross ambiguity. When the
 * ambiguous span is longer than the FMM match, the whole span is re-segmented
 * with FM (all possible terms); otherwise the FMM match is emitted as is.
 *
 * @param {Object} root - root node of the dictionary trie
 * @param {string} str  - query to segment
 * @returns {string[]} tokens in the order they were produced; [] when either
 *                     argument is null/undefined or the query is empty
 */
function tokenizer(root, str) {
  if (root == null) return [];
  if (str == null || str.length === 0) return [];

  var out = [];
  var rest = str;
  while (rest.length > 0) {
    var longest = matchLongest(root, rest);
    var ambiguityLength = getAmbiguiousLength(root, rest, longest.length);
    console.log("FMM: " + longest + ", ambituity length: " + ambiguityLength);

    if (longest.length >= ambiguityLength) {
      out.push(longest);
    } else {
      console.log("ambiguity detected!!!");
      var ambiguityStr = rest.slice(0, ambiguityLength);
      console.log("do FM again for ambiguity piece: " + ambiguityStr);
      out = out.concat(ambiguityTokenizer(root, ambiguityStr));
    }

    // ambiguityLength is never smaller than the FMM match, and the FMM match
    // of a non-empty string is at least one character, so this always advances.
    rest = rest.slice(ambiguityLength);
  }
  return out;
}

/*
 * FMM: follows the trie from `root` along the characters of `str` and returns
 * the longest prefix of `str` that is a path in the trie. The prefix is NOT
 * checked for a non-empty `docs`, so it may be only the beginning of a longer
 * dictionary term.
 *
 * If even the first character is missing from the trie, that single
 * out-of-vocabulary character is returned so callers can always advance.
 *
 * @param {Object} root - root node of the dictionary trie
 * @param {string} str  - text to match from its first character
 * @returns {string} the matched prefix; "" when either argument is
 *                   null/undefined or `str` is empty
 */
function matchLongest(root, str) {
  if (root == null) return "";
  if (str == null || str.length === 0) return "";

  var maxMatch = "";
  var node = root;
  for (var i = 0; i < str.length; i++) {
    var ch = str[i];
    if (!hasChildNode(node, ch)) {
      // out-of-vocabulary character at the very start: return it on its own
      if (maxMatch.length === 0) maxMatch = ch;
      break;
    }
    maxMatch += ch;
    node = node[ch];
  }
  return maxMatch;
}

/*
 * Tokenizer for an ambiguous span: runs FM from every character offset of
 * `str` and concatenates all the terms found.
 *
 * @param {Object} root - root node of the dictionary trie
 * @param {string} str  - the ambiguous piece of the query
 * @returns {string[]} every dictionary term found at every offset; [] when
 *                     either argument is null/undefined or `str` is empty
 */
function ambiguityTokenizer(root, str) {
  if (root == null) return [];
  if (str == null || str.length === 0) return [];

  var out = [];
  for (var start = 0; start < str.length; start++) {
    out = out.concat(forwardMatching(root, str.slice(start)));
  }
  return out;
}

/*
 * FM: returns all the terms found along the longest search path, i.e. every
 * prefix of `str` whose trie node has a non-empty `docs` object.
 *
 * Out-of-vocabulary characters are deliberately not emitted: they are bad for
 * search and would affect the search results.
 *
 * @param {Object} root - root node of the dictionary trie
 * @param {string} str  - text to match from its first character
 * @returns {string[]} matching prefixes, shortest first; [] when either
 *                     argument is null/undefined or `str` is empty
 */
function forwardMatching(root, str) {
  if (root == null) return [];
  if (str == null || str.length === 0) return [];

  var out = [];
  var matches = "";
  var node = root;
  for (var i = 0; i < str.length; i++) {
    var ch = str[i];
    if (!hasChildNode(node, ch)) break;

    matches += ch;
    node = node[ch];
    // local variable: the original leaked `docs` as an implicit global
    var docs = node.docs || {};
    if (Object.keys(docs).length) {
      out.push(matches);
    }
  }
  return out;
}

/*
 * Cross-ambiguity detection. Starting from an FMM match of `word_length`
 * characters at the beginning of `str`, runs FMM again from every offset
 * inside the current span; whenever such a match reaches past the end of the
 * span, the span is extended to cover it. The loop stops when the offset
 * leaves the span or the string.
 *
 * @param {Object} root        - root node of the dictionary trie
 * @param {string} str         - remaining query, starting with the FMM match
 * @param {number} word_length - length of the FMM match at the start of `str`
 * @returns {number} length of the span to handle as one unit; equals
 *                   `word_length` when no cross ambiguity was found
 */
function getAmbiguiousLength(root, str, word_length) {
  if (str == null) return word_length;

  var span = word_length;
  for (var i = 1; i < span && i < str.length; i++) {
    var reach = i + matchLongest(root, str.slice(i)).length;
    if (span < reach) span = reach;
  }
  return span;
}
测试:
query: "互联网金宝"
结果:
FMM: 互联网, ambituity length: 5 ambiguity detected!!! do FM again for ambiguity piece: 互联网金宝 [ '互联网', '网', '网金宝', '金', '宝' ]
query: "互联网中国人民银行"
结果:
FMM: 互联网, ambituity length: 3 FMM: 中国人民银行, ambituity length: 6 [ '互联网', '中国人民银行' ]
NLP: 中文分词算法---交集歧义检测 (cross ambiguity detect)