# Forward maximum matching:
# 1. Scan the segmentation dictionary and determine m, the number of
#    characters in its longest entry.
# 2. Going left to right, take m characters of the text to be segmented as
#    the candidate and look it up in the dictionary. If it matches, emit it as
#    a segmented word; otherwise drop the candidate's last character and look
#    it up again. Repeat until every word has been cut out.
dictA = ['南京市', '南京市长', '长江大桥', '大桥']
maxDictA = max(len(word) for word in dictA)
sentence = "南京市长江大桥"


def cutA(sentence):
    """Segment *sentence* by forward maximum matching against ``dictA``.

    Starting at the left edge, the longest candidate (at most ``maxDictA``
    characters) that is present in ``dictA`` is emitted as a word. When no
    candidate matches, the single character at the current position is
    emitted on its own.

    The segmentation is printed and also returned as a list of strings.
    """
    result = []
    sentenceLen = len(sentence)
    n = 0
    while n < sentenceLen:
        # Cap the window at the remaining text so a slice truncated by the end
        # of the sentence is never counted as a full-length candidate.
        for i in range(min(maxDictA, sentenceLen - n), 0, -1):
            piece = sentence[n:n + i]
            if piece in dictA:
                break
        else:
            # No dictionary entry starts here: emit one character.
            piece = sentence[n]
        result.append(piece)
        n += len(piece)
    print(result)
    return result


cutA(sentence)  # ['南京市长', '江', '大桥']


# Backward maximum matching: the same procedure, but candidates are taken
# from the right end of the sentence and shortened from the left.
dictB = ['南京市', '南京市长', '长江大桥', '大桥']
maxDictB = max(len(word) for word in dictB)
sentence = "南京市长江大桥"


def cutB(sentence):
    """Segment *sentence* by backward maximum matching against ``dictB``.

    Starting at the right edge, the longest candidate (at most ``maxDictB``
    characters) that is present in ``dictB`` is emitted as a word. When no
    candidate matches, the single character just before the current position
    is emitted on its own.

    The segmentation is printed in left-to-right order and also returned as a
    list of strings.
    """
    result = []
    sentenceLen = len(sentence)
    while sentenceLen > 0:
        # Cap the window at the unprocessed prefix. Without the cap the slice
        # start goes negative, wraps around to the end of the string, and a
        # shorter word can match while the cursor moves back by the full
        # window size, silently dropping characters.
        for i in range(min(maxDictB, sentenceLen), 0, -1):
            piece = sentence[sentenceLen - i:sentenceLen]
            if piece in dictB:
                break
        else:
            # No dictionary entry ends here: emit one character.
            piece = sentence[sentenceLen - 1]
        result.append(piece)
        sentenceLen -= len(piece)
    # Words were collected right to left; restore reading order.
    result.reverse()
    print(result)
    return result


cutB(sentence)  # ['南京市', '长江大桥']

# Bidirectional maximum matching.
# Idea: compare the segmentations produced by forward and backward maximum
# matching and, following the maximum-matching principle, choose the one with
# the fewest words as the final segmentation result.
# Original article: https://www.cnblogs.com/hapyygril/p/9916351.html
# Time: 2024-11-09 06:13:12