1 #!/usr/bin/env python 2 #encoding=gbk 3 import os 4 import sys 5 6 G_ENCODING="gbk" 7 """ 8 =============================== 9 中文分词 10 1. 机械分词 11 2. 统计分词 12 3. 理解分词 13 =============================== 14 基于树形结构分词策略(结合机械分词,统计分词) 15 例:笔记本电脑 16 dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9} 17 ------------------------------- 18 | <s> | 19 ------------------------------- 20 / / \ 21 [笔] [笔记] [笔记本] [笔记本电脑] 22 / / / 23 [记] [本] [电] [电脑] 24 / / 25 [本] [电] [电脑] 26 / \ / 27 [电] [电脑] [脑] 28 / 29 [脑] 30 ------------------------------- 31 path: 笔 记 本 电 脑 -- score: [0.32768] 32 path: 笔 记 本 电脑 -- score: [0.4608] 33 path: 笔记 本 电 脑 -- score: [0.4608] 34 path: 笔记 本 电脑 -- score: [0.648] 35 path: 笔记本 电 脑 -- score: [0.576] 36 path: 笔记本 电脑 -- score: [0.81] 37 path: 笔记本电脑 -- score: [0.9] 38 39 best path: 笔记本电脑 -- score: [0.9] 40 41 ------------------------------- 42 1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重) 43 2、最少切分、OOV、最少单字等策略 44 ==获取最佳分词路径 45 ------------------------------- 46 Q1、如果句子过长,树非常大,遍历费时(需优化) 47 Q2、字典加载(需优化) 48 以下给出该思想的简单实现[python]: 49 """ 50 51 class Stack(): 52 def __init__(self, volume = 0): 53 self.list = [] if volume == 0 else [0 for i in range(0,volume)] 54 self.top = 0 55 56 def push(self, element): 57 if self.list != None: 58 self.top += 1 59 self.list[self.top] = element 60 61 def pop(self): 62 if self.list != None and self.top >= 0: 63 ele = self.list[self.top] 64 self.list[self.top] = None 65 self.top -= 1 66 return ele 67 return None 68 def empty(self): 69 return self.top == 0 70 71 class Node(): 72 def __init__(self, data, next = None, prev = None, depth = 0, wlen = 0, weight = 0.0): 73 self.data = data 74 self.next = next if next != None else [] 75 self.prev = prev 76 self.depth = depth 77 self.wlen = wlen 78 self.weight = weight 79 80 def isLeaf(self): 81 return self.next == None or self.next == [] 82 83 class Tree(): 84 def __init__(self, root = None): 85 self.root = root 86 """append a child node to child""" 87 def append(self, node, cnode): 88 if node != None and cnode != None: 89 node.next.append(cnode) 90 cnode.prev = node 91 cnode.depth = node.depth + 1 92 return 0 93 return -1 94 95 """depth first search(binary preorder)""" 96 def depth_first_search(self, node): 97 list = [] 98 if node != None: 99 stack = Stack(30) 100 stack.push(node) 101 while not stack.empty(): 102 tmp = stack.pop() 103 list.append(tmp) 104 for i in range(len(tmp.next) - 1, -1, -1): 105 stack.push(tmp.next[i]) 106 return list 107 108 class Tokenizer(): 109 """init the tree""" 110 def load(self, tree, pnode, cache, dict): 111 clen = len(cache) 112 for node in tree.depth_first_search(pnode): 113 if node.isLeaf(): 114 i = node.wlen 115 j = i 116 while j < clen: 117 j += 1 118 tmp = cache[i:j].encode(G_ENCODING) 119 if dict.has_key(tmp) or len(tmp) == 1: 120 tnode = Node(tmp, wlen = j, weight = dict.get(tmp)) 121 tree.append(node, tnode) 122 self.load(tree, tnode, cache, dict) 123 return 0 124 """backtrance""" 125 def backtrance(self, node, list): 126 if node.prev != None and node.prev.data != "<s>": 127 list.append(node.prev) 128 self.backtrance(node.prev, list) 129 return 0 130 131 def bestpath(self, tree): 132 highestScore = 0 133 bestpath = "" 134 for node in tree.depth_first_search(tree.root): 135 """find the leaf node and backtrance to find the bese path""" 136 if node.isLeaf(): 137 list = [node] 138 self.backtrance(node, list) 139 list.reverse() 140 """ 141 1、路径加权(通过搜索引擎获取词语的词频,获得词语的权重) 142 2、最少切分、OOV、最少单字等策略 143 这里只是简单给出路径权重的乘积得分 144 145 """ 146 sc = 1.0 147 tp = "" 148 for xn in list: 149 sc *= xn.weight if xn.weight > 0 else 1 150 tp += xn.data + " " 151 if sc > highestScore: 152 highestScore = sc 153 bestpath = tp.strip() 154 print "path: %s -- score: [%s]"%(tp.strip(), sc) 155 print "\nbest path: %s -- score: [%s]"%(bestpath, highestScore) 156 return bestpath 157 def example(): 158 sent = "笔记本电脑" 159 dict = {"笔":0.8,"记":0.8,"本":0.8,"电":0.8,"脑":0.8,"笔记":0.9,"笔记本":0.9,"电脑":0.9,"笔记本电脑":0.9} 160 cache = unicode(sent, G_ENCODING) 161 tokenizer = Tokenizer() 162 tree = Tree(Node("<s>")) 163 """init tree""" 164 tokenizer.load(tree, tree.root, cache, dict) 165 """backtrance and find the best path""" 166 tokenizer.bestpath(tree) 167 example()
时间: 2024-11-09 01:39:54