#!/usr/bin/python
#encoding=gbk
import sys
# Widest matching window, in characters.  initDct() raises it when the
# dictionary contains a longer word; it never drops below this default.
dictMaxLength = 5
# Dictionary word (byte string in `encoding`) -> its full, stripped
# dictionary line.  Filled by initDct().
dctDict = {}
# Codec used to decode input text and dictionary words for character-level
# slicing, and to encode the pieces back into byte strings.
encoding = 'gbk'
'''
Initialize the dictionary and the maximum word length.
'''
def initDct(dct):
    """Load the segmentation dictionary and track its longest word.

    Every non-blank line contributes one entry to the module-level
    ``dctDict``: the key is the text before the first tab (stripped) and the
    value is the whole stripped line.  ``dictMaxLength`` is raised to the
    character length of the longest word seen; it is never lowered.

    Args:
        dct: path of the dictionary file, one word per line, optionally
            followed by tab-separated extra fields.

    Raises:
        IOError: if the file cannot be opened.
        UnicodeDecodeError: if a word is not valid text in ``encoding``.
    """
    global dictMaxLength
    # Binary mode keeps every line an encoded byte string whatever the
    # platform or interpreter version; `with` closes the file even when a
    # malformed word makes decode() raise.
    with open(dct, "rb") as dctobj:
        for line in dctobj:
            line = line.strip()
            if not line:
                # A blank line would otherwise register "" as a word.
                continue
            word = line.split(b"\t")[0].strip()
            dctDict[word] = line
            # Word length is measured in characters, not encoded bytes.
            wordLen = len(word.decode(encoding))
            if dictMaxLength < wordLen:
                dictMaxLength = wordLen
'''
Forward maximum matching algorithm.
'''
def maximunMathching(sent):
    """Segment ``sent`` left to right with forward maximum matching.

    At each position the longest candidate (at most ``dictMaxLength``
    characters) is tried first and shortened until it is a key of
    ``dctDict``.  When no multi-character candidate matches, the single
    character at that position becomes a token of its own without a
    dictionary lookup; a lone space character is dropped instead.

    Args:
        sent: sentence as a byte string encoded in ``encoding``.

    Returns:
        The tokens joined by single spaces, as a byte string in
        ``encoding``, stripped of surrounding whitespace.

    Raises:
        UnicodeDecodeError: if ``sent`` is not valid text in ``encoding``.
    """
    # Work on decoded text so that slicing is per character, not per byte.
    text = sent.strip().decode(encoding)
    textLen = len(text)
    tokens = []
    index = 0
    while index < textLen:
        # Cap the window at the remaining text so the same tail is not
        # re-tested for every oversized window length.
        longest = min(dictMaxLength, textLen - index)
        for size in range(longest, 0, -1):
            piece = text[index:index + size].encode(encoding)
            if size == 1:
                # Fallback: nothing longer matched here.
                if piece != b" ":
                    tokens.append(piece)
                index += 1
                break
            if piece in dctDict:
                tokens.append(piece)
                index += size
                break
    return b" ".join(tokens).strip()
'''
Reverse (backward) maximum matching algorithm.
'''
def reverseMaximunMathching(sent):
    """Segment ``sent`` right to left with reverse maximum matching.

    Starting from the end of the sentence, the longest candidate (at most
    ``dictMaxLength`` characters) ending at the current position is tried
    first and shortened until it is a key of ``dctDict``.  When no
    multi-character candidate matches, the single character before the
    current position becomes a token of its own without a dictionary
    lookup; a lone space character is dropped instead.

    Args:
        sent: sentence as a byte string encoded in ``encoding``.

    Returns:
        The tokens in reading order joined by single spaces, as a byte
        string in ``encoding``.

    Raises:
        UnicodeDecodeError: if ``sent`` is not valid text in ``encoding``.
    """
    # Work on decoded text so that slicing is per character, not per byte.
    text = sent.strip().decode(encoding)
    tokens = []
    end = len(text)
    # Stop as soon as the start of the sentence is reached: scanning once
    # more at position 0 would add an empty token and hence a leading space.
    while end > 0:
        # Cap the window at the text that is still unsegmented.
        longest = min(dictMaxLength, end)
        for size in range(longest, 0, -1):
            piece = text[end - size:end].encode(encoding)
            if size == 1:
                # Fallback: nothing longer matched here.
                if piece != b" ":
                    tokens.append(piece)
                end -= 1
                break
            if piece in dctDict:
                tokens.append(piece)
                end -= size
                break
    # Tokens were collected back to front.
    tokens.reverse()
    return b" ".join(tokens)
'''
The fewer non-dictionary words, single-character dictionary words and
total words, the better.
'''
def segmenter(sent):
    """Segment ``sent`` using both matching directions.

    Runs maximunMathching() and reverseMaximunMathching() on the same
    input and strips each result.  When the two agree that segmentation is
    returned; otherwise bmmResult() picks between them.

    Args:
        sent: the sentence, passed unchanged to both matchers.

    Returns:
        The chosen segmentation.
    """
    forward = maximunMathching(sent).strip()
    backward = reverseMaximunMathching(sent).strip()
    if forward != backward:
        return bmmResult(forward, backward)
    return forward
'''
The fewer non-dictionary words, single-character dictionary words and
total words, the better.
'''
def _segmentationStats(words):
    """Return (non-dictionary words, one-character words, total words)."""
    oovNum = sum(1 for word in words if word not in dctDict)
    sigNum = sum(1 for word in words if len(word.decode(encoding)) == 1)
    return oovNum, sigNum, len(words)


def bmmResult(mm, rmm):
    """Choose between a forward and a reverse segmentation.

    Both candidates are compared on three counts, each of which is better
    when smaller: words that are not keys of ``dctDict``, words that are
    exactly one character long, and the total number of words.  The
    candidate that is better on more of the counts is returned.

    Args:
        mm: forward segmentation, words separated by single spaces
            (byte string in ``encoding``).
        rmm: reverse segmentation in the same format.

    Returns:
        ``mm`` when it wins more comparisons than ``rmm``; otherwise
        ``rmm``, which also wins ties.
    """
    mmStats = _segmentationStats(mm.split(b" "))
    rmmStats = _segmentationStats(rmm.split(b" "))
    mmWins = 0
    rmmWins = 0
    for mmCount, rmmCount in zip(mmStats, rmmStats):
        if mmCount < rmmCount:
            mmWins += 1
        elif mmCount > rmmCount:
            rmmWins += 1
    # The forward result must be compared against the reverse tally; the
    # previous self-comparison could never select it.
    if mmWins > rmmWins:
        return mm
    return rmm
def _lowerAsciiLetters(raw):
    """Lower-case the ASCII letters of a byte string encoded in ``encoding``.

    The text is decoded first: lower-casing the raw bytes would also alter
    bytes that merely form part of a multi-byte character (gbk trail bytes
    overlap the ASCII letter range) and so turn it into a different one.
    """
    text = raw.decode(encoding)
    folded = [ch.lower() if ch < u"\x80" else ch for ch in text]
    return u"".join(folded).encode(encoding)


def handleFile(input, output):
    """Segment every line of ``input`` and write the results to ``output``.

    Each line is stripped, has its ASCII letters lower-cased and is passed
    to segmenter(); the stripped segmentation is written followed by a
    newline.  The running line number is printed after every 100000 lines.

    NOTE(review): lines are treated as byte strings in ``encoding``, which
    is what these text-mode files yield on Python 2 - confirm before
    running on another interpreter.

    Args:
        input: path of the file to segment, one sentence per line.
        output: path of the file to create or overwrite.

    Raises:
        IOError: if either file cannot be opened.
        UnicodeDecodeError: if a line is not valid text in ``encoding``.
    """
    # Nested `with` blocks close both files even if segmentation raises,
    # and close the input if the output cannot be opened.
    with open(input) as inputobj:
        with open(output, "w") as outputobj:
            for index, line in enumerate(inputobj, 1):
                if index % 100000 == 0:
                    print(str(index) + "\r")
                line = _lowerAsciiLetters(line.strip())
                seg = segmenter(line)
                outputobj.write(seg.strip() + "\n")
# Command-line entry point: load the dictionary, then segment a whole file.
if __name__ == '__main__':
    # Exactly three arguments are required after the script name.
    if len(sys.argv) != 4:
        print("Usage %s dict[in] inFile[in] outFile[out]." % sys.argv[0])
        sys.exit(-1)
    dictPath, inputPath, outputPath = sys.argv[1:]
    # The dictionary must be loaded before any sentence is segmented.
    initDct(dictPath)
    # For a quick manual check, call segmenter() on a single encoded
    # sentence and print the result instead of processing a file.
    handleFile(inputPath, outputPath)