"""ID3 decision-tree helpers: entropy, splitting, tree building, classification.

A data set is a list of rows; every row is a list whose last element is the
class label and whose other elements are discrete feature values.
"""
from collections import Counter
from math import log
import operator
import pickle


def createDataSet():
    """Return a small sample data set and the names of its feature columns."""
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in *dataSet*.

    The class label is the last element of each row. An empty data set
    yields 0.0.
    """
    numEntries = len(dataSet)
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Return the rows whose column *axis* equals *value*, minus that column.

    The input rows are not modified; each returned row is a new list.
    """
    return [
        list(featVec[:axis]) + list(featVec[axis + 1:])
        for featVec in dataSet
        if featVec[axis] == value
    ]


def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain.

    Information gain is the entropy of *dataSet* minus the weighted entropy
    of the subsets obtained by splitting on the feature. Returns -1 when no
    feature yields a strictly positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # the last column holds the class label
    baseEntropy = calcShannonEnt(dataSet)
    total = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            newEntropy += len(subDataSet) / total * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


def majorityCnt(classList):
    """Return the most frequent element of *classList*.

    Ties go to the element that was counted first (the sort is stable).
    Raises IndexError when *classList* is empty.
    """
    classCount = Counter(classList)
    # .items() instead of the Python-2-only .iteritems().
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]


def createTree(dataSet, labels):
    """Recursively build a decision tree from *dataSet*.

    *labels* holds the feature names, one per feature column; it is NOT
    modified. The result is either a class label (leaf) or a nested dict of
    the form ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # every row has the same class: pure leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no feature columns left
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature reduces the entropy. Indexing with -1 would split on the
        # class-label column itself, so fall back to a majority vote.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build the reduced label list without touching the caller's list, so the
    # same labels can be passed to classify() afterwards.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels
        )
    return myTree


def classify(inputTree, featLabels, testVec):
    """Return the class label that *inputTree* assigns to *testVec*.

    *featLabels* maps feature names to positions in *testVec*. Raises
    KeyError when *testVec* holds a feature value the tree has no branch for,
    and ValueError when a feature name in the tree is not in *featLabels*.
    """
    firstStr = next(iter(inputTree))  # dict views are not indexable on Python 3
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        return classify(valueOfFeat, featLabels, testVec)
    return valueOfFeat


def storeTree(inputTree, filename):
    """Pickle *inputTree* to the file *filename*."""
    # pickle produces bytes, so the file must be opened in binary mode.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)


def grabTree(filename):
    """Load and return a tree previously written by storeTree()."""
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
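例如对数据集 [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']] 计算香农熵时,labelCounts 是一个 map(字典)结构,记录每个类别出现的次数:
currentLabel | labelCounts[currentLabel] | prob
yes | 2 | 0.4
no | 3 | 0.6
根据信息论中的香农熵公式 $H = -\sum_i p_i \log_2 p_i$,可以得到 $H = -(0.4\log_2 0.4 + 0.6\log_2 0.6) \approx 0.971$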
例如:myDat 为 [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']],调用 splitDataSet 并传入 (myDat, 0, 1),即取出第 0 个特征等于 1 的样本并去掉该特征列,输出 [[1, 'yes'], [1, 'yes'], [0, 'no']]
例如:myDat 为 [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']],调用 chooseBestFeatureToSplit 并传入 (myDat)。第一次按第一个特征划分:分别取值为 1 和值为 0 的子集,按子集所占比例加权求和,得到该划分下的信息熵;第二次按第二个特征划分:同样分别取值为 1 和值为 0 的子集,得到该划分下的信息熵……最后选取信息增益最大(即划分后加权信息熵最小)的特征。
时间: 2024-09-29 09:02:02