"""Shannon entropy demo: measure how mixed the class labels of a data set are."""

from math import log, log2


def calcShannonEnt(dataSet, *, verbose=True):
    """Return the Shannon entropy, in bits, of the class labels in *dataSet*.

    Args:
        dataSet: A sequence of rows (feature vectors). The last element of
            each row is taken as that row's class label.
        verbose: When true (the default), print the running label counts,
            each class probability, each ``p * log2(p)`` term and the running
            entropy, so the computation can be followed step by step.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the distinct labels, as a
        float. An empty data set has no labels and yields ``0.0``.
    """
    numEntries = len(dataSet)
    if verbose:
        print("样本总数:" + str(numEntries))

    # Number of occurrences of each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the last column holds the class label
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
        if verbose:
            print("当前labelCounts状态:" + str(labelCounts))

    shannonEnt = 0.0
    for key, count in labelCounts.items():
        prob = count / numEntries  # probability of this class label
        # log2 is used instead of log(prob, 2): the latter divides two
        # natural logs and can be off by one ulp for exact powers of two.
        term = prob * log2(prob)
        if verbose:
            print(str(key) + "类别的概率:" + str(prob))
            print(term)
        shannonEnt -= term
        if verbose:
            print("熵值:" + str(shannonEnt))
    return shannonEnt


def createDataSet():
    """Return the sample ``(dataSet, labels)`` pair used by the demo.

    Each row of ``dataSet`` is ``[feature, feature, class_label]``; ``labels``
    holds the two feature names.
    """
    dataSet = [
        # Alternative sample kept for experimentation:
        # [1, 1, 'yes'],
        # [1, 0, 'yes'],
        # [1, 1, 'no'],
        # [0, 1, 'no'],
        # [0, 1, 'no'],
        # Rows below were added arbitrarily to watch the entropy change:
        # the more mixed and conflicting the labels, the larger the entropy.
        # [1, 1, 'no'],
        # [1, 1, 'no'],
        # [1, 1, 'no'],
        # [1, 1, 'no'],
        # [1, 1, 'maybe'],
        # [1, 1, 'maybe1'],
        #
        # The eight rows below are an extreme case that makes the effect
        # clearer: every row has its own class label.
        [1, 1, '1'],
        [1, 1, '2'],
        [1, 1, '3'],
        [1, 1, '4'],
        [1, 1, '5'],
        [1, 1, '6'],
        [1, 1, '7'],
        [1, 1, '8'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def testCalcShannonEnt():
    """Print the entropy of the sample data set, including the trace."""
    myDat, labels = createDataSet()
    print(calcShannonEnt(myDat))


if __name__ == '__main__':
    testCalcShannonEnt()
    # Shows how strongly negative log2 becomes for a tiny probability.
    print(log(0.000002, 2))
输出结果
样本总数:8
当前labelCounts状态:{'1': 1}
当前labelCounts状态:{'1': 1, '2': 1}
当前labelCounts状态:{'1': 1, '2': 1, '3': 1}
当前labelCounts状态:{'1': 1, '2': 1, '3': 1, '4': 1}
当前labelCounts状态:{'1': 1, '2': 1, '3': 1, '4': 1, '5': 1}
当前labelCounts状态:{'1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1}
当前labelCounts状态:{'1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1}
当前labelCounts状态:{'1': 1, '2': 1, '3': 1, '4': 1, '5': 1, '6': 1, '7': 1, '8': 1}
1类别的概率:0.125
-0.375
熵值:0.375
2类别的概率:0.125
-0.375
熵值:0.75
3类别的概率:0.125
-0.375
熵值:1.125
4类别的概率:0.125
-0.375
熵值:1.5
5类别的概率:0.125
-0.375
熵值:1.875
6类别的概率:0.125
-0.375
熵值:2.25
7类别的概率:0.125
-0.375
熵值:2.625
8类别的概率:0.125
-0.375
熵值:3.0
3.0
-18.931568569324174
[Finished in 1.3s]
原文地址:https://www.cnblogs.com/Sabre/p/8400744.html
时间: 2024-10-11 23:05:45