Weka源码学习-J48（一）-决策树的剪枝 / 憋错料

#J48.class 1 /**
 2    * Generates the classifier.
 3    *
 4    * @param instances the data to train the classifier with
 5    * @throws Exception if classifier can‘t be built successfully
 6    */
 7   @Override
 8   public void buildClassifier(Instances instances) throws Exception {
 9
10     ModelSelection modSelection;
11
12     if (m_binarySplits) {//是否为二分树
13       modSelection = new BinC45ModelSelection(m_minNumObj, instances,
14         m_useMDLcorrection, m_doNotMakeSplitPointActualValue);
15     } else {
16       modSelection = new C45ModelSelection(m_minNumObj, instances,
17         m_useMDLcorrection, m_doNotMakeSplitPointActualValue);
18     }
19     if (!m_reducedErrorPruning) {//构造相应的树
20       m_root = new C45PruneableClassifierTree(modSelection, !m_unpruned, m_CF,
21         m_subtreeRaising, !m_noCleanup, m_collapseTree);
22     } else {
23       m_root = new PruneableClassifierTree(modSelection, !m_unpruned,
24         m_numFolds, !m_noCleanup, m_Seed);
25     }
26     m_root.buildClassifier(instances);//在树上真正构建模型
27     if (m_binarySplits) {
28       ((BinC45ModelSelection) modSelection).cleanup();
29     } else {
30       ((C45ModelSelection) modSelection).cleanup();
31     }
32   }

#C45PruneableClassifierTree.class 1 /**
 2    * Method for building a pruneable classifier tree.
 3    *
 4    * @param data the data for building the tree
 5    * @throws Exception if something goes wrong
 6    */
 7   public void buildClassifier(Instances data) throws Exception {
 8
 9     // can classifier tree handle the data?
10     getCapabilities().testWithFail(data);//testWithFail检测传入的data是否能用该分类器进行分类，例如C45只能对要分类的属性的取值是离散值的Instances进行分类
11
12     // remove instances with missing class
13     data = new Instances(data);
14     data.deleteWithMissingClass();//清理instances里面的无效行（相应分类属性为空的行）
15
16    buildTree(data, m_subtreeRaising || !m_cleanup);
17    if (m_collapseTheTree) {
18      collapse();
19    }
20    if (m_pruneTheTree) {
21      prune();
22    }
23    if (m_cleanup) {
24      cleanup(new Instances(data, 0));
25    }
26   }

#ClassifierTree.class 1 /**
 2    * Builds the tree structure.
 3    *
 4    * @param data the data for which the tree structure is to be generated.
 5    * @param keepData is training data to be kept?
 6    * @throws Exception if something goes wrong
 7    */
 8   public void buildTree(Instances data, boolean keepData) throws Exception {
 9
10     Instances[] localInstances;
11
12     if (keepData) {//根据传入参数来判断是否应该持有数据
13       m_train = data;
14     }
15     m_test = null;
16     m_isLeaf = false;
17     m_isEmpty = false;
18     m_sons = null;
19     m_localModel = m_toSelectModel.selectModel(data);//根据m_toSelectModel来选择一个模型并把传入的数据集按相应的规则分成不同的subSet
20     if (m_localModel.numSubsets() > 1) {//判断subSet的数量，大于1
21       localInstances = m_localModel.split(data);//根据localModel将data分成不同的subInstances
22       data = null;
23       m_sons = new ClassifierTree[m_localModel.numSubsets()];//为每一个subInstances建立新的ClassifierTree节点作为自己的孩子节点
24       for (int i = 0; i < m_sons.length; i++) {
25         m_sons[i] = getNewTree(localInstances[i]);//调用getNewTree函数来为每一个subInstances构造新的tree
26         localInstances[i] = null;
27       }
28     } else {//subSet的数量等于1，那么它是叶子节点
29       m_isLeaf = true;
30       if (Utils.eq(data.sumOfWeights(), 0)) {
31         m_isEmpty = true;
32       }
33       data = null;
34     }
35   }

#C45PruneableClassifierTree.class 1 /**
 2    * Collapses a tree to a node if training error doesn‘t increase.
 3    */
 4   public final void collapse(){
 5
 6     double errorsOfSubtree;//该节点孩子节点的误差总和
 7     double errorsOfTree;//该节点的误差
 8     int i;
 9
10     if (!m_isLeaf){
11       errorsOfSubtree = getTrainingErrors();//估计孩子节点错误的总和
12       errorsOfTree = localModel().distribution().numIncorrect();//估计当前节点的错误，即获得当前训练集上的一个分布，然后找出该分布里数量最多的那个属性的数量，认为是“正确的”，则其余的就是错误的。
13       if (errorsOfSubtree >= errorsOfTree-1E-3){//如果该节点孩子节点并不能提高这颗分类树的准确度，则把这些孩子节点删除。
14
15     // Free adjacent trees
16     m_sons = null;
17     m_isLeaf = true;
18
19     // Get NoSplit Model for tree.
20     m_localModel = new NoSplit(localModel().distribution());
21       }else//否则就在每个孩子上递归此方法
22     for (i=0;i<m_sons.length;i++)
23       son(i).collapse();
24     }
25   }

#C45PruneableClassifierTree.class

 1 /**
 2    * Prunes a tree using C4.5‘s pruning procedure.
 3    *
 4    * @throws Exception if something goes wrong
 5    */
 6   public void prune() throws Exception {
 7
 8     double errorsLargestBranch;//如果该节点有孩子节点，那么该节点的孩子节点中分到的数据最多的节点的分类错误的用例数
 9     double errorsLeaf;//如果该节点是叶子节点，该节点分类错误的用例数
10     double errorsTree;//该节点目前情况下，分类错误用例数
11     int indexOfLargestBranch;////分到最多数据的孩子节点在son数组中的index
12     C45PruneableClassifierTree largestBranch;
13     int i;
14
15     if (!m_isLeaf){//如果当前节点不是叶子节点，则先对其所有孩子节点进行剪枝。
16
17       // Prune all subtrees.
18       for (i=0;i<m_sons.length;i++)
19     son(i).prune();
20
21       // Compute error for largest branch
22       indexOfLargestBranch = localModel().distribution().maxBag();
23       if (m_subtreeRaising) {///m_subtreeRaising代表可否使用该树的子树去替代该树，如果可以，就去计算最大的子树的错误数量
24     errorsLargestBranch = son(indexOfLargestBranch).
25       getEstimatedErrorsForBranch((Instances)m_train);//估计错误数量，根据分布做一个统计（还要加一个基于m_CF的修正），如果不是叶子节点则递归的进行统计。
26       } else {///否则就简单的标Double.Max_Value
27     errorsLargestBranch = Double.MAX_VALUE;
28       }
29
30       // Compute error if this Tree would be leaf
31       errorsLeaf =
32     getEstimatedErrorsForDistribution(localModel().distribution());////如果该节点是叶子节点，该节点分类错误的用例数
33
34       // Compute error for the whole subtree
35       errorsTree = getEstimatedErrors();//估计该节点目前情况下，分类错误用例数
36
37       // Decide if leaf is best choice.
38       if (Utils.smOrEq(errorsLeaf,errorsTree+0.1) &&
39       Utils.smOrEq(errorsLeaf,errorsLargestBranch+0.1)){//如果当前节点作为叶子节点的错误量比目前情况低，并且比最大的子树的错误量也低，那么就把当前节点作为叶子节点。
40
41     // Free son Trees
42     m_sons = null;
43     m_isLeaf = true;
44
45     // Get NoSplit Model for node.
46     m_localModel = new NoSplit(localModel().distribution());
47     return;//直接返回
48       }
49
50       // Decide if largest branch is better choice
51       // than whole subtree.
52       if (Utils.smOrEq(errorsLargestBranch,errorsTree+0.1)){//如果当前节点的错误用例数大于最大子树，则用最大子树替代当前节点。
53     largestBranch = son(indexOfLargestBranch);
54     m_sons = largestBranch.m_sons;
55     m_localModel = largestBranch.localModel();
56     m_isLeaf = largestBranch.m_isLeaf;
57     newDistribution(m_train);
58     prune();
59       }
60     }
61   }

#PruneableClassifierTree.class

 1 /**
 2    * Method for building a pruneable classifier tree.
 3    *
 4    * @param data the data to build the tree from
 5    * @throws Exception if tree can‘t be built successfully
 6    */
 7   public void buildClassifier(Instances data)
 8        throws Exception {
 9
10     // can classifier tree handle the data?
11     getCapabilities().testWithFail(data);
12
13     // remove instances with missing class
14     data = new Instances(data);
15     data.deleteWithMissingClass();
16
17    Random random = new Random(m_seed);
18    data.stratify(numSets);
19    buildTree(data.trainCV(numSets, numSets - 1, random),
20          data.testCV(numSets, numSets - 1), !m_cleanup);//训练集和测试集一起传入
21    if (pruneTheTree) {
22      prune();
23    }
24    if (m_cleanup) {
25      cleanup(new Instances(data, 0));
26    }
27   }

#ClassifierTree.class 

 1  /**
 2    * Builds the tree structure with hold out set
 3    *
 4    * @param train the data for which the tree structure is to be generated.
 5    * @param test the test data for potential pruning
 6    * @param keepData is training Data to be kept?
 7    * @throws Exception if something goes wrong
 8    */
 9   public void buildTree(Instances train, Instances test, boolean keepData)
10     throws Exception {
11
12     Instances[] localTrain, localTest;
13     int i;
14
15     if (keepData) {
16       m_train = train;
17     }
18     m_isLeaf = false;
19     m_isEmpty = false;
20     m_sons = null;
21     m_localModel = m_toSelectModel.selectModel(train, test);
22     m_test = new Distribution(test, m_localModel);
23     if (m_localModel.numSubsets() > 1) {
24       localTrain = m_localModel.split(train);
25       localTest = m_localModel.split(test);//selectModel时把test传进去
26       train = null;
27       test = null;
28       m_sons = new ClassifierTree[m_localModel.numSubsets()];
29       for (i = 0; i < m_sons.length; i++) {
30         m_sons[i] = getNewTree(localTrain[i], localTest[i]);
31         localTrain[i] = null;
32         localTest[i] = null;
33       }
34     } else {
35       m_isLeaf = true;
36       if (Utils.eq(train.sumOfWeights(), 0)) {
37         m_isEmpty = true;
38       }
39       train = null;
40       test = null;
41     }
42   }

#PruneableClassifierTree.class 1  /**
 2    * Prunes a tree.
 3    *
 4    * @throws Exception if tree can‘t be pruned successfully
 5    */
 6   public void prune() throws Exception {
 7
 8     if (!m_isLeaf) {
 9
10       // Prune all subtrees.
11       for (int i = 0; i < m_sons.length; i++)
12     son(i).prune();
13
14       // Decide if leaf is best choice.
15       if (Utils.smOrEq(errorsForLeaf(),errorsForTree())) {
16
17     // Free son Trees
18     m_sons = null;
19     m_isLeaf = true;
20
21     // Get NoSplit Model for node.
22     m_localModel = new NoSplit(localModel().distribution());
23       }
24     }
25   }

剪枝过程：根据已有数据集的分布，判断该树、该树的最大子树、以及该树作为叶子节点时的正确率，在此基础上进行剪枝。

时间： 2024-08-08 04:58:08

Weka源码学习-J48（一）-决策树的剪枝

Weka源码学习-J48（一）-决策树的剪枝的相关文章

weka源码在eclipse的配置和VerifyError异常的解决

FireMonkey 源码学习（5）

jquery源码学习

Hadoop源码学习笔记(1) ——第二季开始——找到Main函数及读一读Configure类

HSQLDB源码学习——数据库安装启动及JDBC连接

lodash源码学习(10)

lodash源码学习(2)

jQuery源码学习感想

Redis源码学习-Lua脚本