#----------------------------------------
# Purpose: demonstrate the C5.0 modelling workflow
# Dataset: credit data (University of Hamburg credit model)
#----------------------------------------

library(C50)
library(gmodels)

# Input locations; edit these to point at local copies of the data.
credit_csv <- "/Users/chenyangang/R语言/data/credit.csv"
mushrooms_csv <- "mushrooms.csv"

# Print a cross tabulation of actual versus predicted classes. Chi-square
# contributions and row/column proportions are switched off.
print_confusion <- function(actual, predicted) {
  CrossTable(
    actual, predicted,
    prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
    dnn = c("actual default", "predicted default")
  )
}

#### Part 1: Decision Trees -------------------

## Step 1: Collecting the data ----

# import the CSV file
credit <- read.csv(credit_csv, stringsAsFactors = TRUE)

# C5.0 needs a factor outcome; convert in case the CSV stores the class as
# numeric codes.
if (!is.factor(credit$default)) {
  credit$default <- factor(credit$default)
}

## Step 2: Exploring and preparing the data ----

# inspect two of the categorical features
table(credit$checking_balance)
table(credit$savings_balance)

# five-number summaries of two numeric features
summary(credit$months_loan_duration)
summary(credit$amount)

# class distribution
table(credit$default)

# Shuffle the rows before splitting into training and test data. The seed
# makes the analysis repeatable.
set.seed(12345)
n_obs <- nrow(credit)
credit_rand <- credit[order(runif(n_obs)), ]

# compare the original and the shuffled data: same values, different order
summary(credit$amount)
summary(credit_rand$amount)
head(credit$amount)
head(credit_rand$amount)

# split the data: first 90% of the shuffled rows for training, rest for test
n_train <- round(0.9 * n_obs)
credit_train <- head(credit_rand, n_train)
credit_test <- tail(credit_rand, n_obs - n_train)

# class proportions in each partition
prop.table(table(credit_train$default))
prop.table(table(credit_test$default))

## Step 3: Training a model on the data ----

#---------------------------------------------
# Building the classifier:
#   m <- C5.0(train, class, trials = 1, costs = NULL)
#   train:  a data frame containing the training data
#   class:  a factor vector with the class of each training row
#   trials: optional number of boosting iterations (default 1)
#   costs:  optional matrix of costs associated with each type of error
#   Returns a C5.0 model object that can be used for prediction.
#
# Making predictions:
#   p <- predict(m, test, type = "class")
#   m:    a model trained by C5.0(train, class, trials = 1, costs = NULL)
#   test: a data frame of test data with the same features as the
#         training data used to build the classifier
#   type: either "class" or "prob", selecting the most likely class or
#         the raw predicted probabilities
#   Returns a vector of predicted classes or raw predicted probabilities,
#   depending on type.
#
# example:
#   credit_model <- C5.0(credit_train, loan_default)
#   credit_prediction <- predict(credit_model, credit_test)
#----------------------------------------------

# Predictors are every column except the class label, selected by name so
# the script does not depend on the label's column position.
predictors <- setdiff(names(credit), "default")

# build the decision tree
credit_model <- C5.0(credit_train[predictors], credit_train$default)

# display basic facts about the tree
credit_model

# display the detailed model
summary(credit_model)

## Step 4: Evaluating model performance ----

# create a factor vector of predictions on test data
credit_pred <- predict(credit_model, credit_test)

# cross tabulation of predicted versus actual classes
print_confusion(credit_test$default, credit_pred)

## Step 5: Improving model performance ----

## Boosting the accuracy of decision trees

# boosted decision tree with 10 trials
credit_boost10 <- C5.0(
  credit_train[predictors], credit_train$default,
  trials = 10
)
credit_boost10
summary(credit_boost10)

credit_boost_pred10 <- predict(credit_boost10, credit_test)
print_confusion(credit_test$default, credit_boost_pred10)

# boosted decision tree with 100 trials (not shown in text)
credit_boost100 <- C5.0(
  credit_train[predictors], credit_train$default,
  trials = 100
)
credit_boost_pred100 <- predict(credit_boost100, credit_test)
print_confusion(credit_test$default, credit_boost_pred100)

## Making some mistakes more costly than others

# Create a cost matrix labelled with the class levels. C5.0 reads rows as
# predicted and columns as actual classes, so the 4 in [1, 2] penalises
# predicting the first level when the second is the truth.
# NOTE(review): confirm the level order matches the intended costly error.
class_levels <- levels(credit_train$default)
error_cost <- matrix(
  c(0, 1, 4, 0),
  nrow = 2,
  dimnames = list(predicted = class_levels, actual = class_levels)
)
error_cost

# apply the cost matrix to the tree
credit_cost <- C5.0(
  credit_train[predictors], credit_train$default,
  costs = error_cost
)
credit_cost_pred <- predict(credit_cost, credit_test)
print_confusion(credit_test$default, credit_cost_pred)

#### Part 2: Rule Learners -------------------

## Example: Identifying Poisonous Mushrooms ----

## Step 2: Exploring and preparing the data ----
mushrooms <- read.csv(mushrooms_csv, stringsAsFactors = TRUE)

# examine the structure of the data frame
str(mushrooms)

# drop the veil_type feature
mushrooms$veil_type <- NULL

# examine the class distribution
table(mushrooms$type)

## Step 3: Training a model on the data ----

# Loaded here rather than at the top so Part 1 runs without RWeka installed.
library(RWeka)

# train OneR() on the data
mushroom_1R <- OneR(type ~ ., data = mushrooms)

## Step 4: Evaluating model performance ----
mushroom_1R
summary(mushroom_1R)

## Step 5: Improving model performance ----
mushroom_JRip <- JRip(type ~ ., data = mushrooms)
mushroom_JRip
summary(mushroom_JRip)

# Rule Learner Using C5.0 Decision Trees (not in text)
mushroom_c5rules <- C5.0(
  type ~ odor + gill_size,
  data = mushrooms,
  rules = TRUE
)
summary(mushroom_c5rules)
# Timestamp: 2024-10-14 11:23:08