# Build a simple data frame from a tab-separated text file in the home
# directory. The path uses a forward slash: "\s" is not a valid escape
# sequence in an R string literal and makes the script fail to parse.
some_dataframe <- read.table("~/some_data.frame.txt", sep = "\t", header = TRUE)
some_dataframe
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 1 2 7 0 0 0 0 0 0 0 0 0
## 2 0 0 3 0 0 0 0 0 0 0 0
## 3 0 0 0 6 1 0 0 0 0 0 0
## 4 0 0 0 2 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 12 0 1
## 6 0 0 0 0 0 25 0 0 0 0 1
## 7 1 0 0 0 2 0 0 0 0 0 0
## 8 0 0 0 2 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 14 0 1
## 10 0 0 0 0 0 21 0 0 0 0 1
## 11 0 0 0 0 0 0 28 0 0 0 1
## 12 0 0 0 0 0 0 0 35 0 0 1
## 13 0 0 0 0 0 0 0 0 42 0 1
## 14 0 0 0 0 0 0 0 0 0 49 1
# Coerce the data frame to a plain numeric matrix and display it.
some_matrix <- data.matrix(frame = some_dataframe)
some_matrix
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## [1,] 2 7 0 0 0 0 0 0 0 0 0
## [2,] 0 0 3 0 0 0 0 0 0 0 0
## [3,] 0 0 0 6 1 0 0 0 0 0 0
## [4,] 0 0 0 2 0 0 0 0 0 0 0
## [5,] 0 0 0 0 0 0 0 0 12 0 1
## [6,] 0 0 0 0 0 25 0 0 0 0 1
## [7,] 1 0 0 0 2 0 0 0 0 0 0
## [8,] 0 0 0 2 0 0 0 0 0 0 0
## [9,] 0 0 0 0 0 0 0 0 14 0 1
## [10,] 0 0 0 0 0 21 0 0 0 0 1
## [11,] 0 0 0 0 0 0 28 0 0 0 1
## [12,] 0 0 0 0 0 0 0 35 0 0 1
## [13,] 0 0 0 0 0 0 0 0 42 0 1
## [14,] 0 0 0 0 0 0 0 0 0 49 1
# Clearly this looks much like the data.frame. To turn it into a sparse
# matrix, load the Matrix package and call Matrix() with sparse = TRUE
# (spelled out, because the shorthand `T` is an ordinary variable that can
# be reassigned).
library(Matrix)
print(Matrix(some_matrix, sparse = TRUE))
## 14 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names ‘c1‘, ‘c2‘, ‘c3‘ ... ]]
##
## [1,] 2 7 . . . . . . . . .
## [2,] . . 3 . . . . . . . .
## [3,] . . . 6 1 . . . . . .
## [4,] . . . 2 . . . . . . .
## [5,] . . . . . . . . 12 . 1
## [6,] . . . . . 25 . . . . 1
## [7,] 1 . . . 2 . . . . . .
## [8,] . . . 2 . . . . . . .
## [9,] . . . . . . . . 14 . 1
## [10,] . . . . . 21 . . . . 1
## [11,] . . . . . . 28 . . . 1
## [12,] . . . . . . . 35 . . 1
## [13,] . . . . . . . . 42 . 1
## [14,] . . . . . . . . . 49 1
# 在这里,它只保留了非零值。
# Next, split the data frame into two parts: about 70% of the rows become the
# training set and the remaining rows become the test set.
set.seed(2)  # fix the RNG so the sampled row indices are reproducible
# NOTE: `split` shares its name with base::split(); the variable name is kept
# unchanged in case other code refers to it.
split <- sample(nrow(some_dataframe), floor(0.7 * nrow(some_dataframe)))
train <- some_dataframe[split, ]
test <- some_dataframe[-split, ]
train
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 3 0 0 0 6 1 0 0 0 0 0 0
## 10 0 0 0 0 0 21 0 0 0 0 1
## 7 1 0 0 0 2 0 0 0 0 0 0
## 2 0 0 3 0 0 0 0 0 0 0 0
## 13 0 0 0 0 0 0 0 0 42 0 1
## 9 0 0 0 0 0 0 0 0 14 0 1
## 11 0 0 0 0 0 0 28 0 0 0 1
## 6 0 0 0 0 0 25 0 0 0 0 1
## 14 0 0 0 0 0 0 0 0 0 49 1
# Display the held-out rows (those not sampled into the training set).
test
## c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 outcome
## 1 2 7 0 0 0 0 0 0 0 0 0
## 4 0 0 0 2 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 12 0 1
## 8 0 0 0 2 0 0 0 0 0 0 0
## 12 0 0 0 0 0 0 0 35 0 0 1
# Then build sparse design matrices from the first ten (predictor) columns
# with sparse.model.matrix(), and display the training one.
train_sparse <- sparse.model.matrix(~ ., data = train[, 1:10])
test_sparse <- sparse.model.matrix(~ ., data = test[, 1:10])
train_sparse
## 9 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names ‘(Intercept)‘, ‘c1‘, ‘c2‘ ... ]]
##
## 3 1 . . . 6 1 . . . . .
## 10 1 . . . . . 21 . . . .
## 7 1 1 . . . 2 . . . . .
## 2 1 . . 3 . . . . . . .
## 13 1 . . . . . . . . 42 .
## 9 1 . . . . . . . . 14 .
## 11 1 . . . . . . 28 . . .
## 6 1 . . . . . 25 . . . .
## 14 1 . . . . . . . . . 49
# Display the sparse design matrix built from the test rows.
test_sparse
## 5 x 11 sparse Matrix of class "dgCMatrix"
## [[ suppressing 11 column names ‘(Intercept)‘, ‘c1‘, ‘c2‘ ... ]]
##
## 1 1 2 7 . . . . . . . .
## 4 1 . . . 2 . . . . . .
## 5 1 . . . . . . . . 12 .
## 8 1 . . . 2 . . . . . .
## 12 1 . . . . . . . 35 . .
library(glmnet)
## Loaded glmnet 1.9-8
# Fit a glmnet model (default family) on the sparse training matrix, using
# column 11 (outcome) as the response.
fit <- glmnet(train_sparse, train[, 11])
# Predict on the test matrix for every lambda on the fitted path.
# The earlier form of this call passed test[, 11] as the third positional
# argument, which predict() for glmnet fits takes as `s` (the penalty values),
# so the test labels were silently used as lambdas. type = "class" was also
# replaced by "response", since "class" only applies to classification
# families.
# NOTE: the recorded output just below was produced by the earlier call and
# will not match what this corrected call prints.
pred <- predict(fit, newx = test_sparse, type = "response")
print(head(pred[, 1:5]))
## 1 2 3 4 5
## 1 0.9898 0.9898 0.6667 0.9898 0.6667
## 4 0.8306 0.8306 0.6667 0.8306 0.6667
## 5 0.9898 0.9898 0.6667 0.9898 0.6667
## 8 0.8306 0.8306 0.6667 0.8306 0.6667
## 12 0.9898 0.9898 0.6667 0.9898 0.6667
# Use cv.glmnet (3-fold cross-validation) to find the best lambda/penalty,
# then predict on the test matrix at that lambda with the earlier fit.
cv <- cv.glmnet(x = train_sparse, y = train[, 11], nfolds = 3)
pred <- predict(
  fit,
  newx = test_sparse,
  s = cv[["lambda.min"]],
  type = "response"
)
print(names(cv))
## [1] "lambda" "cvm" "cvsd" "cvup" "cvlo"
## [6] "nzero" "name" "glmnet.fit" "lambda.min" "lambda.1se"
# Show the test-set predictions currently stored in `pred`.
print(pred)
## 1
## 1 0.9898
## 4 0.8306
## 5 0.9898
## 8 0.8306
## 12 0.9898
# receiver operating characteristic (ROC curves)
library(pROC)
## Type ‘citation("pROC")‘ for a citation.
##
## Attaching package: ‘pROC‘
##
## 下列对象被屏蔽了from ‘package:glmnet‘:
##
## auc
##
## 下列对象被屏蔽了from ‘package:stats‘:
##
## cov, smooth, var
# Build the ROC curve for the test outcomes against the predictions and print
# the area under it. predict() returned `pred` as a one-column matrix, while
# roc() expects a plain numeric vector as predictor, so it is flattened first.
# NOTE: `auc` holds the whole roc object and shares its name with pROC's
# auc() function; the variable name is kept unchanged for compatibility.
auc <- roc(response = test[, 11], predictor = as.vector(pred))
print(auc$auc)
## Area under the curve: 0.833
# How does the sparse model matrix deal with categorical data? Add a `mood`
# feature with two levels.
# `mood` is created explicitly as a factor: since R 4.0.0 data.frame() no
# longer converts strings to factors by default, and without this
# levels(cat_dataframe$mood) would return NULL.
cat_dataframe <- data.frame(
  some_dataframe,
  mood = factor(c(
    "happy", "happy", "happy", "happy", "sad", "sad", "happy", "happy",
    "sad", "sad", "sad", "sad", "sad", "sad"
  ))
)
# Reorder the columns so that mood sits just before outcome.
cat_dataframe <- cat_dataframe[, c(colnames(cat_dataframe)[1:10], "mood", "outcome")]
sparse.model.matrix(~ ., cat_dataframe)
## 14 x 13 sparse Matrix of class "dgCMatrix"
## [[ suppressing 13 column names ‘(Intercept)‘, ‘c1‘, ‘c2‘ ... ]]
##
## 1 1 2 7 . . . . . . . . . .
## 2 1 . . 3 . . . . . . . . .
## 3 1 . . . 6 1 . . . . . . .
## 4 1 . . . 2 . . . . . . . .
## 5 1 . . . . . . . . 12 . 1 1
## 6 1 . . . . . 25 . . . . 1 1
## 7 1 1 . . . 2 . . . . . . .
## 8 1 . . . 2 . . . . . . . .
## 9 1 . . . . . . . . 14 . 1 1
## 10 1 . . . . . 21 . . . . 1 1
## 11 1 . . . . . . 28 . . . 1 1
## 12 1 . . . . . . . 35 . . 1 1
## 13 1 . . . . . . . . 42 . 1 1
## 14 1 . . . . . . . . . 49 1 1
# Levels of the added mood column.
print(levels(cat_dataframe[["mood"]]))
## [1] "happy" "sad"
# Dimensions of the data frame itself.
dim(cat_dataframe)
## [1] 14 12
# Dimensions of the sparse design matrix built from all of its columns.
dim(sparse.model.matrix(~ ., data = cat_dataframe))
## [1] 14 13