# 相关文章 (related articles): https://github.com/JohnLangford/vowpal_wabbit/wiki
# https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format
# Download the Titanic passenger data (tab-separated text) from the web.
# stringsAsFactors = TRUE keeps PClass/Sex as factors on R >= 4.0 as well,
# matching the str() output recorded below (it was the default before R 4.0).
titanicDF <- read.csv(
  "http://math.ucdenver.edu/RTutorial/titanic.txt",
  sep = "\t",
  stringsAsFactors = TRUE
)
# Optionally keep a local copy of the data.
# write.table(titanicDF, "titanicDF.txt", row.names = FALSE)
# write.csv(titanicDF, "titanicDF.csv", row.names = FALSE)
# Create a new variable Title from the honorific found in Name.
# The patterns are tried in order: "Mr " (the trailing space keeps it from
# matching "Mrs "), then "Mrs ", then "Miss"; any other name gets "Nothing".
titanicDF$Title <- ifelse(
  grepl("Mr ", titanicDF$Name), "Mr",
  ifelse(
    grepl("Mrs ", titanicDF$Name), "Mrs",
    ifelse(grepl("Miss", titanicDF$Name), "Miss", "Nothing")
  )
)
# Convert Title to a factor.
titanicDF$Title <- as.factor(titanicDF$Title)
head(titanicDF)
## Name PClass Age Sex
## 1 Allen, Miss Elisabeth Walton 1st 29.00 female
## 2 Allison, Miss Helen Loraine 1st 2.00 female
## 3 Allison, Mr Hudson Joshua Creighton 1st 30.00 male
## 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels) 1st 25.00 female
## 5 Allison, Master Hudson Trevor 1st 0.92 male
## 6 Anderson, Mr Harry 1st 47.00 male
## Survived Title
## 1 1 Miss
## 2 0 Miss
## 3 0 Mr
## 4 0 Mrs
## 5 1 Nothing
## 6 1 Mr
str(titanicDF)
## 'data.frame': 1313 obs. of 6 variables:
## $ Name : Factor w/ 1310 levels "Abbing, Mr Anthony",..: 22 25 26 27 24 31 45 46 50 54 ...
## $ PClass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 29 2 30 25 0.92 47 63 39 58 71 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 1 2 1 2 2 1 2 1 2 ...
## $ Survived: int 1 0 0 0 1 1 1 0 1 0 ...
## $ Title : Factor w/ 4 levels "Miss","Mr","Mrs",..: 1 1 2 3 4 2 1 2 3 2 ...
# Inspect how much data is missing.
library(dfexplore)
## Loading required package: ggplot2
dfplot(titanicDF)
sum(is.na(titanicDF$Age)) # Age is missing in 557 records
## [1] 557
# Fill the missing Age values with the median of the observed ages.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
titanicDF$Age[is.na(titanicDF$Age)] <- median(titanicDF$Age, na.rm = TRUE)
# Check again whether titanicDF still has missing values.
dfplot(titanicDF)
# Reorder the columns so that the target "Survived" comes last.
# Name is not in the selection, so it is dropped here.
titanicDF <- titanicDF[c("PClass", "Age", "Sex", "Title", "Survived")]
head(titanicDF)
## PClass Age Sex Title Survived
## 1 1st 29.00 female Miss 1
## 2 1st 2.00 female Miss 0
## 3 1st 30.00 male Mr 0
## 4 1st 25.00 female Mrs 0
## 5 1st 0.92 male Nothing 1
## 6 1st 47.00 male Mr 1
# Binarize all factors: expand every factor column into 0/1 dummy
# (indicator) columns, one per level.
library(caret)
## Loading required package: lattice
# fullRank = FALSE keeps a column for every level (no reference level is
# dropped), as the output below shows (e.g. both Sex.female and Sex.male).
titanicDummy <- dummyVars(~., data = titanicDF, fullRank = FALSE)
# Replace titanicDF with its dummy-encoded version.
titanicDF <- as.data.frame(predict(titanicDummy, newdata = titanicDF))
head(titanicDF)
## PClass.1st PClass.2nd PClass.3rd Age Sex.female Sex.male Title.Miss
## 1 1 0 0 29.00 1 0 1
## 2 1 0 0 2.00 1 0 1
## 3 1 0 0 30.00 0 1 0
## 4 1 0 0 25.00 1 0 0
## 5 1 0 0 0.92 0 1 0
## 6 1 0 0 47.00 0 1 0
## Title.Mr Title.Mrs Title.Nothing Survived
## 1 0 0 0 1
## 2 0 0 0 0
## 3 1 0 0 0
## 4 0 1 0 0
## 5 0 0 1 1
## 6 1 0 0 1
# Vowpal Wabbit input format, see https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format
# [Label] [Importance [Tag]]|Namespace Features |Namespace Features ... |Namespace Features
# Settings for building the label part of each output line.
isclassification <- TRUE # recode the outcome to -1 / 1 below
outcomeName <- "Survived" # column holding the target
labelName <- titanicDF[, "Survived"] # only tested for NULL below
weightName <- "" # name of an importance-weight column; "" means none
objDF <- titanicDF
# Every column except the outcome (and the weight column, if any) is a feature.
predictors <- names(objDF)[!names(objDF) %in% c(outcomeName, weightName)]
# LABELS & IMPORTANCE
if (is.null(labelName)) {
  # No label available: emit a placeholder label column.
  outcomeName <- "ignoreme"
  objDF[, outcomeName] <- "0 |"
} else {
  if (isclassification) {
    # everything should be -1 1 for classification
    objDF[, outcomeName] <- ifelse(objDF[, outcomeName] > 0, 1, -1)
  }
  # Turn the outcome column into the text "<label> [<weight>] |".
  if (weightName != "") {
    objDF[, outcomeName] <- paste(objDF[, outcomeName], objDF[, weightName], "|")
  } else {
    objDF[, outcomeName] <- paste(objDF[, outcomeName], "|")
  }
}
# Pair column names with data, adding one blank character before each
# variable so the columns can later be concatenated without a separator:
#   value 1     -> " name"        (bare feature name)
#   value 0     -> ""             (feature omitted)
#   other value -> " name:value"
for (i in predictors) {
  objDF[, i] <- ifelse(
    objDF[, i] == 1,
    paste0(" ", i),
    ifelse(objDF[, i] == 0, "", paste0(" ", i, ":", objDF[, i]))
  )
}
# Reorder columns: label first, then the features.
objDF <- objDF[c(outcomeName, predictors)]
head(objDF)
## Survived PClass.1st PClass.2nd PClass.3rd Age Sex.female
## 1 1 | PClass.1st Age:29 Sex.female
## 2 -1 | PClass.1st Age:2 Sex.female
## 3 -1 | PClass.1st Age:30
## 4 -1 | PClass.1st Age:25 Sex.female
## 5 1 | PClass.1st Age:0.92
## 6 1 | PClass.1st Age:47
## Sex.male Title.Miss Title.Mr Title.Mrs Title.Nothing
## 1 Title.Miss
## 2 Title.Miss
## 3 Sex.male Title.Mr
## 4 Title.Mrs
## 5 Sex.male Title.Nothing
## 6 Sex.male Title.Mr
# Write one example per line. sep = "" is deliberate: each feature string
# already carries its own leading blank. No quoting, row names or header.
write.table(
  objDF,
  "vw.txt",
  sep = "",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)