# Correlation of 1..5 with: itself, its reverse, and two perturbed copies.
cor(seq_len(5), seq_len(5))
## [1] 1
cor(seq_len(5), rev(seq_len(5)))
## [1] -1
cor(seq_len(5), c(1, 2, 3, 4, 4))
## [1] 0.9701
cor(seq_len(5), c(1, 2, 3, 1, 4))
## [1] 0.6063
library(RCurl)
## Loading required package: bitops
# Download the UCI "adult" data file. It has no header row, so the columns
# start out with the default names V1, V2, ...
urlfile <- "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# NOTE(review): ssl.verifypeer = FALSE switches off certificate verification.
# It is kept as-is here; drop it if the URL is ever changed to https.
x <- getURL(urlfile, ssl.verifypeer = FALSE)
# Parse directly from the downloaded string (no textConnection is left open).
# stringsAsFactors = TRUE is explicit because read.csv() stopped creating
# factors by default in R 4.0, and the text columns are treated as factors
# later in this script.
adults <- read.csv(text = x, header = FALSE, stringsAsFactors = TRUE)
head(adults, 2)
## V1 V2 V3 V4 V5 V6
## 1 39 State-gov 77516 Bachelors 13 Never-married
## 2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## V7 V8 V9 V10 V11 V12 V13 V14
## 1 Adm-clerical Not-in-family White Male 2174 0 40 United-States
## 2 Exec-managerial Husband White Male 0 0 13 United-States
## V15
## 1 <=50K
## 2 <=50K
# Rename the columns
names(adults) <- c(
  "Age", "Workclass", "FinalWeight", "Education", "EducationNumer",
  "MaritalStatus", "Occupation", "Relationship", "Race", "Sex",
  "CapitalGain", "CapitalLoss", "HoursWeek", "NativeCountry", "Income"
)
head(adults, n = 2)
## Age Workclass FinalWeight Education EducationNumer
## 1 39 State-gov 77516 Bachelors 13
## 2 50 Self-emp-not-inc 83311 Bachelors 13
## MaritalStatus Occupation Relationship Race Sex
## 1 Never-married Adm-clerical Not-in-family White Male
## 2 Married-civ-spouse Exec-managerial Husband White Male
## CapitalGain CapitalLoss HoursWeek NativeCountry Income
## 1 2174 0 40 United-States <=50K
## 2 0 0 13 United-States <=50K
# Recode Income as a 0/1 indicator: 0 for the label " <=50K" (note the
# leading space in the raw value), 1 for every other label.
adults$Income <- ifelse(adults$Income != " <=50K", 1, 0)
str(adults)
## ‘data.frame‘: 32561 obs. of 15 variables:
## $ Age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ Workclass : Factor w/ 9 levels " ?"," Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
## $ FinalWeight : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ Education : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ EducationNumer: int 13 13 9 7 13 14 5 9 14 13 ...
## $ MaritalStatus : Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ Occupation : Factor w/ 15 levels " ?"," Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
## $ Relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ Race : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ Sex : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ CapitalGain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ CapitalLoss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HoursWeek : int 40 13 40 40 40 40 16 45 50 40 ...
## $ NativeCountry : Factor w/ 42 levels " ?"," Cambodia",..: 40 40 40 40 6 40 24 40 40 40 ...
## $ Income : num 0 0 0 0 0 0 0 1 1 1 ...
# Convert the factor variables into dummy (indicator) variables
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Build the dummy-variable encoder over all columns, apply it to the same
# data, and wrap the result in a data frame (data.frame()'s default name
# checking is what produces the dotted column names in the output below).
dmy <- dummyVars("~.", data = adults)
adultsTrsf <- data.frame(
  predict(dmy, newdata = adults)
)
# Compare dimensions before and after encoding.
dim(adults)
## [1] 32561 15
dim(adultsTrsf)
## [1] 32561 109
head(adultsTrsf, n = 6L)
## Age Workclass... Workclass..Federal.gov Workclass..Local.gov
## 1 39 0 0 0
## 2 50 0 0 0
## 3 38 0 0 0
## 4 53 0 0 0
## 5 28 0 0 0
## 6 37 0 0 0
## Workclass..Never.worked Workclass..Private Workclass..Self.emp.inc
## 1 0 0 0
## 2 0 0 0
## 3 0 1 0
## 4 0 1 0
## 5 0 1 0
## 6 0 1 0
## Workclass..Self.emp.not.inc Workclass..State.gov Workclass..Without.pay
## 1 0 1 0
## 2 1 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## FinalWeight Education..10th Education..11th Education..12th
## 1 77516 0 0 0
## 2 83311 0 0 0
## 3 215646 0 0 0
## 4 234721 0 1 0
## 5 338409 0 0 0
## 6 284582 0 0 0
## Education..1st.4th Education..5th.6th Education..7th.8th Education..9th
## 1 0 0 0 0
## 2 0 0 0 0
## 3 0 0 0 0
## 4 0 0 0 0
## 5 0 0 0 0
## 6 0 0 0 0
## Education..Assoc.acdm Education..Assoc.voc Education..Bachelors
## 1 0 0 1
## 2 0 0 1
## 3 0 0 0
## 4 0 0 0
## 5 0 0 1
## 6 0 0 0
## Education..Doctorate Education..HS.grad Education..Masters
## 1 0 0 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 1
## Education..Preschool Education..Prof.school Education..Some.college
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Education.Numer MaritalStatus..Divorced MaritalStatus..Married.AF.spouse
## 1 13 0 0
## 2 13 0 0
## 3 9 1 0
## 4 7 0 0
## 5 13 0 0
## 6 14 0 0
## MaritalStatus..Married.civ.spouse MaritalStatus..Married.spouse.absent
## 1 0 0
## 2 1 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 1 0
## MaritalStatus..Never.married MaritalStatus..Separated
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## MaritalStatus..Widowed Occupation... Occupation..Adm.clerical
## 1 0 0 1
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## Occupation..Armed.Forces Occupation..Craft.repair
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Occupation..Exec.managerial Occupation..Farming.fishing
## 1 0 0
## 2 1 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 1 0
## Occupation..Handlers.cleaners Occupation..Machine.op.inspct
## 1 0 0
## 2 0 0
## 3 1 0
## 4 1 0
## 5 0 0
## 6 0 0
## Occupation..Other.service Occupation..Priv.house.serv
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Occupation..Prof.specialty Occupation..Protective.serv Occupation..Sales
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 1 0 0
## 6 0 0 0
## Occupation..Tech.support Occupation..Transport.moving
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Relationship..Husband Relationship..Not.in.family
## 1 0 1
## 2 1 0
## 3 0 1
## 4 1 0
## 5 0 0
## 6 0 0
## Relationship..Other.relative Relationship..Own.child
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## Relationship..Unmarried Relationship..Wife Race..Amer.Indian.Eskimo
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 1 0
## 6 0 1 0
## Race..Asian.Pac.Islander Race..Black Race..Other Race..White Sex..Female
## 1 0 0 0 1 0
## 2 0 0 0 1 0
## 3 0 0 0 1 0
## 4 0 1 0 0 0
## 5 0 1 0 0 1
## 6 0 0 0 1 1
## Sex..Male CapitalGain CapitalLoss HoursWeek NativeCountry...
## 1 1 2174 0 40 0
## 2 1 0 0 13 0
## 3 1 0 0 40 0
## 4 1 0 0 40 0
## 5 0 0 0 40 0
## 6 0 0 0 40 0
## NativeCountry..Cambodia NativeCountry..Canada NativeCountry..China
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Columbia NativeCountry..Cuba
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 1
## 6 0 0
## NativeCountry..Dominican.Republic NativeCountry..Ecuador
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NativeCountry..El.Salvador NativeCountry..England NativeCountry..France
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Germany NativeCountry..Greece NativeCountry..Guatemala
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Haiti NativeCountry..Holand.Netherlands
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NativeCountry..Honduras NativeCountry..Hong NativeCountry..Hungary
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..India NativeCountry..Iran NativeCountry..Ireland
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Italy NativeCountry..Jamaica NativeCountry..Japan
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Laos NativeCountry..Mexico NativeCountry..Nicaragua
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Outlying.US.Guam.USVI.etc. NativeCountry..Peru
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NativeCountry..Philippines NativeCountry..Poland NativeCountry..Portugal
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Puerto.Rico NativeCountry..Scotland NativeCountry..South
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## NativeCountry..Taiwan NativeCountry..Thailand
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
## NativeCountry..Trinadad.Tobago NativeCountry..United.States
## 1 0 1
## 2 0 1
## 3 0 1
## 4 0 1
## 5 0 0
## 6 0 1
## NativeCountry..Vietnam NativeCountry..Yugoslavia Income
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
# Print the structure of the encoded data frame (one line per column).
str(adultsTrsf)
## ‘data.frame‘: 32561 obs. of 109 variables:
## $ Age : num 39 50 38 53 28 37 49 52 31 42 ...
## $ Workclass... : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Workclass..Federal.gov : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Workclass..Local.gov : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Workclass..Never.worked : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Workclass..Private : num 0 0 1 1 1 1 1 0 1 1 ...
## $ Workclass..Self.emp.inc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Workclass..Self.emp.not.inc : num 0 1 0 0 0 0 0 1 0 0 ...
## $ Workclass..State.gov : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Workclass..Without.pay : num 0 0 0 0 0 0 0 0 0 0 ...
## $ FinalWeight : num 77516 83311 215646 234721 338409 ...
## $ Education..10th : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..11th : num 0 0 0 1 0 0 0 0 0 0 ...
## $ Education..12th : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..1st.4th : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..5th.6th : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..7th.8th : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..9th : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Education..Assoc.acdm : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..Assoc.voc : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..Bachelors : num 1 1 0 0 1 0 0 0 0 1 ...
## $ Education..Doctorate : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..HS.grad : num 0 0 1 0 0 0 0 1 0 0 ...
## $ Education..Masters : num 0 0 0 0 0 1 0 0 1 0 ...
## $ Education..Preschool : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..Prof.school : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education..Some.college : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Education.Numer : num 13 13 9 7 13 14 5 9 14 13 ...
## $ MaritalStatus..Divorced : num 0 0 1 0 0 0 0 0 0 0 ...
## $ MaritalStatus..Married.AF.spouse : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MaritalStatus..Married.civ.spouse : num 0 1 0 1 1 1 0 1 0 1 ...
## $ MaritalStatus..Married.spouse.absent : num 0 0 0 0 0 0 1 0 0 0 ...
## $ MaritalStatus..Never.married : num 1 0 0 0 0 0 0 0 1 0 ...
## $ MaritalStatus..Separated : num 0 0 0 0 0 0 0 0 0 0 ...
## $ MaritalStatus..Widowed : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation... : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Adm.clerical : num 1 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Armed.Forces : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Craft.repair : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Exec.managerial : num 0 1 0 0 0 1 0 1 0 1 ...
## $ Occupation..Farming.fishing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Handlers.cleaners : num 0 0 1 1 0 0 0 0 0 0 ...
## $ Occupation..Machine.op.inspct : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Other.service : num 0 0 0 0 0 0 1 0 0 0 ...
## $ Occupation..Priv.house.serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Prof.specialty : num 0 0 0 0 1 0 0 0 1 0 ...
## $ Occupation..Protective.serv : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Sales : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Tech.support : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Occupation..Transport.moving : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Relationship..Husband : num 0 1 0 1 0 0 0 1 0 1 ...
## $ Relationship..Not.in.family : num 1 0 1 0 0 0 1 0 1 0 ...
## $ Relationship..Other.relative : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Relationship..Own.child : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Relationship..Unmarried : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Relationship..Wife : num 0 0 0 0 1 1 0 0 0 0 ...
## $ Race..Amer.Indian.Eskimo : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Race..Asian.Pac.Islander : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Race..Black : num 0 0 0 1 1 0 1 0 0 0 ...
## $ Race..Other : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Race..White : num 1 1 1 0 0 1 0 1 1 1 ...
## $ Sex..Female : num 0 0 0 0 1 1 1 0 1 0 ...
## $ Sex..Male : num 1 1 1 1 0 0 0 1 0 1 ...
## $ CapitalGain : num 2174 0 0 0 0 ...
## $ CapitalLoss : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HoursWeek : num 40 13 40 40 40 40 16 45 50 40 ...
## $ NativeCountry... : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Cambodia : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Canada : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..China : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Columbia : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Cuba : num 0 0 0 0 1 0 0 0 0 0 ...
## $ NativeCountry..Dominican.Republic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Ecuador : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..El.Salvador : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..England : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..France : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Germany : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Greece : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Guatemala : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Haiti : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Holand.Netherlands : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Honduras : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Hong : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Hungary : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..India : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Iran : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Ireland : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Italy : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Jamaica : num 0 0 0 0 0 0 1 0 0 0 ...
## $ NativeCountry..Japan : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Laos : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Mexico : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Nicaragua : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Outlying.US.Guam.USVI.etc.: num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Peru : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Philippines : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Poland : num 0 0 0 0 0 0 0 0 0 0 ...
## $ NativeCountry..Portugal : num 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
# Compute p-values for the pairwise correlations between variables.
#
# X:   numeric matrix or data frame, one variable per column.
# dfr: degrees of freedom used for every test; defaults to nrow(X) - 2.
#      NOTE(review): correlations use pairwise-complete observations, so if X
#      contains NAs the default dfr may overstate the true degrees of freedom.
#
# Returns the correlation matrix of X in which the upper triangle has been
# replaced by the p-value of the corresponding correlation (F statistic on
# 1 and dfr degrees of freedom) and the diagonal by NA. The lower triangle
# still holds the correlation coefficients.
cor.prob <- function(X, dfr = nrow(X) - 2) {
  R <- cor(X, use = "pairwise.complete.obs")
  above <- row(R) < col(R)
  r2 <- R[above]^2
  Fstat <- r2 * dfr / (1 - r2)
  # Ask for the upper tail directly: computing 1 - pf(...) cancels to exactly
  # 0 (or a multiple of machine epsilon) for strongly significant pairs.
  R[above] <- pf(Fstat, 1, dfr, lower.tail = FALSE)
  diag(R) <- NA
  R
}
# Reshape a square matrix into long data.frame form.
#
# m: square matrix whose row names and column names are identical.
#
# Returns a data.frame with one row per upper-triangle cell of m:
#   i, j - the row and column name of the cell,
#   cor  - the value of the mirrored cell in the lower triangle (m[j, i]),
#   p    - the value of the upper-triangle cell itself (m[i, j]).
# Stops with an error if m is not a square matrix or its dimnames differ.
flattenSquareMatrix <- function(m) {
  # is.matrix() rather than class(m) != "matrix": since R 4.0 a matrix has
  # class c("matrix", "array"), so the comparison yielded a length-2 condition.
  # The scalar || also skips the dimension check for non-matrix input.
  if (!is.matrix(m) || nrow(m) != ncol(m)) stop("Must be asquare matrix.")
  if (!identical(rownames(m), colnames(m))) stop("Row and column names must be equal.")
  ut <- upper.tri(m)
  data.frame(i = rownames(m)[row(m)[ut]],
             j = rownames(m)[col(m)[ut]],
             cor = t(m)[ut],
             p = m[ut])
}
# Long-format table with one row per pair of columns of adultsTrsf, built
# from the combined correlation / p-value matrix.
corMasterList <- flattenSquareMatrix(
  cor.prob(adultsTrsf)
)
dim(corMasterList)
## [1] 5886 4
head(corMasterList, n = 6L)
## i j cor p
## 1 Age Workclass... 0.04263 1.410e-14
## 2 Age Workclass..Federal.gov 0.05123 0.000e+00
## 3 Workclass... Workclass..Federal.gov -0.04261 1.454e-14
## 4 Age Workclass..Local.gov 0.06090 0.000e+00
## 5 Workclass... Workclass..Local.gov -0.06407 0.000e+00
## 6 Workclass..Federal.gov Workclass..Local.gov -0.04568 1.110e-16
# Order the pairs from the largest absolute correlation to the smallest.
corList <- corMasterList[order(-abs(corMasterList[["cor"]])), ]
head(corList, n = 6L)
## i j
## 1953 Sex..Female Sex..Male
## 597 Workclass... Occupation...
## 1256 MaritalStatus..Married.civ.spouse Relationship..Husband
## 1829 Race..Black Race..White
## 527 MaritalStatus..Married.civ.spouse MaritalStatus..Never.married
## 1881 Relationship..Husband Sex..Female
## cor p
## 1953 -1.0000 0
## 597 0.9980 0
## 1256 0.8932 0
## 1829 -0.7887 0
## 527 -0.6449 0
## 1881 -0.5801 0
# Select the pairs whose absolute correlation with Income is greater than 0.2.
# which() drops rows where the condition is NA, as subset() would.
selectedSub <- corList[
  which(abs(corList$cor) > 0.2 & corList$j == "Income"), ,
  drop = FALSE
]
# Keep the variable names found at these hard-coded positions of the selection.
bestSub <- as.character(selectedSub$i)[c(1, 3, 5, 6, 8, 9)]
library(psych)
##
## Attaching package: ‘psych‘
##
## 下列对象被屏蔽了from ‘package:ggplot2‘:
##
## %+%
# Plot the selected variables together with Income using pairs.panels().
pairs.panels(adultsTrsf[, c(bestSub, "Income"), drop = FALSE])