2014年9月23日 星期二

Kaggle練習:Titanic實作(方法二)

原文出處:Data Analytics for Beginners

Part 1

# Load the data. Keep stringsAsFactors = FALSE (or untick "Strings as
# factors" in the import dialog): Part 2 overwrites entries of the Name
# column with new strings, which needs Name to be character.

trainData <- read.csv("train.csv", header = TRUE, stringsAsFactors = FALSE)
testData <- read.csv("test.csv", header = TRUE, stringsAsFactors = FALSE)

# Get a feel for the data: print the first six rows.
head(trainData)

# Density plot of passenger age; missing ages are dropped.
plot(density(trainData$Age, na.rm = TRUE))

# Cross-tabulate survival (rows) against sex (columns).
counts <- table(trainData$Survived, trainData$Sex)

# Bar chart of the survival counts for each sex.
# Fix: the title string was closed with a typographic quote, which R does
# not accept as a string delimiter.
barplot(counts, xlab = "Gender", ylab = "Number of People",
        main = "survived and deceased between male and female")




























# Survival rate for each sex. A table is indexed in column-major order, so
# counts[1] and counts[2] are the Survived = 0 and 1 cells of the first
# column (female), and counts[3] and counts[4] those of the second (male).
counts[2] / (counts[1] + counts[2])
[1] 0.7420382
counts[4] / (counts[3] + counts[4])
[1] 0.1889081

# Cross-tabulate survival (rows) against passenger class (columns).
Pclass_survival <- table(trainData$Survived, trainData$Pclass)

# Bar chart of the survival counts for each class.
# Fixes: the title was copy-pasted from the gender chart, and it ended in a
# typographic quote that R cannot parse.
barplot(Pclass_survival, xlab = "Cabin Class", ylab = "Number of People",
        main = "survived and deceased by cabin class")




























# Survival rate for each class. Cells are indexed in column-major order:
# [1]/[2] are the Survived = 0/1 cells of the first class column,
# [3]/[4] of the second and [5]/[6] of the third.
Pclass_survival[2] / (Pclass_survival[1] + Pclass_survival[2])
[1] 0.6296296
Pclass_survival[4] / (Pclass_survival[3] + Pclass_survival[4])
[1] 0.4728261
Pclass_survival[6] / (Pclass_survival[5] + Pclass_survival[6])
[1] 0.2423625

Part 2

# Drop the columns that are not used later (column 1 and columns 9 to 12).
trainData <- trainData[-c(1, 9:12)]

# Recode Sex: female becomes "1", male becomes "0". "female" is rewritten
# first; the ^ anchor then only matches values that start with "male".
trainData$Sex <- gsub("^male", 0, gsub("female", 1, trainData$Sex))

# Row indices of the names containing each title. fixed = TRUE makes the
# trailing "." a literal dot instead of a regex wildcard.
master_vector <- which(grepl("Master.", trainData$Name, fixed = TRUE))
miss_vector <- which(grepl("Miss.", trainData$Name, fixed = TRUE))
mrs_vector <- which(grepl("Mrs.", trainData$Name, fixed = TRUE))
mr_vector <- which(grepl("Mr.", trainData$Name, fixed = TRUE))
dr_vector <- which(grepl("Dr.", trainData$Name, fixed = TRUE))

#列出的資料如下
master_vector
 [1]   8  17  51  60  64  66  79 126 160 165 166 172 177
[14] 183 184 194 262 279 306 341 349 387 408 446 481 490
[27] 550 710 752 756 788 789 803 804 820 825 828 832 851
[40] 870

# Replace each full name with its title. If this step misbehaves, the data
# was probably imported without stringsAsFactors = FALSE.
# The order is kept: a row matched by several titles ends with the last one.
trainData$Name[master_vector] <- "Master"
trainData$Name[miss_vector] <- "Miss"
trainData$Name[mrs_vector] <- "Mrs"
trainData$Name[mr_vector] <- "Mr"
trainData$Name[dr_vector] <- "Dr"

# Mean known age of the training passengers carrying a given title,
# rounded to two decimals. Missing ages are ignored.
mean_age_for_title <- function(title) {
  ages <- trainData$Age[trainData$Name == title]
  round(mean(ages, na.rm = TRUE), digits = 2)
}

master_age <- mean_age_for_title("Master")
miss_age <- mean_age_for_title("Miss")
mrs_age <- mean_age_for_title("Mrs")
mr_age <- mean_age_for_title("Mr")
dr_age <- mean_age_for_title("Dr")

# Fill every missing Age with the mean age of the passenger's title.
# Age is addressed by name rather than by column position (5), so the step
# does not depend on the column layout, and only the rows that actually
# lack an age are visited.
age_by_title <- c(Master = master_age, Miss = miss_age, Mrs = mrs_age,
                  Mr = mr_age, Dr = dr_age)

for (i in which(is.na(trainData$Age))) {
  title <- trainData$Name[i]
  if (title %in% names(age_by_title)) {
    trainData$Age[i] <- age_by_title[[title]]
  } else {
    # No mean exists for this title; the age stays NA.
    print("Uncaught Title")
  }
}

# Add a Child column: 1 when Age is 12 or under, 2 otherwise.
# Vectorised: the column is created in one assignment instead of being
# grown implicitly inside a loop, and a missing Age gives NA rather than
# aborting with "missing value where TRUE/FALSE needed".
trainData$Child <- ifelse(trainData$Age <= 12, 1, 2)

# Add a Family column: family size = SibSp + Parch + 1 (the passenger).
trainData$Family <- trainData$SibSp + trainData$Parch + 1

# Add a Mother column: 1 when the title is Mrs and Parch is greater than 0,
# 2 otherwise. (The old comment said 0, but the code has always assigned 2,
# and the test data is coded the same way.)
# Vectorised with ifelse(); the elementwise & belongs in a vector mask, not
# in a scalar if() condition.
trainData$Mother <- ifelse(
  trainData$Name == "Mrs" & trainData$Parch > 0, 1, 2
)

# Strip the unused columns from testData and give it the same layout as
# trainData. PassengerId is set aside for the submission file.
PassengerId <- testData[1]
testData <- testData[-c(1, 8:11)]
testData$Sex <- gsub("female", 1, testData$Sex)
testData$Sex <- gsub("^male", 0, testData$Sex)

# Row indices of the names containing each title.
# Fix: fixed = TRUE was missing here, unlike in the training-data step.
# Without it the "." is a regex wildcard, so "Mr." also matches "Mrs." and
# the Mr assignment below overwrote every Mrs; "Dr." likewise matched any
# "Dr" followed by another character.
test_master_vector <- grep("Master.", testData$Name, fixed = TRUE)
test_miss_vector <- grep("Miss.", testData$Name, fixed = TRUE)
test_mrs_vector <- grep("Mrs.", testData$Name, fixed = TRUE)
test_mr_vector <- grep("Mr.", testData$Name, fixed = TRUE)
test_dr_vector <- grep("Dr.", testData$Name, fixed = TRUE)

# Replace each full name with its title.
testData$Name[test_master_vector] <- "Master"
testData$Name[test_miss_vector] <- "Miss"
testData$Name[test_mrs_vector] <- "Mrs"
testData$Name[test_mr_vector] <- "Mr"
testData$Name[test_dr_vector] <- "Dr"

# Mean known age of the test passengers carrying a given title, rounded to
# two decimals.
test_mean_age_for_title <- function(title) {
  ages <- testData$Age[testData$Name == title]
  round(mean(ages, na.rm = TRUE), digits = 2)
}

test_master_age <- test_mean_age_for_title("Master")
test_miss_age <- test_mean_age_for_title("Miss")
test_mrs_age <- test_mean_age_for_title("Mrs")
test_mr_age <- test_mean_age_for_title("Mr")
test_dr_age <- test_mean_age_for_title("Dr")

# Fill every missing Age with the mean for the passenger's title, and
# report the rows whose title has no mean.
test_age_by_title <- c(
  Master = test_master_age, Miss = test_miss_age, Mrs = test_mrs_age,
  Mr = test_mr_age, Dr = test_dr_age
)

for (row in which(is.na(testData$Age))) {
  title <- testData$Name[row]
  if (title %in% names(test_age_by_title)) {
    testData$Age[row] <- test_age_by_title[[title]]
  } else {
    print(paste0("Uncaught title at: ", row))
    print(paste0("The title unrecognized was: ", title))
  }
}

[1] "Uncaught title at: 89"
[1] "The title unrecognized was: O'Donoghue, Ms. Bridget"

# Row 89 (title "Ms.") was reported as uncaught; give it the Miss mean age.
testData$Age[89] <- test_miss_age

# Add a Child column: 1 when Age is 12 or under, 2 otherwise.
# Fix: both branches of the old loop assigned 1, so every test passenger
# was flagged as a child; the other value must be 2, as in the training
# data.
testData$Child <- ifelse(testData$Age <= 12, 1, 2)

# Family size = SibSp + Parch + 1 (the passenger).
testData$Family <- testData$SibSp + testData$Parch + 1

# Mother is 1 when the title is Mrs and Parch is greater than 0, else 2.
testData$Mother <- ifelse(
  testData$Name == "Mrs" & testData$Parch > 0, 1, 2
)

Part 3: Prediction Model

# Fit a logistic regression with glm: family = binomial models the
# probability of Survived, so this is not an ordinary linear regression.
# Sex*Pclass adds the Pclass:Sex interaction on top of the two main effects.
train.glm <- glm(Survived ~ Pclass + Sex + Age + Child + Sex*Pclass + Family + Mother, family = binomial, data = trainData)

 train.glm

Call:  glm(formula = Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
    Family + Mother, family = binomial, data = trainData)

Coefficients:
(Intercept)       Pclass         Sex1          Age 
    7.77516     -0.87688      6.05753     -0.02839 
      Child       Family       Mother  Pclass:Sex1 
   -1.93722     -0.44175     -0.99742     -1.32707 

Degrees of Freedom: 890 Total (i.e. Null);  883 Residual
Null Deviance:         1187
Residual Deviance: 743.7      AIC: 759.7

# Apply train.glm to testData; type = "response" asks for predictions on
# the response (probability) scale.
# Fix: the argument was closed with a typographic quote that R cannot parse.
p.hats <- predict.glm(train.glm, newdata = testData, type = "response")

#得到的結果如下,我們必須把結果轉換成0或1
 head(p.hats)
        1         2         3         4         5
0.4479812 0.7447584 0.4718448 0.5010278 0.7923101
        6
0.5922346 

# Turn the predicted probabilities into 0/1: 1 when above 0.5, 0 otherwise.
# Vectorised instead of growing `survival` element by element inside a
# 1:length() loop, which also misbehaves when p.hats is empty.
survival <- as.numeric(p.hats > 0.5)

# Bind PassengerId and survival into kaggle.sub, name the two columns
# PassengerId and Survived, and write the result to kaggle.csv without
# row names.
kaggle.sub <- setNames(
  cbind(PassengerId, survival),
  c("PassengerId", "Survived")
)
write.csv(kaggle.sub, file = "kaggle.csv", row.names = FALSE)

Part 4: Random Forest

# Fit a random forest with the same predictors as the glm model.
# Fix: randomForest() was called although the package is not attached
# anywhere earlier in this walkthrough.
# NOTE(review): the formula keeps the Sex*Pclass interaction from the glm
# call; confirm that randomForest's formula interface accepts it.
library(randomForest)
train.rf <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + Child + Sex*Pclass + Family + Mother,
  data = trainData, importance = TRUE, ntree = 2000
)

# Add a placeholder Survived column to testData.
testData$Survived <- 0

# Predict the test passengers with the forest.
Prediction <- predict(train.rf, testData)

# Write the predictions to kaggle.csv; this replaces the file written for
# the glm model above.
survival <- Prediction
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "kaggle.csv", row.names = FALSE)

#結果 0.75120,沒比較厲害

Kaggle練習:Titanic實作(方法一)

原文出處: Titanic: Getting Started With R

Part 1:Getting start with R

# Import train.csv first; the code below expects the data in data frames
# named train and test.

# Inspect the structure of train.
str(train)

# Count the passengers for each value of Survived.
table(train$Survived)

# The same counts as proportions.
prop.table(table(train$Survived))

# Predict 0 for every test passenger. The length comes from the data
# instead of the hard-coded 418, so it always matches test.
test$Survived <- rep(0, nrow(test))

# Collect the prediction in submit for upload.
submit <- data.frame(PassengerId = test$PassengerId, Survived = test$Survived)

# Write submit to a file, without row names.
write.csv(submit, file = "theyallperish.csv", row.names = FALSE)

Part 2:The Gender-Class Model

# Explore the data by Sex: count the passengers of each sex.

table(train$Sex)

# Cross-tabulate Sex against Survived as proportions of all passengers.

prop.table(table(train$Sex, train$Survived))

                  0          1
  female 0.09090909 0.26150393
  male   0.52525253 0.12233446

# The call above spreads 100% over all four cells. Passing 1 as the second
# argument makes each row (each sex) sum to 1 instead.

prop.table(table(train$Sex, train$Survived),1)

                 0         1
  female 0.2579618 0.7420382
  male   0.8110919 0.1889081

# With 2 as the second argument each column (each value of Survived) sums
# to 1, giving the male/female split among the deceased and the survivors.

prop.table(table(train$Sex, train$Survived),2)

                 0         1
  female 0.1475410 0.6812865
  male   0.8524590 0.3187135

# Start by setting Survived to 0 for every test passenger.

test$Survived <- 0

# Then assume that every woman survived.
# Fix: a stray "(" after == made this line a syntax error.

test$Survived[test$Sex == "female"] <- 1

# Explore the Age column with summary(); 177 values are missing.

summary(train$Age)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
   0.42   20.12   28.00   29.70   38.00   80.00     177 

# Add a Child column: 1 when Age is below 18, 0 otherwise. A missing Age
# counts as 0.

train$Child <- as.numeric(!is.na(train$Age) & train$Age < 18)

# Number of survivors for each Child/Sex combination.

aggregate(Survived ~ Child + Sex, data = train, FUN = sum)

  Child    Sex Survived
1     0 female      195
2     1 female       38
3     0   male       86
4     1   male       23

# Total number of passengers for each Child/Sex combination.

aggregate(Survived ~ Child + Sex, data=train, FUN=length)

  Child    Sex Survived
1     0 female      259
2     1 female       55
3     0   male      519
4     1   male       58

# Survival rate (survivors divided by group size) for each Child/Sex
# combination.

survival_rate <- function(x) {
  sum(x) / length(x)
}
aggregate(Survived ~ Child + Sex, data = train, FUN = survival_rate)

  Child    Sex  Survived
1     0 female 0.7528958
2     1 female 0.6909091
3     0   male 0.1657033
4     1   male 0.3965517

# Bin Fare (not age, as the old comment said) into four categories stored
# in a new Fare2 column.
# Fix: the "<10" label was closed with a typographic quote that R cannot
# parse.

train$Fare2 <- "30+"
train$Fare2[train$Fare < 30 & train$Fare >= 20] <- "20-30"
train$Fare2[train$Fare < 20 & train$Fare >= 10] <- "10-20"
train$Fare2[train$Fare < 10] <- "<10"

# Survival rate for each Fare2/Pclass/Sex combination, to look for
# anomalies. Row 9 of the result is one: third-class women who paid 30 or
# more have an unusually low survival rate.

aggregate(Survived ~ Fare2 + Pclass + Sex, data = train,
          FUN = function(x) {
            sum(x) / length(x)
          })

   Fare2 Pclass    Sex  Survived
1  20-30      1 female 0.8333333
2    30+      1 female 0.9772727
3  10-20      2 female 0.9142857
4  20-30      2 female 0.9000000
5    30+      2 female 1.0000000
6    <10      3 female 0.5937500
7  10-20      3 female 0.5813953
8  20-30      3 female 0.3333333
9    30+      3 female 0.1250000
10   <10      1   male 0.0000000
11 20-30      1   male 0.4000000
12   30+      1   male 0.3837209
13   <10      2   male 0.0000000
14 10-20      2   male 0.1587302
15 20-30      2   male 0.1600000
16   30+      2   male 0.2142857
17   <10      3   male 0.1115385
18 10-20      3   male 0.2368421
19 20-30      3   male 0.1250000
20   30+      3   male 0.2400000

# Apply that finding to test$Survived: predict death for third-class women
# whose fare is 20 or more.

test$Survived[test$Sex == "female" & test$Pclass==3 & test$Fare >=20] <-0

#要使用Decision Tree, 必須啟動rpart套件

Part 3:Decision Trees

library(rpart)

# Build a decision tree named fit that predicts Survived from Pclass, Sex,
# Age, SibSp, Parch, Fare and Embarked.
# Fix: the method argument was closed with a typographic quote that R
# cannot parse.

fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
             data = train, method = "class")

# Draw the decision tree held in fit.

plot(fit)

# Add the text labels to the plot.

text(fit)

# Install rattle, rpart.plot and RColorBrewer, used to draw nicer-looking
# decision trees. Each package is installed only when it is missing, so
# re-running the script does not reinstall them every time.

for (pkg in c("rattle", "rpart.plot", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rattle)
library(rpart.plot)
library(RColorBrewer)

# Apply the fit model to the test data.
# Fix: the type argument was closed with a typographic quote that R cannot
# parse.

Prediction <- predict(fit, test, type = "class")

# Put the PassengerId from test and the predictions into submit.

submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)

# Write submit to myfirstdtree.csv, without row names.

write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)

Part 4:Feature Engineering

# Show the first Name.

train$Name[1]

[1] Braund, Mr. Owen Harris
891 Levels: Abbing, Mr. Anthony ... Zimmerman, Mr. Leo

# Mark the outcome of the test rows as unknown.

test$Survived <- NA

# Stack train and test into one data frame, combi.
# Fix: train gained the exploratory Child and Fare2 columns in Part 2 while
# test did not, and rbind() refuses data frames whose columns differ, so
# only the columns present in both are combined.

shared_cols <- intersect(names(train), names(test))
combi <- rbind(train[shared_cols], test[shared_cols])

# Convert Name from factor to character.

combi$Name <- as.character(combi$Name)

#轉換後執行列出第一筆變成這樣

combi$Name[1]
[1] "Braund, Mr. Owen Harris"

# Split the Name at every comma or period with strsplit().

strsplit(combi$Name[1], split= '[,.]')[[1]]
[1] "Braund"       " Mr"          " Owen Harris"

# Index the result with [2] to pick out the second piece, the title.

strsplit(combi$Name[1], split= '[,.]')[[1]][2]
[1] " Mr"

# Extract the title from every Name into a new Title column. vapply() is
# used instead of sapply() so the result is always a character vector.

combi$Title <- vapply(
  combi$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][2],
  character(1),
  USE.NAMES = FALSE
)

# Remove the first space of each Title (the one left after the comma).

combi$Title <- sub(" ", "", combi$Title)

# Merge the rare titles into three groups: Mlle, Sir and Lady.
# Fix: "Lady" was opened with a typographic quote that R cannot parse.

combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[
  combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")
] <- "Lady"

# Convert Title from character to factor.

combi$Title <- factor(combi$Title)

# Add FamilySize: siblings/spouses plus parents/children plus the
# passenger.

combi$FamilySize <- combi$SibSp + combi$Parch + 1

# Extract the surname, the part of Name before the first comma or period.

combi$Surname <- vapply(
  combi$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][1],
  character(1),
  USE.NAMES = FALSE
)

# Add FamilyID, the family size pasted onto the surname, to identify
# passengers of the same family.
# Fix: several string literals in this section used typographic quotes
# that R cannot parse.

combi$FamilyID <- paste0(as.character(combi$FamilySize), combi$Surname)

# Label every family of two or fewer as "Small" to cut the number of
# levels.

combi$FamilyID[combi$FamilySize <= 2] <- "Small"

# Tabulate how often each FamilyID occurs.

famIDs <- data.frame(table(combi$FamilyID))

# Keep the IDs that occur two times or fewer.

famIDs <- famIDs[famIDs$Freq <= 2, ]

# Relabel those FamilyIDs as "Small" too.

combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"

# Convert FamilyID to a factor. (This note was a bare text line without a
# leading #, which R would have tried to run as code.)

combi$FamilyID <- factor(combi$FamilyID)

# Split combi back into train and test.

train <- combi[1:891, ]
test <- combi[892:1309, ]

# Fit the decision tree again, now with the engineered columns.
# Fix: the method argument was closed with a typographic quote that R
# cannot parse.

fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
    Title + FamilySize + FamilyID,
  data = train, method = "class"
)

Part 5:Random Forests

# summary() of combi$Age shows 263 missing values; they must be filled in
# before fitting the random forest.

summary(combi$Age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
   0.17   21.00   28.00   29.88   39.00   80.00     263 

# Fit a regression tree (method = "anova") for Age on the rows whose Age
# is known.
# Fix: the method argument was closed with a typographic quote that R
# cannot parse.

Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!is.na(combi$Age), ], method = "anova"
)

# Use Agefit to predict the Age of the rows where it is missing.

combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age), ])

# Embarked has two blank entries.

summary(combi$Embarked)
      C   Q   S
  2 270 123 914

# Find the two rows whose Embarked is blank.
# Fix: the empty string was written with typographic quotes that R cannot
# parse.

which(combi$Embarked == "")
[1]  62 830 

# Fill the blank Embarked entries with "S". The rows are looked up instead
# of hard-coding 62 and 830, so the step follows the data.
# Fix: "S" was opened with a typographic quote that R cannot parse.

blank_embarked <- which(combi$Embarked == "")
combi$Embarked[blank_embarked] <- "S"

# Re-create the factor so the now unused blank level is dropped.

combi$Embarked <- factor(combi$Embarked)

# Fare has one missing value.

summary(combi$Fare)

# Find which row of Fare is NA.

which(is.na(combi$Fare))
[1] 1044

# Fill the missing Fare with the median Fare. The row is selected with
# is.na() instead of the hard-coded index 1044.

combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)

# To meet the random forest's limit on factor levels, build FamilyID2 from
# FamilyID with the "Small" cut-off raised to families of three or fewer.

combi$FamilyID2 <- as.character(combi$FamilyID)
combi$FamilyID2[combi$FamilySize <= 3] <- "Small"
combi$FamilyID2 <- factor(combi$FamilyID2)

# Split combi into train and test again.

train <- combi[seq(1, 891), ]
test <- combi[seq(892, 1309), ]

# Install randomForest only when it is missing, then attach it, so
# re-running the script does not reinstall the package every time.

if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)

# Fit the random forest model, fit.

fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + FamilySize + FamilyID2,
  data = train, importance = TRUE, ntree = 2000
)

# Plot the variable importance of the fitted forest.
varImpPlot(fit)



# Predict the test passengers with fit.

Prediction <- predict(fit, newdata = test)

# Pair each PassengerId with its prediction.

submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = Prediction
)

# Write submit to firstforest.csv, without row names.

write.csv(submit, "firstforest.csv", row.names = FALSE)

最終結果