2014年9月23日 星期二

Kaggle練習:Titanic實作(方法一)

原文出處: Titanic: Getting Started With R

Part 1:Getting Started with R

# Import train.csv first. This snippet assumes the data frames `train` and
# `test` are already loaded in the workspace.

# Inspect the structure of the training data
str(train)

# Count how many passengers died (0) and survived (1)
table(train$Survived)

# Turn those counts into proportions
prop.table(table(train$Survived))

# Baseline prediction: nobody survives. Fill test$Survived with one 0 per row;
# nrow(test) replaces the hard-coded 418 so the line works for any test set.
test$Survived <- rep(0, nrow(test))

# Collect the predictions in `submit`, ready for upload
submit <- data.frame(PassengerId = test$PassengerId, Survived = test$Survived)

# Write `submit` to a CSV file without row names
write.csv(submit, file = "theyallperish.csv", row.names = FALSE)

Part 2:The Gender-Class Model

# Explore the data by sex: start with the number of women and men.
table(train[["Sex"]])

# Cross-tabulate sex against survival and express every cell as a share of
# all passengers, so the four cells together sum to 1.
prop.table(table(train[["Sex"]], train[["Survived"]]))

                  0          1
  female 0.09090909 0.26150393
  male   0.52525253 0.12233446

# Without a margin, prop.table() spreads 100% over the whole sex/survival
# table. Passing margin = 1 normalises each row instead, giving the
# died/survived proportions separately for women and for men.
prop.table(table(train[["Sex"]], train[["Survived"]]), margin = 1)

                 0         1
  female 0.2579618 0.7420382
  male   0.8110919 0.1889081

# With margin = 2 each column is normalised instead, giving the share of
# women and men among those who died and among those who survived.
prop.table(table(train[["Sex"]], train[["Survived"]]), margin = 2)

                 0         1
  female 0.1475410 0.6812865
  male   0.8524590 0.3187135

# Start by predicting that nobody in the test set survived
test$Survived <- 0

# Then assume that every woman survived
test$Survived[test$Sex == "female"] <- 1

# Explore the Age column with summary(); the NA's entry of the result shows
# how many ages are missing (177 here).
summary(train[["Age"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
   0.42   20.12   28.00   29.70   38.00   80.00     177 

# Add a Child column: 1 for passengers younger than 18, 0 for everyone else.
# Passengers whose Age is missing stay at 0.
train$Child <- as.numeric(!is.na(train$Age) & train$Age < 18)

# Number of survivors for every Child/Sex combination
aggregate(
  Survived ~ Child + Sex,
  data = train,
  FUN = sum
)

  Child    Sex Survived
1     0 female      195
2     1 female       38
3     0   male       86
4     1   male       23

# Total number of passengers for every Child/Sex combination
aggregate(
  Survived ~ Child + Sex,
  data = train,
  FUN = length
)

  Child    Sex Survived
1     0 female      259
2     1 female       55
3     0   male      519
4     1   male       58

# Survival rate (survivors divided by group size) for every Child/Sex
# combination
aggregate(
  Survived ~ Child + Sex,
  data = train,
  FUN = function(outcomes) sum(outcomes) / length(outcomes)
)

  Child    Sex  Survived
1     0 female 0.7528958
2     1 female 0.6909091
3     0   male 0.1657033
4     1   male 0.3965517

# Bin the ticket fare (not the age) into four classes stored in the new Fare2
# column: "30+" is the default and the cheaper bands then overwrite it.
train$Fare2 <- "30+"
train$Fare2[train$Fare < 30 & train$Fare >= 20] <- "20-30"
train$Fare2[train$Fare < 20 & train$Fare >= 10] <- "10-20"
train$Fare2[train$Fare < 10] <- "<10"

# Survival rate for every Fare2/Pclass/Sex combination, used to look for
# anomalies. For example, row 9 of the result: third-class women who paid
# 30+ have an unusually low survival rate.
aggregate(
  Survived ~ Fare2 + Pclass + Sex,
  data = train,
  FUN = function(outcomes) sum(outcomes) / length(outcomes)
)

   Fare2 Pclass    Sex  Survived
1  20-30      1 female 0.8333333
2    30+      1 female 0.9772727
3  10-20      2 female 0.9142857
4  20-30      2 female 0.9000000
5    30+      2 female 1.0000000
6    <10      3 female 0.5937500
7  10-20      3 female 0.5813953
8  20-30      3 female 0.3333333
9    30+      3 female 0.1250000
10   <10      1   male 0.0000000
11 20-30      1   male 0.4000000
12   30+      1   male 0.3837209
13   <10      2   male 0.0000000
14 10-20      2   male 0.1587302
15 20-30      2   male 0.1600000
16   30+      2   male 0.2142857
17   <10      3   male 0.1115385
18 10-20      3   male 0.2368421
19 20-30      3   male 0.1250000
20   30+      3   male 0.2400000

# Apply that finding to the predictions in test$Survived: third-class women
# who paid a fare of 20 or more are predicted to die.
test$Survived[
  with(test, Sex == "female" & Pclass == 3 & Fare >= 20)
] <- 0

#要使用Decision Tree, 必須啟動rpart套件

Part 3:Decision Trees

library(rpart)

# Build a decision tree named `fit` that predicts Survived from Pclass, Sex,
# Age, SibSp, Parch, Fare and Embarked.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train,
  method = "class"
)

# Draw the decision tree stored in `fit`
plot(fit)

# Add the text labels to the plot
text(fit)

# Install and load rattle, rpart.plot and RColorBrewer, which are used to draw
# nicer-looking decision trees
install.packages("rattle")
install.packages("rpart.plot")
install.packages("RColorBrewer")
library(rattle)
library(rpart.plot)
library(RColorBrewer)

# Apply the `fit` model to the test data with predict()
Prediction <- predict(fit, test, type = "class")

# Combine the PassengerId column of test with the predictions in `submit`
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)

# Write `submit` to myfirstdtree.csv without row names
write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)

Part 4:Feature Engineering

# Show the first entry of the Name column
train[["Name"]][1]

[1] Braund, Mr. Owen Harris
891 Levels: Abbing, Mr. Anthony ... Zimmerman, Mr. Leo

# Give test a Survived column (all NA) so it has the same columns as train
test$Survived <- NA

# Stack train and test into one data frame, combi. Only the columns present
# in both are kept, so helper columns that exist in train alone (such as
# Child and Fare2) do not make rbind() fail with a column mismatch.
shared_cols <- intersect(names(train), names(test))
combi <- rbind(train[shared_cols], test[shared_cols])

# Convert Name from factor to character
combi$Name <- as.character(combi$Name)

# After the conversion the first name prints as a plain string
combi$Name[1]
[1] "Braund, Mr. Owen Harris"

# Break the first Name apart at every comma or period with strsplit()
unlist(strsplit(combi$Name[1], split = "[,.]"))
[1] "Braund"       " Mr"          " Owen Harris"

# To pull out just the second piece (the title), index the split result
unlist(strsplit(combi$Name[1], split = "[,.]"))[2]
[1] " Mr"

# Extract the title from every Name into a new Title column. vapply()
# guarantees one character value per name and returns an unnamed vector.
combi$Title <- vapply(
  combi$Name,
  FUN = function(x) strsplit(x, split = "[,.]")[[1]][2],
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)

# Remove the first space from each Title (the one left in front of it by the
# split)
combi$Title <- sub(" ", "", combi$Title)

# Merge the rare titles into the three groups Mlle, Sir and Lady
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[
  combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")
] <- "Lady"

# Convert combi$Title from character to factor
combi$Title <- factor(combi$Title)

# Add a FamilySize column: siblings/spouses plus parents/children travelling
# along, plus the passenger themselves
combi$FamilySize <- combi$SibSp + combi$Parch + 1

# Extract the surname (the part of Name before the first comma or period)
combi$Surname <- vapply(
  combi$Name,
  FUN = function(x) strsplit(x, split = "[,.]")[[1]][1],
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)

# Add a FamilyID column that joins FamilySize and Surname, so members of the
# same family share an ID
combi$FamilyID <- paste0(as.character(combi$FamilySize), combi$Surname)

# Label everyone with a family size of 2 or less as "Small" to reduce the
# number of levels
combi$FamilyID[combi$FamilySize <= 2] <- "Small"

# Tabulate how often each FamilyID occurs
famIDs <- data.frame(table(combi$FamilyID))

# Keep only the IDs that occur at most twice
famIDs <- famIDs[famIDs$Freq <= 2, ]

# Relabel the FamilyIDs selected above as "Small" too
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"

# Convert FamilyID to a factor
combi$FamilyID <- factor(combi$FamilyID)

# Split combi back into train (rows 1-891) and test (rows 892-1309)
train <- combi[1:891, ]
test <- combi[892:1309, ]

# Fit the decision tree again, now including the new Title, FamilySize and
# FamilyID columns
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title +
    FamilySize + FamilyID,
  data = train,
  method = "class"
)

Part 5:Random Forests

# The summary of combi$Age shows 263 missing values (NA's); they have to be
# filled in before a random forest can be fitted.
summary(combi[["Age"]])
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
   0.17   21.00   28.00   29.88   39.00   80.00     263 

# Fit a regression tree (method = "anova") for Age on the rows whose Age is
# known
Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!is.na(combi$Age), ],
  method = "anova"
)

# Use the Agefit model to predict Age for the rows where it is NA
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age), ])

# Embarked has two blank entries (the unlabelled count in the summary)
summary(combi[["Embarked"]])
      C   Q   S
  2 270 123 914

# Use which() to find the rows whose Embarked value is the empty string
which(combi$Embarked == "")
[1]  62 830 

# Fill those two rows (62 and 830) with "S"
combi$Embarked[c(62, 830)] <- "S"

# Re-create Embarked as a factor
combi$Embarked <- factor(combi$Embarked)

# Fare contains one missing value
summary(combi$Fare)

# Find which row of Fare is NA
which(is.na(combi$Fare))
[1] 1044

# Replace the missing Fare with the median of the known fares. The rows are
# selected with is.na() rather than the hard-coded index 1044, so the fix
# still applies if the row order changes.
combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)

# To meet randomForest's limit on factor levels, build FamilyID2 from
# FamilyID with the "Small" threshold raised to families of 3 or fewer
combi$FamilyID2 <- as.character(combi$FamilyID)
combi$FamilyID2[combi$FamilySize <= 3] <- 'Small'
combi$FamilyID2 <- factor(combi$FamilyID2)

# Split combi into train (first 891 rows) and test (rows 892 to 1309) again
train <- combi[seq_len(891), ]
test <- combi[seq(892, 1309), ]

# Install and load the randomForest package
install.packages("randomForest")
library(randomForest)

# Random forests draw random samples, so fix the seed first; otherwise every
# run produces a slightly different model and submission.
set.seed(415)

# Fit the random forest model `fit` with 2000 trees, keeping the variable
# importance measures
fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
    Title + FamilySize + FamilyID2,
  data = train,
  importance = TRUE,
  ntree = 2000
)

# Plot the variable importance of the fitted forest
varImpPlot(fit)



# Predict survival for the test set with the fitted model
Prediction <- predict(fit, newdata = test)

# Pair each PassengerId from test with its prediction in `submit`
submit <- data.frame(
  PassengerId = test[["PassengerId"]],
  Survived = Prediction
)

# Save `submit` as a CSV file without row names
write.csv(
  submit,
  file = "firstforest.csv",
  row.names = FALSE
)

最終結果



0 意見:

張貼留言