# Part 1: Getting Started with R ----
# Baseline model: predict that every passenger in `test` died.
# Expects the data frames `train` and `test` to be loaded already.

# Inspect the structure of the training data.
str(train)
# Count how many passengers died (0) and survived (1).
table(train$Survived)
# Express the same counts as proportions.
prop.table(table(train$Survived))
# Fill test$Survived with 0 for every row. The length comes from the test set
# itself rather than a hard-coded 418, so it stays correct if the data changes.
test$Survived <- rep(0, nrow(test))
# Collect the predictions in `submit` for upload.
submit <- data.frame(PassengerId = test$PassengerId, Survived = test$Survived)
# Write `submit` to a CSV file without row names.
write.csv(submit, file = "theyallperish.csv", row.names = FALSE)
# Part 2: The Gender-Class Model ----
# Explore survival by Sex, then predict that all women survive.
# Lines starting with "#>" are console output recorded from an earlier run.

# Count men and women.
table(train$Sex)
# Sex x Survived, each cell as a proportion of all passengers.
prop.table(table(train$Sex, train$Survived))
#> 0 1
#> female 0.09090909 0.26150393
#> male 0.52525253 0.12233446
# The call above spreads 100% over all four cells. Passing margin 1 makes each
# row (each sex) sum to 100%, i.e. the survival rate within each sex.
prop.table(table(train$Sex, train$Survived), 1)
#> 0 1
#> female 0.2579618 0.7420382
#> male 0.8110919 0.1889081
# Margin 2 makes each column sum to 100%, i.e. the male/female split within
# the dead and within the survivors.
prop.table(table(train$Sex, train$Survived), 2)
#> 0 1
#> female 0.1475410 0.6812865
#> male 0.8524590 0.3187135
# Start by predicting that nobody in the test set survives.
test$Survived <- 0
# Then assume every woman survives. (The original line had a stray "(" after
# "==", which made it a syntax error.)
test$Survived[test$Sex == "female"] <- 1
# Explore survival by age group (child vs. adult) and sex.
# Lines starting with "#>" are console output recorded from an earlier run.

# Summarise Age; the output shows 177 missing values.
summary(train$Age)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0.42 20.12 28.00 29.70 38.00 80.00 177
# Add a Child indicator, defaulting to 0.
train$Child <- 0
# Passengers younger than 18 count as children. which() drops the rows whose
# Age is NA, so passengers with unknown age keep Child = 0.
train$Child[which(train$Age < 18)] <- 1
# Number of survivors for each Child x Sex combination.
aggregate(Survived ~ Child + Sex, data = train, FUN = sum)
#> Child Sex Survived
#> 1 0 female 195
#> 2 1 female 38
#> 3 0 male 86
#> 4 1 male 23
# Total number of passengers for each Child x Sex combination.
aggregate(Survived ~ Child + Sex, data = train, FUN = length)
#> Child Sex Survived
#> 1 0 female 259
#> 2 1 female 55
#> 3 0 male 519
#> 4 1 male 58
# Survival rate (survivors / total) for each Child x Sex combination.
aggregate(
  Survived ~ Child + Sex,
  data = train,
  FUN = function(x) {
    sum(x) / length(x)
  }
)
#> Child Sex Survived
#> 1 0 female 0.7528958
#> 2 1 female 0.6909091
#> 3 0 male 0.1657033
#> 4 1 male 0.3965517
# Bucket the ticket fare and look for unusual survival rates by fare bucket,
# passenger class and sex.
# Lines starting with "#>" are console output recorded from an earlier run.

# Split Fare (not Age) into four buckets, stored in the new Fare2 column.
# Everything starts as "30+" and the cheaper buckets overwrite it.
train$Fare2 <- "30+"
train$Fare2[train$Fare < 30 & train$Fare >= 20] <- "20-30"
train$Fare2[train$Fare < 20 & train$Fare >= 10] <- "10-20"
# The original line closed this string with a curly quote (syntax error).
train$Fare2[train$Fare < 10] <- "<10"
# Survival rate by Fare2, Pclass and Sex, used to spot anomalies. Row 9 stands
# out: third-class women who paid 30+ have an unusually low survival rate.
aggregate(
  Survived ~ Fare2 + Pclass + Sex,
  data = train,
  FUN = function(x) {
    sum(x) / length(x)
  }
)
#> Fare2 Pclass Sex Survived
#> 1 20-30 1 female 0.8333333
#> 2 30+ 1 female 0.9772727
#> 3 10-20 2 female 0.9142857
#> 4 20-30 2 female 0.9000000
#> 5 30+ 2 female 1.0000000
#> 6 <10 3 female 0.5937500
#> 7 10-20 3 female 0.5813953
#> 8 20-30 3 female 0.3333333
#> 9 30+ 3 female 0.1250000
#> 10 <10 1 male 0.0000000
#> 11 20-30 1 male 0.4000000
#> 12 30+ 1 male 0.3837209
#> 13 <10 2 male 0.0000000
#> 14 10-20 2 male 0.1587302
#> 15 20-30 2 male 0.1600000
#> 16 30+ 2 male 0.2142857
#> 17 <10 3 male 0.1115385
#> 18 10-20 3 male 0.2368421
#> 19 20-30 3 male 0.1250000
#> 20 30+ 3 male 0.2400000
# Apply the finding to test$Survived: third-class women who paid 20 or more
# are predicted to die (this covers rows 8 and 9 above).
test$Survived[test$Sex == "female" & test$Pclass == 3 & test$Fare >= 20] <- 0
# A decision tree needs the rpart package, which is loaded right below.
# Part 3: Decision Trees ----
# Fit a classification tree on `train`, plot it, and write predictions for
# `test` to a submission file.
library(rpart)
# Build the decision tree `fit`, predicting Survived from Pclass, Sex, Age,
# SibSp, Parch, Fare and Embarked. (The original call closed "class" with a
# curly quote, which is a syntax error.)
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train,
  method = "class"
)
# Draw the tree with base graphics.
plot(fit)
# Add the text labels to the plot.
text(fit)
# rattle, rpart.plot and RColorBrewer are used to draw a nicer-looking tree.
# Install each one only when it is missing instead of reinstalling every run.
for (pkg in c("rattle", "rpart.plot", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rattle)
library(rpart.plot)
library(RColorBrewer)
# Draw the nicer tree; the original loaded the packages but never used them.
fancyRpartPlot(fit)
# Apply the fitted tree to the test data.
Prediction <- predict(fit, test, type = "class")
# Combine PassengerId from test with the predictions.
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
# Write the submission to myfirstdtree.csv without row names.
write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)
# Part 4: Feature Engineering ----
# Combine train and test into `combi` and extract each passenger's title from
# the Name column.
# Lines starting with "#>" are console output recorded from an earlier run.

# Show the first name.
train$Name[1]
#> [1] Braund, Mr. Owen Harris
#> 891 Levels: Abbing, Mr. Anthony ... Zimmerman, Mr. Leo
# Set Survived to NA in test so both data frames have that column.
test$Survived <- NA
# Stack train and test into one data frame. Only the columns both share are
# kept: the exploratory Child/Fare2 columns added to train earlier in this
# file do not exist in test and would otherwise make rbind() fail.
shared_cols <- intersect(names(train), names(test))
combi <- rbind(train[shared_cols], test[shared_cols])
# Convert Name from factor to character.
combi$Name <- as.character(combi$Name)
# After the conversion the first name prints like this.
combi$Name[1]
#> [1] "Braund, Mr. Owen Harris"
# Split a name on commas and periods.
strsplit(combi$Name[1], split = "[,.]")[[1]]
#> [1] "Braund" " Mr" " Owen Harris"
# The second piece is the title.
strsplit(combi$Name[1], split = "[,.]")[[1]][2]
#> [1] " Mr"
# Extract the title of every passenger into the new Title column. vapply()
# guarantees a character vector with one value per name.
name_parts <- strsplit(combi$Name, split = "[,.]")
combi$Title <- vapply(name_parts, function(parts) parts[2], character(1))
# Remove the leading space left over from the split.
combi$Title <- trimws(combi$Title)
# Merge the rare titles into three groups: Mlle, Sir and Lady. (The original
# "Lady" literal opened with a curly quote, which is a syntax error.)
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[
  combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")
] <- "Lady"
# Convert Title from character to factor.
combi$Title <- factor(combi$Title)
# Build the FamilySize / Surname / FamilyID features, split `combi` back into
# train and test, and refit the decision tree with the new features.

# FamilySize = siblings/spouses + parents/children + the passenger.
combi$FamilySize <- combi$SibSp + combi$Parch + 1
# Extract the surname: the part of Name before the first comma or period.
combi$Surname <- vapply(
  strsplit(combi$Name, split = "[,.]"),
  function(parts) parts[1],
  character(1)
)
# FamilyID joins FamilySize and Surname so members of one family share an ID.
combi$FamilyID <- paste0(as.character(combi$FamilySize), combi$Surname)
# Families of two or fewer are all labelled "Small" to reduce factor levels.
combi$FamilyID[combi$FamilySize <= 2] <- "Small"
# Tabulate how often each FamilyID occurs.
famIDs <- data.frame(table(combi$FamilyID))
# Keep the IDs that occur two times or fewer.
famIDs <- famIDs[famIDs$Freq <= 2, ]
# Relabel those IDs as "Small" too.
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"
# Convert FamilyID to a factor.
combi$FamilyID <- factor(combi$FamilyID)
# Split combi back into train and test. The training rows are the first rows
# of combi (it was built with rbind(train, test)) and `train` still has its
# original row count here, so use that count instead of hard-coded 891/1309.
n_train <- nrow(train)
train <- combi[seq_len(n_train), ]
test <- combi[seq(n_train + 1, nrow(combi)), ]
# Refit the decision tree with the new features added.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title +
    FamilySize + FamilyID,
  data = train,
  method = "class"
)
# Part 5: Random Forests ----
# Fill in the missing values in Age, Embarked and Fare before fitting a
# random forest.
# Lines starting with "#>" are console output recorded from an earlier run.

# The Age summary shows 263 missing values.
summary(combi$Age)
#> Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
#> 0.17 21.00 28.00 29.88 39.00 80.00 263
# Fit a regression tree (method "anova") for Age on the rows where Age is
# known.
missing_age <- is.na(combi$Age)
Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!missing_age, ],
  method = "anova"
)
# Use that model to predict Age for the rows where it is missing.
combi$Age[missing_age] <- predict(Agefit, combi[missing_age, ])
# Embarked has two blank entries.
summary(combi$Embarked)
#> C Q S
#> 2 270 123 914
# Find which rows are blank.
blank_embarked <- which(combi$Embarked == "")
blank_embarked
#> [1] 62 830
# Fill them with "S", the most common value in the summary above. The rows are
# taken from the lookup instead of being hard-coded.
combi$Embarked[blank_embarked] <- "S"
# Convert Embarked back to a factor.
combi$Embarked <- factor(combi$Embarked)
# Fare has one missing value.
summary(combi$Fare)
# Find which row is NA.
missing_fare <- which(is.na(combi$Fare))
missing_fare
#> [1] 1044
# Replace it with the median fare.
combi$Fare[missing_fare] <- median(combi$Fare, na.rm = TRUE)
# Fit a random forest on the engineered features and write the predictions
# for `test` to a submission file.

# To satisfy randomForest's limit on factor levels, build FamilyID2 from
# FamilyID with the "Small" threshold raised to families of three or fewer.
combi$FamilyID2 <- as.character(combi$FamilyID)
combi$FamilyID2[combi$FamilySize <= 3] <- "Small"
combi$FamilyID2 <- factor(combi$FamilyID2)
# Split combi into train and test again. The training rows come first in
# combi and `train` still has the training row count, so derive the split
# from it instead of hard-coding 891/1309.
n_train <- nrow(train)
train <- combi[seq_len(n_train), ]
test <- combi[seq(n_train + 1, nrow(combi)), ]
# Install randomForest only when it is missing, then load it.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(randomForest)
# Fix the RNG seed so the forest and the submission are reproducible.
set.seed(415)
# Fit the random forest model `fit`.
fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
    Title + FamilySize + FamilyID2,
  data = train,
  importance = TRUE,
  ntree = 2000
)
# Plot the variable importance.
varImpPlot(fit)
# Predict survival for the test data.
Prediction <- predict(fit, test)
# Combine PassengerId from test with the predictions.
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
# Write the submission to a CSV file without row names.
write.csv(submit, file = "firstforest.csv", row.names = FALSE)
# 最終結果
# 0 意見:
# 張貼留言