失智追夢日誌: 9月 2014

2014年9月23日星期二

Kaggle練習：Titanic實作（方法二）

原文出處：Data Analytics for Beginners

Part 1

# Import the data. stringsAsFactors must be FALSE (or uncheck
# [Strings as factors] in the import dialog) so that Name stays a character
# column and can be overwritten with titles later on.
trainData <- read.csv("train.csv", header = TRUE, stringsAsFactors = FALSE)
testData <- read.csv("test.csv", header = TRUE, stringsAsFactors = FALSE)

# Get to know the data: list the first 6 rows.
head(trainData)

# Density plot of Age (missing ages removed) to see how it is distributed.
plot(density(trainData$Age, na.rm = TRUE))

# Cross-tabulate Survived (rows) against Sex (columns).
counts <- table(trainData$Survived, trainData$Sex)

# Bar chart of the relationship between sex and survival.
barplot(counts, xlab = "Gender", ylab = "Number of People",
        main = "survived and deceased between male and female")

#算出性別的生還比例

counts[2] / (counts[1] + counts[2])

[1] 0.7420382

counts[4] / (counts[3] + counts[4])

[1] 0.1889081

# Cross-tabulate Survived (rows) against passenger class (columns).
Pclass_survival <- table(trainData$Survived, trainData$Pclass)

# Bar chart of survivors and non-survivors per class. The title describes the
# class breakdown that is actually plotted here.
barplot(Pclass_survival, xlab = "Cabin Class", ylab = "Number of People",
        main = "survived and deceased by cabin class")

#算出各艙等生還者比例

Pclass_survival[2] / (Pclass_survival[1] + Pclass_survival[2])

[1] 0.6296296

Pclass_survival[4] / (Pclass_survival[3] + Pclass_survival[4])

[1] 0.4728261

Pclass_survival[6] / (Pclass_survival[5] + Pclass_survival[6])

[1] 0.2423625

Part 2

# Drop the columns that are not needed: column 1 and columns 9 through 12.
unused_cols <- c(1, 9:12)
trainData <- trainData[-unused_cols]

# Recode Sex: "female" is replaced by 1 first, then a leading "male" by 0.
trainData$Sex <- gsub("^male", 0, gsub("female", 1, trainData$Sex))

# Row positions whose Name contains each title (literal match, not a regex).
master_vector <- grep("Master.", trainData$Name, fixed = TRUE)
miss_vector <- grep("Miss.", trainData$Name, fixed = TRUE)
mrs_vector <- grep("Mrs.", trainData$Name, fixed = TRUE)
mr_vector <- grep("Mr.", trainData$Name, fixed = TRUE)
dr_vector <- grep("Dr.", trainData$Name, fixed = TRUE)

#列出的資料如下

master_vector

[1] 8 17 51 60 64 66 79 126 160 165 166 172 177

[14] 183 184 194 262 279 306 341 349 387 408 446 481 490

[27] 550 710 752 756 788 789 803 804 820 825 828 832 851

[40] 870

# Overwrite every matched Name with its title. If this step fails, the data was
# imported without stringsAsFactors = FALSE.
# The assignments run in the same order as before, so a row that matched more
# than one title ends up with the last one.
trainData$Name[master_vector] <- "Master"
trainData$Name[miss_vector] <- "Miss"
trainData$Name[mrs_vector] <- "Mrs"
trainData$Name[mr_vector] <- "Mr"
trainData$Name[dr_vector] <- "Dr"

# Mean age of one title group, ignoring missing ages, rounded to 2 decimals.
mean_age_for_title <- function(title) {
  ages <- trainData$Age[trainData$Name == title]
  round(mean(ages, na.rm = TRUE), digits = 2)
}

# Average age for each title.
master_age <- mean_age_for_title("Master")
miss_age <- mean_age_for_title("Miss")
mrs_age <- mean_age_for_title("Mrs")
mr_age <- mean_age_for_title("Mr")
dr_age <- mean_age_for_title("Dr")

# Fill every missing Age with the mean age of the passenger's title group.
age_by_title <- c(
  Master = master_age,
  Miss = miss_age,
  Mrs = mrs_age,
  Mr = mr_age,
  Dr = dr_age
)

# Only rows with a missing Age are visited; which() yields an empty vector
# (and the loop is skipped) when nothing is missing.
for (i in which(is.na(trainData$Age))) {
  title <- trainData$Name[i]
  if (title %in% names(age_by_title)) {
    trainData$Age[i] <- age_by_title[[title]]
  } else {
    # Name was not replaced by one of the five known titles.
    print("Uncaught Title")
  }
}

# Add a "Child" column: 1 when Age is 12 or under, otherwise 2.
# A row whose Age is still NA gets NA here.
trainData$Child <- ifelse(trainData$Age <= 12, 1, 2)

# Add a "Family" column: family size = SibSp + Parch + 1.
trainData$Family <- trainData$SibSp + trainData$Parch + 1

# Add a "Mother" column: 1 when the title is Mrs and Parch is greater than 0,
# otherwise 2.
trainData$Mother <- ifelse(trainData$Name == "Mrs" & trainData$Parch > 0, 1, 2)

# Keep the first column (the passenger ids) for the submission file, drop the
# columns testData does not need (column 1 and columns 8 through 11), and
# bring testData into the same format as trainData.
PassengerId <- testData[1]
testData <- testData[-c(1, 8:11)]

# Recode Sex exactly as for trainData: "female" -> 1, leading "male" -> 0.
testData$Sex <- gsub("female", 1, testData$Sex)
testData$Sex <- gsub("^male", 0, testData$Sex)

# Row positions for each title. fixed = TRUE makes "." a literal dot, as in the
# trainData code; read as a regular expression, "Mr." would also match "Mrs"
# and every Mrs row would be relabelled Mr below.
test_master_vector <- grep("Master.", testData$Name, fixed = TRUE)
test_miss_vector <- grep("Miss.", testData$Name, fixed = TRUE)
test_mrs_vector <- grep("Mrs.", testData$Name, fixed = TRUE)
test_mr_vector <- grep("Mr.", testData$Name, fixed = TRUE)
test_dr_vector <- grep("Dr.", testData$Name, fixed = TRUE)

# Overwrite the Name column (column 2) with the title for every matched row.
testData[test_master_vector, 2] <- "Master"
testData[test_miss_vector, 2] <- "Miss"
testData[test_mrs_vector, 2] <- "Mrs"
testData[test_mr_vector, 2] <- "Mr"
testData[test_dr_vector, 2] <- "Dr"

# Mean age of one title group in testData, ignoring missing ages, rounded to
# 2 decimals.
test_mean_age_for_title <- function(title) {
  ages <- testData$Age[testData$Name == title]
  round(mean(ages, na.rm = TRUE), digits = 2)
}

test_master_age <- test_mean_age_for_title("Master")
test_miss_age <- test_mean_age_for_title("Miss")
test_mrs_age <- test_mean_age_for_title("Mrs")
test_mr_age <- test_mean_age_for_title("Mr")
test_dr_age <- test_mean_age_for_title("Dr")

# Fill every missing Age in testData with the mean age of the passenger's
# title group; report rows whose title is not one of the five known titles.
test_age_by_title <- c(
  Master = test_master_age,
  Miss = test_miss_age,
  Mrs = test_mrs_age,
  Mr = test_mr_age,
  Dr = test_dr_age
)

for (i in which(is.na(testData$Age))) {
  title <- testData$Name[i]
  if (title %in% names(test_age_by_title)) {
    testData$Age[i] <- test_age_by_title[[title]]
  } else {
    # Age stays NA for this row; it has to be filled in by hand afterwards.
    print(paste0("Uncaught title at: ", i))
    print(paste0("The title unrecognized was: ", title))
  }
}

[1] "Uncaught title at: 89"

[1] "The title unrecognized was: O'Donoghue, Ms. Bridget"

# Row 89 (title "Ms.") was reported as uncaught above; give it the mean age of
# the Miss group.
testData$Age[89] <- test_miss_age

# "Child" column: 1 when Age is 12 or under, otherwise 2 (same coding as
# trainData).
testData$Child <- ifelse(testData$Age <= 12, 1, 2)

# "Family" column: family size = SibSp + Parch + 1.
testData$Family <- testData$SibSp + testData$Parch + 1

# "Mother" column: 1 when the title is Mrs and Parch is greater than 0,
# otherwise 2.
testData$Mother <- ifelse(testData$Name == "Mrs" & testData$Parch > 0, 1, 2)

Part 3: Prediction Model

# Fit a logistic regression model with glm (family = binomial), predicting
# Survived from class, sex, age and the engineered Child/Family/Mother columns.

train.glm <- glm(Survived ~ Pclass + Sex + Age + Child + Sex*Pclass + Family + Mother, family = binomial, data = trainData)

# Print the fitted model: call, coefficients and deviances.
train.glm

Call: glm(formula = Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +

Family + Mother, family = binomial, data = trainData)

Coefficients:

(Intercept) Pclass Sex1 Age

7.77516 -0.87688 6.05753 -0.02839

Child Family Mother Pclass:Sex1

-1.93722 -0.44175 -0.99742 -1.32707

Degrees of Freedom: 890 Total (i.e. Null); 883 Residual

Null Deviance: 1187

Residual Deviance: 743.7 AIC: 759.7

# Apply the train.glm model to testData; type = "response" returns predictions
# on the probability scale.
p.hats <- predict.glm(train.glm, newdata = testData, type = "response")

# The results look like the output below; they still have to be converted to
# 0 or 1.
head(p.hats)

1 2 3 4 5

0.4479812 0.7447584 0.4718448 0.5010278 0.7923101

0.5922346

# Convert each predicted probability into a class label: 1 when it is greater
# than 0.5, otherwise 0. as.numeric() also drops the names carried by p.hats,
# leaving a plain vector.
survival <- as.numeric(p.hats > .5)

# Combine PassengerId and survival into kaggle.sub.
kaggle.sub <- cbind(PassengerId, survival)

# Name the columns PassengerId and Survived.
colnames(kaggle.sub) <- c("PassengerId", "Survived")

# Write the data to the file kaggle.csv, without row names.
write.csv(kaggle.sub, file = "kaggle.csv", row.names = FALSE)

Part 4: Random Forest

# randomForest() comes from the randomForest package, which must be attached
# before the call below.
library(randomForest)

# Random forests are built from random samples; fixing the seed makes the run
# repeatable.
set.seed(415)

# Same predictors as the glm model, this time fitted as a random forest.
train.rf <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + Child + Sex * Pclass + Family + Mother,
  data = trainData, importance = TRUE, ntree = 2000
)

# Add a Survived column to testData.
testData$Survived <- 0

# Use the model to predict the testData rows.
Prediction <- predict(train.rf, testData)

# Write the predictions to kaggle.csv.
survival <- Prediction
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "kaggle.csv", row.names = FALSE)

#結果 0.75120，沒比較厲害

Kaggle練習：Titanic實作（方法一）

原文出處: Titanic: Getting Started With R

Part 1:Getting Started with R

# Import train.csv first (the code below uses it as `train`, and the test set
# as `test`).

# Inspect the structure of train.
str(train)

# Count how many passengers died / survived.
table(train$Survived)

# Turn the counts into proportions.
prop.table(table(train$Survived))

# Fill test$Survived with 0 for every row; nrow(test) keeps this correct for
# any number of test rows.
test$Survived <- rep(0, nrow(test))

# Collect the predictions in submit for uploading.
submit <- data.frame(PassengerId = test$PassengerId, Survived = test$Survived)

# Write submit to a file, without row names.
write.csv(submit, file = "theyallperish.csv", row.names = FALSE)

Part 2:The Gender-Class Model

#從Sex來瞭解資料，先列出男女數量

table(train$Sex)

#從比例上來看男女/存活比例

prop.table(table(train$Sex, train$Survived))

0 1

female 0.09090909 0.26150393

male 0.52525253 0.12233446

#但是上列指令會將男生及生死拆成100%顯示，如果要分別列出男性及女性的生死比例，則要在最後加入1

prop.table(table(train$Sex, train$Survived),1)

0 1

female 0.2579618 0.7420382

male 0.8110919 0.1889081

#如果把最後參數改成2,則會生/死分別的男女比例

prop.table(table(train$Sex, train$Survived),2)

0 1

female 0.1475410 0.6812865

male 0.8524590 0.3187135

# Start by setting Survived to 0 for every row of test.
test$Survived <- 0

# Then assume that every female passenger survived.
test$Survived[test$Sex == "female"] <- 1

#用summary指令探索Age欄位，發現有177筆無資料

summary(train$Age)

Min. 1st Qu. Median Mean 3rd Qu. Max. NA's

0.42 20.12 28.00 29.70 38.00 80.00 177

# Child column: 0 by default, 1 for every passenger whose Age is under 18.
# which() skips missing ages, so those rows keep the default 0.
minor_rows <- which(train$Age < 18)
train$Child <- replace(rep(0, nrow(train)), minor_rows, 1)

# Number of survivors for each combination of Child and Sex.
aggregate(Survived ~ Child + Sex, train, sum)

Child Sex Survived

1 0 female 195

2 1 female 38

3 0 male 86

4 1 male 23

#用aggregate指令把Child和Sex的總人數分別列出

aggregate(Survived ~ Child + Sex, data=train, FUN=length)

Child Sex Survived

1 0 female 259

2 1 female 55

3 0 male 519

4 1 male 58

#用aggregate指令把Child和Sex的生還比例

aggregate(Survived ~ Child + Sex, data= train, FUN=function(x){sum(x)/length(x)})

Child Sex Survived

1 0 female 0.7528958

2 1 female 0.6909091

3 0 male 0.1657033

4 1 male 0.3965517

# Split Fare into 4 categories and store them in the new Fare2 column.
# Rows start as "30+" and are overwritten when Fare falls in a lower band.
train$Fare2 <- "30+"
train$Fare2[train$Fare < 30 & train$Fare >= 20] <- "20-30"
train$Fare2[train$Fare < 20 & train$Fare >= 10] <- "10-20"
train$Fare2[train$Fare < 10] <- "<10"

#用aggregate指令把Fare2, Pclass, Sex的生還比例列出來，找尋異常部份，如第9條有3位女生以30+價格購入最低等艙房，而且生存率異常的低

aggregate(Survived ~ Fare2 + Pclass + Sex, data=train, FUN=function(x){sum(x)/length(x)})

Fare2 Pclass Sex Survived

1 20-30 1 female 0.8333333

2 30+ 1 female 0.9772727

3 10-20 2 female 0.9142857

4 20-30 2 female 0.9000000

5 30+ 2 female 1.0000000

6 <10 3 female 0.5937500

7 10-20 3 female 0.5813953

8 20-30 3 female 0.3333333

9 30+ 3 female 0.1250000

10 <10 1 male 0.0000000

11 20-30 1 male 0.4000000

12 30+ 1 male 0.3837209

13 <10 2 male 0.0000000

14 10-20 2 male 0.1587302

15 20-30 2 male 0.1600000

16 30+ 2 male 0.2142857

17 <10 3 male 0.1115385

18 10-20 3 male 0.2368421

19 20-30 3 male 0.1250000

20 30+ 3 male 0.2400000

#將上述發現加入test$Survived清單的死亡預測

test$Survived[test$Sex == "female" & test$Pclass==3 & test$Fare >=20] <-0

#要使用Decision Tree, 必須啟動rpart套件

Part 3:Decision Trees

library(rpart)

# Build a decision tree model named fit that predicts Survived from Pclass,
# Sex, Age, SibSp, Parch, Fare and Embarked.
fit <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
             data = train, method = "class")

# Draw the decision tree of the fit model.
plot(fit)

# Add the text labels to the plot.
text(fit)

#安裝rattle, rpart.plot, RColorBrewer，用來畫更美觀的Decision Tree

install.packages('rattle')

install.packages('rpart.plot')

install.packages('RColorBrewer')

library(rattle)

library(rpart.plot)

library(RColorBrewer)

# Apply the fit model to the test data; type = "class" returns the predicted
# class for each row.
Prediction <- predict(fit, test, type = "class")

# Put the PassengerId from test and the predictions into submit.
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)

# Write submit to the file myfirstdtree.csv, without row names.
write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)

Part 4:Feature Engineering

#列出第一筆Name

train$Name[1]

[1] Braund, Mr. Owen Harris

891 Levels: Abbing, Mr. Anthony ... Zimmerman, Mr. Leo

# Child and Fare2 were added to train only (in Part 2). Remove them so that
# train and test have the same columns; rbind() refuses data frames whose
# columns differ. Assigning NULL is a no-op when a column does not exist.
train$Child <- NULL
train$Fare2 <- NULL

# Set Survived to NA in test.
test$Survived <- NA

# Stack train and test into one new data frame, combi.
combi <- rbind(train, test)

# Convert Name from factor to character.
combi$Name <- as.character(combi$Name)

#轉換後執行列出第一筆變成這樣

combi$Name[1]

[1] "Braund, Mr. Owen Harris"

#用strsplit把Name拆開

strsplit(combi$Name[1], split= '[,.]')[[1]]

[1] "Braund" " Mr" " Owen Harris"

#若要把第二個值分離出來則指令如下

strsplit(combi$Name[1], split= '[,.]')[[1]][2]

[1] " Mr"

# Extract the title from every Name into a new Title column: split on "," and
# "." and keep the second piece. vapply() guarantees one string per name.
combi$Title <- vapply(
  combi$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][2],
  character(1),
  USE.NAMES = FALSE
)

# Remove the leading space from Title.
combi$Title <- sub(" ", "", combi$Title)

# Collapse the rare titles into three groups: Mlle, Sir and Lady.
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
combi$Title[combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")] <- "Lady"

# Convert combi$Title from character to factor.
combi$Title <- factor(combi$Title)

# New FamilySize column: siblings/spouses + parents/children travelling along,
# plus the passenger.
combi$FamilySize <- combi$SibSp + combi$Parch + 1

# Extract the surname from Name: the piece before the first "," or ".".
combi$Surname <- vapply(
  combi$Name,
  function(x) strsplit(x, split = "[,.]")[[1]][1],
  character(1),
  USE.NAMES = FALSE
)

# New FamilyID column: FamilySize glued to the surname, so that members of the
# same family share an id.
combi$FamilyID <- paste0(as.character(combi$FamilySize), combi$Surname)

# Label everyone with a family size of 2 or less as "Small" to reduce the
# number of levels.
combi$FamilyID[combi$FamilySize <= 2] <- "Small"

# Tabulate how often each FamilyID occurs.
famIDs <- data.frame(table(combi$FamilyID))

# Keep the FamilyIDs that occur 2 times or fewer.
famIDs <- famIDs[famIDs$Freq <= 2, ]

# Relabel those FamilyIDs as "Small" too.
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- "Small"

# Convert FamilyID to a factor.
combi$FamilyID <- factor(combi$FamilyID)

# Split combi back into train (rows 1-891) and test (rows 892-1309).
train <- combi[1:891, ]
test <- combi[892:1309, ]

# Build the decision tree again, now including the new Title, FamilySize and
# FamilyID columns.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title +
    FamilySize + FamilyID,
  data = train, method = "class"
)

Part 5：Random Forests

#列出combi$Age的summary可以發現有263筆資料是NA, 在做random forest前必須先把NA資料填滿

summary(combi$Age)

Min. 1st Qu. Median Mean 3rd Qu. Max. NA's

0.17 21.00 28.00 29.88 39.00 80.00 263

# Fit a regression tree (method = "anova") for Age on the rows where Age is
# not NA.
Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!is.na(combi$Age), ], method = "anova"
)

# Use the Agefit model to predict Age for the rows where it is NA.
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age), ])

#Embarked有2筆空白資料

summary(combi$Embarked)

C Q S

2 270 123 914

# Use which() to find the rows whose Embarked is an empty string.
which(combi$Embarked == "")

[1] 62 830

# Fill the blank Embarked values with "S". The rows are looked up rather than
# hard-coded (they are rows 62 and 830 in the output above).
combi$Embarked[which(combi$Embarked == "")] <- "S"

# Convert Embarked to a factor again.
combi$Embarked <- factor(combi$Embarked)

#Fare裡有一筆空白資料

summary(combi$Fare)

#找出Fare裡哪筆是na

which(is.na(combi$Fare))

[1] 1044

# Fill the missing Fare with the median Fare. The row is looked up with
# is.na() rather than hard-coded (it is row 1044 in the output above).
combi$Fare[is.na(combi$Fare)] <- median(combi$Fare, na.rm = TRUE)

# To meet random forest's limit on factor levels, build FamilyID2 from
# FamilyID with the "Small" threshold raised to a family size of 3 or less.
family_id2 <- as.character(combi$FamilyID)
family_id2[combi$FamilySize <= 3] <- "Small"
combi$FamilyID2 <- factor(family_id2)

# Split combi into train and test again.
train <- combi[seq_len(891), ]
test <- combi[seq(892, 1309), ]

#安裝及啟動randomForest套件

install.packages('randomForest')

library(randomForest)

# Random forests are built from random samples; fixing the seed makes the run
# repeatable.
set.seed(415)

# Build the prediction model fit with randomForest.
fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked +
    Title + FamilySize + FamilyID2,
  data = train, importance = TRUE, ntree = 2000
)

# Plot the variable importance of the fitted forest.
varImpPlot(fit)

# Use fit to predict the test rows.
Prediction <- predict(fit, test)

# Put the predictions into submit.
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)

# Write submit to a csv file, without row names.
write.csv(submit, file = "firstforest.csv", row.names = FALSE)

最終結果

訂閱：文章 (Atom)

APPs

2014年9月23日 星期二

Kaggle練習：Titanic實作（方法二）

Kaggle練習：Titanic實作（方法一）

2014年9月23日星期二