# Part 1
# Load the training and test sets from the working directory.
# stringsAsFactors = FALSE keeps text columns such as Name and Sex as
# character vectors, which the later in-place rewrites rely on.
trainData <- read.csv("train.csv", header = TRUE, stringsAsFactors = FALSE)
testData <- read.csv("test.csv", header = TRUE, stringsAsFactors = FALSE)
# Get a feel for the data: show the first six rows.
head(trainData, n = 6L)
# Density plot of the Age column; missing ages are dropped before estimation.
plot(density(x = trainData$Age, na.rm = TRUE))
# Cross-tabulate Survived (rows) against Sex (columns).
counts <- table(trainData$Survived, trainData$Sex)
# Stacked bar chart of survivors and non-survivors for each sex.
barplot(counts, xlab = "Gender", ylab = "Number of People",
        main = "survived and deceased between male and female")
# Survival rate per sex: survivors (row "1") divided by the column total.
# Cells are addressed by name rather than by linear position.
counts["1", "female"] / sum(counts[, "female"])
#> [1] 0.7420382
counts["1", "male"] / sum(counts[, "male"])
#> [1] 0.1889081
# Cross-tabulate Survived (rows) against passenger class (columns).
Pclass_survival <- table(trainData$Survived, trainData$Pclass)
# Stacked bar chart of survivors and non-survivors for each class.
barplot(Pclass_survival, xlab = "Cabin Class", ylab = "Number of People",
        main = "survived and deceased by cabin class")
# Survival rate per class: survivors (row "1") divided by the column total.
Pclass_survival["1", "1"] / sum(Pclass_survival[, "1"])  # class 1
#> [1] 0.6296296
Pclass_survival["1", "2"] / sum(Pclass_survival[, "2"])  # class 2
#> [1] 0.4728261
Pclass_survival["1", "3"] / sum(Pclass_survival[, "3"])  # class 3
#> [1] 0.2423625
# Part 2
# Discard the columns the analysis does not use (positions 1 and 9 to 12).
trainData <- trainData[-c(1, 9:12)]
# Recode Sex as text digits: every "female" becomes "1", then a value that
# starts with "male" becomes "0".
trainData$Sex <- gsub("^male", "0", gsub("female", "1", trainData$Sex))
# Find the rows whose Name contains each title. fixed = TRUE makes the pattern
# a literal string, so the dot is not a wildcard and "Mr." cannot match "Mrs.".
master_vector <- grep("Master.", trainData$Name, fixed = TRUE)
miss_vector <- grep("Miss.", trainData$Name, fixed = TRUE)
mrs_vector <- grep("Mrs.", trainData$Name, fixed = TRUE)
mr_vector <- grep("Mr.", trainData$Name, fixed = TRUE)
dr_vector <- grep("Dr.", trainData$Name, fixed = TRUE)
# The matched row numbers look like this:
master_vector
#> [1] 8 17 51 60 64 66 79 126 160 165 166 172 177
#> [14] 183 184 194 262 279 306 341 349 387 408 446 481 490
#> [27] 550 710 752 756 788 789 803 804 820 825 828 832 851
#> [40] 870
# Replace every matched Name by its bare title. If this step fails, the data
# were imported without stringsAsFactors = FALSE.
# The order is kept from the original loops: a name that matched several
# patterns ends up with the title assigned last.
trainData$Name[master_vector] <- "Master"
trainData$Name[miss_vector] <- "Miss"
trainData$Name[mrs_vector] <- "Mrs"
trainData$Name[mr_vector] <- "Mr"
trainData$Name[dr_vector] <- "Dr"
# Mean age per title, rounded to two decimals; missing ages are ignored.
master_age <- round(mean(trainData$Age[trainData$Name == "Master"], na.rm = TRUE), digits = 2)
miss_age <- round(mean(trainData$Age[trainData$Name == "Miss"], na.rm = TRUE), digits = 2)
mrs_age <- round(mean(trainData$Age[trainData$Name == "Mrs"], na.rm = TRUE), digits = 2)
mr_age <- round(mean(trainData$Age[trainData$Name == "Mr"], na.rm = TRUE), digits = 2)
dr_age <- round(mean(trainData$Age[trainData$Name == "Dr"], na.rm = TRUE), digits = 2)
# Fill every missing Age with the mean age of that passenger's title.
# The lookup vector is indexed by title, so no per-row loop is needed.
title_age <- c(Master = master_age, Miss = miss_age, Mrs = mrs_age,
               Mr = mr_age, Dr = dr_age)
age_missing <- is.na(trainData$Age)
known_title <- trainData$Name %in% names(title_age)
fill_rows <- which(age_missing & known_title)
trainData$Age[fill_rows] <- unname(title_age[trainData$Name[fill_rows]])
# Rows with a missing Age and an unrecognised title keep Age = NA; report
# each of them once, as the original loop did.
for (i in which(age_missing & !known_title)) {
  print("Uncaught Title")
}
# Add a "Child" column: 1 for passengers aged 12 or younger, 2 for all others.
# Computed for the whole column at once; a missing Age yields NA here instead
# of aborting the script inside a scalar if().
trainData$Child <- ifelse(trainData$Age <= 12, 1, 2)
# Add a "Family" column holding the family size: SibSp + Parch + 1.
trainData["Family"] <- NA
trainData$Family <- trainData$SibSp + trainData$Parch + 1
# Add a "Mother" column: 1 when the title is "Mrs" and Parch is greater than
# 0, otherwise 2. The non-mother value is 2, not 0; it uses the same 1/2
# coding as the "Child" column.
trainData$Mother <- ifelse(trainData$Name == "Mrs" & trainData$Parch > 0, 1, 2)
# Prepare testData the same way as trainData.
# Keep the first column (as a one-column data frame) before it is dropped;
# it is needed again when the submission file is assembled.
PassengerId <- testData[1]
# Discard the unused columns (positions 1 and 8 to 11).
testData <- testData[-c(1, 8:11)]
# Recode Sex as text digits: "female" becomes "1", a leading "male" becomes "0".
testData$Sex <- gsub("^male", "0", gsub("female", "1", testData$Sex))
# Find the rows whose Name contains each title.
# fixed = TRUE is required: as a regular expression the dot matches any
# character, so "Mr." would also match "Mrs." and every Mrs row would be
# relabelled "Mr" below (the Mr assignment runs after the Mrs one).
test_master_vector <- grep("Master.", testData$Name, fixed = TRUE)
test_miss_vector <- grep("Miss.", testData$Name, fixed = TRUE)
test_mrs_vector <- grep("Mrs.", testData$Name, fixed = TRUE)
test_mr_vector <- grep("Mr.", testData$Name, fixed = TRUE)
test_dr_vector <- grep("Dr.", testData$Name, fixed = TRUE)
# Replace every matched Name by its bare title, in the same order as for the
# training data.
testData$Name[test_master_vector] <- "Master"
testData$Name[test_miss_vector] <- "Miss"
testData$Name[test_mrs_vector] <- "Mrs"
testData$Name[test_mr_vector] <- "Mr"
testData$Name[test_dr_vector] <- "Dr"
# Mean age per title in the test set, rounded to two decimals.
test_master_age <- round(mean(testData$Age[testData$Name == "Master"], na.rm = TRUE), digits = 2)
test_miss_age <- round(mean(testData$Age[testData$Name == "Miss"], na.rm = TRUE), digits = 2)
test_mrs_age <- round(mean(testData$Age[testData$Name == "Mrs"], na.rm = TRUE), digits = 2)
test_mr_age <- round(mean(testData$Age[testData$Name == "Mr"], na.rm = TRUE), digits = 2)
test_dr_age <- round(mean(testData$Age[testData$Name == "Dr"], na.rm = TRUE), digits = 2)
# Fill every missing Age with the mean age of that passenger's title.
test_title_age <- c(Master = test_master_age, Miss = test_miss_age,
                    Mrs = test_mrs_age, Mr = test_mr_age, Dr = test_dr_age)
test_age_missing <- is.na(testData$Age)
test_known_title <- testData$Name %in% names(test_title_age)
test_fill_rows <- which(test_age_missing & test_known_title)
testData$Age[test_fill_rows] <-
  unname(test_title_age[testData$Name[test_fill_rows]])
# Rows with a missing Age and an unrecognised title keep Age = NA; report the
# row number and the unmatched Name so they can be patched by hand.
for (i in which(test_age_missing & !test_known_title)) {
  print(paste0("Uncaught title at: ", i))
  print(paste0("The title unrecognized was: ", testData$Name[i]))
}
#> [1] "Uncaught title at: 89"
#> [1] "The title unrecognized was: O'Donoghue, Ms. Bridget"
# Row 89 carries the title "Ms.", which none of the patterns cover; it is
# given the mean "Miss" age by hand.
testData$Age[89] <- test_miss_age
# Add a "Child" column: 1 for passengers aged 12 or younger, 2 for all others.
# The original else-branch also assigned 1, which made the column constant;
# the 1/2 coding now matches the one used for the training data.
testData$Child <- ifelse(testData$Age <= 12, 1, 2)
# Add a "Family" column and fill column 8 with column 5 + column 6 + 1.
# NOTE(review): columns 5 and 6 are presumably SibSp and Parch, and column 8
# is "Family" only when the frame had seven columns beforehand; verify.
testData["Family"] <- NA
testData[, 8] <- testData[, 5] + testData[, 6] + 1
# Add a "Mother" column: 1 when the title is "Mrs" and Parch is greater than
# 0, otherwise 2 (same coding as in the training data).
# Columns are addressed by name instead of position.
# NOTE(review): assumes columns 2 and 6 of testData are Name and Parch, which
# is what the original positional code relied on; confirm.
testData$Mother <- ifelse(testData$Name == "Mrs" & testData$Parch > 0, 1, 2)
# Part 3: Prediction Model
# Fit a logistic regression (glm with family = binomial) of Survived on the
# engineered features. Sex * Pclass contributes the Sex-by-Pclass interaction;
# its main effects are already listed.
train.glm <- glm(
  Survived ~ Pclass + Sex + Age + Child + Sex * Pclass + Family + Mother,
  family = binomial,
  data = trainData
)
train.glm
#> Call: glm(formula = Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
#> Family + Mother, family = binomial, data = trainData)
#> Coefficients:
#> (Intercept) Pclass Sex1 Age
#> 7.77516 -0.87688 6.05753 -0.02839
#> Child Family Mother Pclass:Sex1
#> -1.93722 -0.44175 -0.99742 -1.32707
#> Degrees of Freedom: 890 Total (i.e. Null); 883 Residual
#> Null Deviance: 1187
#> Residual Deviance: 743.7 AIC: 759.7
# Apply the fitted model to testData; type = "response" returns fitted
# probabilities rather than values on the link scale.
p.hats <- predict(train.glm, newdata = testData, type = "response")
# The result looks like this and still has to be turned into 0 or 1.
head(p.hats)
#> 1 2 3 4 5
#> 0.4479812 0.7447584 0.4718448 0.5010278 0.7923101
#> 6
#> 0.5922346
# Classify every passenger: 1 when the probability exceeds 0.5, otherwise 0.
# A single vectorised comparison replaces growing a vector inside a loop.
survival <- as.numeric(p.hats > 0.5)
# Combine PassengerId and survival into the submission table kaggle.sub,
# label its two columns PassengerId and Survived, and write it to kaggle.csv
# without row names.
kaggle.sub <- cbind(PassengerId, survival)
names(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "kaggle.csv", row.names = FALSE)
# Part 4: Random Forest
# Build a random forest from the same predictors as the logistic model.
# randomForest() lives in the randomForest package, which has to be attached
# before the call.
library(randomForest)
train.rf <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + Child + Sex * Pclass + Family + Mother,
  data = trainData,
  importance = TRUE,
  ntree = 2000
)
# Add a Survived column, filled with 0, to testData.
# NOTE(review): presumably only a placeholder; confirm predict() needs it.
testData$Survived <- 0
# Predict the class of every test passenger with the forest.
Prediction <- predict(train.rf, testData)
# Write the predictions to kaggle.csv. This replaces the file written earlier
# from the glm predictions.
survival <- Prediction
kaggle.sub <- cbind(PassengerId, survival)
colnames(kaggle.sub) <- c("PassengerId", "Survived")
write.csv(kaggle.sub, file = "kaggle.csv", row.names = FALSE)
# Result: 0.75120, which is no improvement.