2014年9月23日 星期二

Kaggle練習:Titanic實作(方法二)

原文出處:Data Analytics for Beginners

Part 1

# Load the training and test sets. stringsAsFactors must stay FALSE (or, when
# importing through a GUI dialog, "Strings as factors" must be unchecked):
# later steps overwrite individual Name values with plain title strings.
trainData <- read.csv(
  file = "train.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
testData <- read.csv(
  file = "test.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Get a feel for the data: show the first six rows.
head(trainData, n = 6L)

# Density plot of passenger age; missing ages are dropped first.
plot(density(x = trainData$Age, na.rm = TRUE))

# Cross-tabulate survival (rows) against sex (columns).
counts <- table(trainData$Survived, trainData$Sex)

# Bar chart of the survival counts for each sex.
# The title string is closed with a plain ASCII double quote; a typographic
# closing quote leaves the string unterminated and the script will not parse.
barplot(
  counts,
  xlab = "Gender",
  ylab = "Number of People",
  main = "survived and deceased between male and female"
)




























# Survival rate within each sex, read from the Survived x Sex table `counts`.
# Cells are addressed by label rather than by linear position so the intent is
# visible. Assumes Survived is coded 0/1 and Sex is "female"/"male" -- TODO
# confirm against the CSV. Console output is kept as comments so the script
# still parses.
counts["1", "female"] / sum(counts[, "female"])
# [1] 0.7420382
counts["1", "male"] / sum(counts[, "male"])
# [1] 0.1889081

# Cross-tabulate survival (rows) against passenger class (columns).
Pclass_survival <- table(trainData$Survived, trainData$Pclass)

# Bar chart of the survival counts for each class.
# Two fixes: the title string now ends with an ASCII double quote (the
# typographic quote left it unterminated), and the title describes cabin class
# instead of repeating the gender chart's title.
barplot(
  Pclass_survival,
  xlab = "Cabin Class",
  ylab = "Number of People",
  main = "survived and deceased by cabin class"
)




























# Survival rate within each passenger class, read from the Survived x Pclass
# table `Pclass_survival`. In each expression the first index is the Survived
# row ("1" = survived) and the second is the class column. Assumes Survived is
# coded 0/1 and Pclass 1/2/3 -- TODO confirm against the CSV. Console output is
# kept as comments so the script still parses.
Pclass_survival["1", "1"] / sum(Pclass_survival[, "1"])  # 1st class
# [1] 0.6296296
Pclass_survival["1", "2"] / sum(Pclass_survival[, "2"])  # 2nd class
# [1] 0.4728261
Pclass_survival["1", "3"] / sum(Pclass_survival[, "3"])  # 3rd class
# [1] 0.2423625

Part 2

# Drop the columns that are not used: column 1 and columns 9 to 12
# (presumably PassengerId, Ticket, Fare, Cabin and Embarked -- confirm against
# train.csv).
trainData <- trainData[-c(1, 9:12)]

# Recode Sex: "female" becomes "1" and "male" becomes "0". The values stay
# character strings. "female" is replaced first; the second pattern is
# anchored to the start of the string.
trainData$Sex <- gsub(pattern = "female", replacement = "1", x = trainData$Sex)
trainData$Sex <- gsub(pattern = "^male", replacement = "0", x = trainData$Sex)

# Row indices of the names containing each title. fixed = TRUE matches the
# pattern literally, so "Mr." does not also match "Mrs.".
master_vector <- grep(pattern = "Master.", x = trainData$Name, fixed = TRUE)
miss_vector <- grep(pattern = "Miss.", x = trainData$Name, fixed = TRUE)
mrs_vector <- grep(pattern = "Mrs.", x = trainData$Name, fixed = TRUE)
mr_vector <- grep(pattern = "Mr.", x = trainData$Name, fixed = TRUE)
dr_vector <- grep(pattern = "Dr.", x = trainData$Name, fixed = TRUE)

#列出的資料如下
master_vector
 [1]   8  17  51  60  64  66  79 126 160 165 166 172 177
[14] 183 184 194 262 279 306 341 349 387 408 446 481 490
[27] 550 710 752 756 788 789 803 804 820 825 828 832 851
[40] 870

# Replace each full name with its bare title, using the index vectors computed
# earlier. If these assignments misbehave, the data was most likely imported
# without stringsAsFactors = FALSE. The order is kept (Master, Miss, Mrs, Mr,
# Dr), so a row listed in several index vectors ends up with the last title.
trainData$Name[master_vector] <- "Master"
trainData$Name[miss_vector] <- "Miss"
trainData$Name[mrs_vector] <- "Mrs"
trainData$Name[mr_vector] <- "Mr"
trainData$Name[dr_vector] <- "Dr"

# Mean age for each title, ignoring missing ages, rounded to two decimals.
master_age <- round(
  mean(with(trainData, Age[Name == "Master"]), na.rm = TRUE), 2
)

miss_age <- round(
  mean(with(trainData, Age[Name == "Miss"]), na.rm = TRUE), 2
)

mrs_age <- round(
  mean(with(trainData, Age[Name == "Mrs"]), na.rm = TRUE), 2
)

mr_age <- round(
  mean(with(trainData, Age[Name == "Mr"]), na.rm = TRUE), 2
)

dr_age <- round(
  mean(with(trainData, Age[Name == "Dr"]), na.rm = TRUE), 2
)

# Fill each missing age with the mean age of the passenger's title.
# Changes from the previous version:
#   * Age is addressed by name instead of the magic column index 5, so the
#     step no longer depends on the exact column layout.
#   * Only rows with a missing age are visited (no 1:nrow() loop).
#   * An unrecognised title reports the row and the title, in the same format
#     as the test-set imputation, instead of a bare "Uncaught Title".
# Rows with an unrecognised title keep their NA age.
title_age <- c(
  Master = master_age,
  Miss = miss_age,
  Mrs = mrs_age,
  Mr = mr_age,
  Dr = dr_age
)

for (i in which(is.na(trainData$Age))) {
  title <- trainData$Name[i]
  if (title %in% names(title_age)) {
    trainData$Age[i] <- title_age[[title]]
  } else {
    print(paste0("Uncaught title at: ", i))
    print(paste0("The title unrecognized was: ", title))
  }
}

# Add a "Child" column: 1 for passengers aged 12 or younger, 2 otherwise.
# Computed on the whole column at once instead of row by row. A missing age
# now gives a missing Child value; the scalar `if` in the old loop aborted
# with "missing value where TRUE/FALSE needed".
trainData$Child <- ifelse(trainData$Age <= 12, 1, 2)

# Add a "Family" column: family size = SibSp + Parch + 1 (the passenger).
trainData$Family <- trainData$SibSp + trainData$Parch + 1

# Add a "Mother" column: 1 when the title is Mrs and Parch is greater than 0,
# 2 otherwise. (The earlier comment said "0 otherwise", but the code has always
# assigned 2, matching the 1/2 coding used for Child.)
# Computed on whole columns, where the element-wise `&` is the right operator;
# the old loop used `&` inside a scalar `if`.
trainData$Mother <- ifelse(
  trainData$Name == "Mrs" & trainData$Parch > 0, 1, 2
)

# Strip the unused columns from testData and bring it into the same format as
# trainData.

# Keep the first column (a one-column data frame) for the submission file.
PassengerId <- testData[1]

# Drop column 1 and columns 8 to 11 (presumably PassengerId, Ticket, Fare,
# Cabin and Embarked -- confirm against test.csv).
testData <- testData[-c(1, 8:11)]

# Recode Sex exactly as for the training data: "female" -> "1", "male" -> "0".
testData$Sex <- gsub("female", 1, testData$Sex)
testData$Sex <- gsub("^male", 0, testData$Sex)

# Row indices of the names containing each title.
# fixed = TRUE is required: without it "." is a regex wildcard, so "Mr."
# also matches "Mrs." and every Mrs row would later be overwritten with "Mr".
# This now matches how the training data is handled.
test_master_vector <- grep("Master.", testData$Name, fixed = TRUE)
test_miss_vector <- grep("Miss.", testData$Name, fixed = TRUE)
test_mrs_vector <- grep("Mrs.", testData$Name, fixed = TRUE)
test_mr_vector <- grep("Mr.", testData$Name, fixed = TRUE)
test_dr_vector <- grep("Dr.", testData$Name, fixed = TRUE)

# Replace each full name (column 2 of testData) with its bare title, using the
# index vectors computed earlier. The order is kept (Master, Miss, Mrs, Mr,
# Dr), so a row listed in several index vectors ends up with the last title.
testData[test_master_vector, 2] <- "Master"
testData[test_miss_vector, 2] <- "Miss"
testData[test_mrs_vector, 2] <- "Mrs"
testData[test_mr_vector, 2] <- "Mr"
testData[test_dr_vector, 2] <- "Dr"

# Mean age for each title in the test set, ignoring missing ages, rounded to
# two decimals.
test_master_age <- round(
  mean(with(testData, Age[Name == "Master"]), na.rm = TRUE), 2
)
test_miss_age <- round(
  mean(with(testData, Age[Name == "Miss"]), na.rm = TRUE), 2
)
test_mrs_age <- round(
  mean(with(testData, Age[Name == "Mrs"]), na.rm = TRUE), 2
)
test_mr_age <- round(
  mean(with(testData, Age[Name == "Mr"]), na.rm = TRUE), 2
)
test_dr_age <- round(
  mean(with(testData, Age[Name == "Dr"]), na.rm = TRUE), 2
)

# Fill each missing value in column 4 of testData with the mean age of the
# title held in column 2. Rows whose title is not one of the five known ones
# are reported and left missing.
test_title_age <- c(
  Master = test_master_age,
  Miss = test_miss_age,
  Mrs = test_mrs_age,
  Mr = test_mr_age,
  Dr = test_dr_age
)

for (row in which(is.na(testData[, 4]))) {
  title <- testData[row, 2]
  if (title %in% names(test_title_age)) {
    testData[row, 4] <- test_title_age[[title]]
  } else {
    print(paste0("Uncaught title at: ", row))
    print(paste0("The title unrecognized was: ", title))
  }
}

[1] "Uncaught title at: 89"
[1] "The title unrecognized was: O'Donoghue, Ms. Bridget"

# Row 89 carries a title the imputation step did not recognise and still has
# no age; give it the mean "Miss" age by hand.
testData$Age[89] <- test_miss_age

# Add a "Child" column: 1 for passengers aged 12 or younger, 2 otherwise, the
# same coding as in trainData. The old loop assigned 1 in BOTH branches, so
# Child was constant in the test set and carried no information at prediction
# time.
testData$Child <- ifelse(testData$Age <= 12, 1, 2)

# Add a "Family" column: column 5 + column 6 + 1. Presumably these are SibSp
# and Parch, mirroring the Family definition used for trainData -- confirm
# against the test.csv layout.
testData$Family <- testData[, 5] + testData[, 6] + 1
# Add a "Mother" column: 1 when the title is Mrs and Parch is greater than 0,
# 2 otherwise, the same rule as for trainData.
# Columns are addressed by name instead of by position, and the rule is
# applied to whole columns, where the element-wise `&` is the right operator.
# NOTE(review): the old code read column 6 for the second condition; this
# assumes that column is Parch -- confirm against the test.csv layout.
testData$Mother <- ifelse(
  testData$Name == "Mrs" & testData$Parch > 0, 1, 2
)

Part 3: Prediction Model

# Fit a logistic regression (glm with family = binomial; not a linear
# regression) of Survived on Pclass, Sex, Age, Child, Family and Mother.
# Sex*Pclass adds the Sex-by-Pclass interaction on top of the two main effects.
train.glm <- glm(Survived ~ Pclass + Sex + Age + Child + Sex*Pclass + Family + Mother, family = binomial, data = trainData)

 train.glm

Call:  glm(formula = Survived ~ Pclass + Sex + Age + Child + Sex * Pclass +
    Family + Mother, family = binomial, data = trainData)

Coefficients:
(Intercept)       Pclass         Sex1          Age 
    7.77516     -0.87688      6.05753     -0.02839 
      Child       Family       Mother  Pclass:Sex1 
   -1.93722     -0.44175     -0.99742     -1.32707 

Degrees of Freedom: 890 Total (i.e. Null);  883 Residual
Null Deviance:         1187
Residual Deviance: 743.7      AIC: 759.7

# Apply the fitted train.glm model to testData. type = "response" returns
# predicted probabilities rather than values on the link scale.
# The string is closed with an ASCII double quote (a typographic quote left it
# unterminated), and the generic predict() is called instead of naming the
# predict.glm method directly.
p.hats <- predict(train.glm, newdata = testData, type = "response")

#得到的結果如下,我們必須把結果轉換成0或1
 head(p.hats)
        1         2         3         4         5
0.4479812 0.7447584 0.4718448 0.5010278 0.7923101
        6
0.5922346 

# Turn the predicted probabilities into class labels: 1 when the probability
# is greater than 0.5, 0 otherwise.
# Done in one vectorised step instead of growing `survival` element by element
# inside a 1:length() loop. as.numeric() drops the names that p.hats carries,
# so the result is a plain numeric vector as before.
survival <- as.numeric(p.hats > .5)

# Combine PassengerId and survival into kaggle.sub, with the columns named
# PassengerId and Survived.
kaggle.sub <- setNames(
  cbind(PassengerId, survival),
  c("PassengerId", "Survived")
)

# Write the submission to kaggle.csv without row names.
write.csv(x = kaggle.sub, file = "kaggle.csv", row.names = FALSE)

Part 4: Random Forest

# Build a random forest with the same predictors as the glm model.
# randomForest() comes from the randomForest package, which the script never
# loaded; load it here so the call can be resolved.
# NOTE(review): the fit is random and no seed is set, so results differ from
# run to run; consider calling set.seed() first.
# NOTE(review): Sex is a character column at this point; some R/randomForest
# versions may require it to be a factor -- confirm on the target setup.
library(randomForest)

train.rf <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + Child + Sex*Pclass + Family + Mother,
  data = trainData,
  importance = TRUE,
  ntree = 2000
)

# Add a Survived column to testData, filled with 0.
testData$Survived <- 0

# Predict the test set with the random forest model.
Prediction <- predict(object = train.rf, newdata = testData)

# Write the predictions to kaggle.csv. This is the same file the glm
# submission was written to, so it is overwritten.
survival <- Prediction
kaggle.sub <- setNames(
  cbind(PassengerId, survival),
  c("PassengerId", "Survived")
)
write.csv(x = kaggle.sub, file = "kaggle.csv", row.names = FALSE)

#結果 0.75120,沒比較厲害

0 意見:

張貼留言