這是在Coursera裡Johns Hopkins大學開的Exploratory Data Analysis課程中,使用R比較1999年及2012年空氣品質的case study,練習主要的重點在於資料的ECTL(Extract-Clean-Transform-Load)
# 匯入1990年資料
pm0 <- read.table("RD_501_88101_1999-0.txt", comment.char = "#", header = FALSE, sep = "|", na.strings = "")
#查看pm0的資料筆數及欄位
dim(pm0)
[1] 117421 28
#查看前6筆資料
head(pm0)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15
1 RD I 1 27 1 88101 1 7 105 120 19990103 00:00 NA AS 3
2 RD I 1 27 1 88101 1 7 105 120 19990106 00:00 NA AS 3
3 RD I 1 27 1 88101 1 7 105 120 19990109 00:00 NA AS 3
4 RD I 1 27 1 88101 1 7 105 120 19990112 00:00 8.841 <NA> 3
5 RD I 1 27 1 88101 1 7 105 120 19990115 00:00 14.920 <NA> 3
6 RD I 1 27 1 88101 1 7 105 120 19990118 00:00 3.878 <NA> 3
V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28
1 NA <NA> NA NA NA NA NA NA NA NA NA NA NA
2 NA <NA> NA NA NA NA NA NA NA NA NA NA NA
3 NA <NA> NA NA NA NA NA NA NA NA NA NA NA
4 NA <NA> NA NA NA NA NA NA NA NA NA NA NA
5 NA <NA> NA NA NA NA NA NA NA NA NA NA NA
6 NA <NA> NA NA NA NA NA NA NA NA NA NA NA
#用匯入資料的第一筆當作欄位名稱
cnames <- readLines("RD_501_88101_1999-0.txt", 1)
print(cnames)
[1] "# RD|Action Code|State Code|County Code|Site ID|Parameter|POC|Sample Duration|Unit|Method|Date|Start Time|Sample Value|Null Data Code|Sampling Frequency|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty”
#將cnames分割成
cnames <- strsplit(cnames, "|", fixed = TRUE)
print(cnames)
[[1]]
[1] "# RD"
[2] "Action Code"
[3] "State Code"
[4] "County Code"
[5] "Site ID"
[6] "Parameter"
[7] "POC"
[8] "Sample Duration"
[9] "Unit"
[10] "Method"
[11] "Date"
[12] "Start Time"
[13] "Sample Value"
[14] "Null Data Code"
[15] "Sampling Frequency"
[16] "Monitor Protocol (MP) ID"
[17] "Qualifier - 1"
[18] "Qualifier - 2"
[19] "Qualifier - 3"
[20] "Qualifier - 4"
[21] "Qualifier - 5"
[22] "Qualifier - 6"
[23] "Qualifier - 7"
[24] "Qualifier - 8"
[25] "Qualifier - 9"
[26] "Qualifier - 10"
[27] "Alternate Method Detectable Limit"
[28] “Uncertainty"
#將cnames作為pm0的欄位名稱
names(pm0) <- make.names(cnames[[1]])
head(pm0)
#把Sample.Value的資料匯入x0
x0 <- pm0$Sample.Value
#確認x0的class是numeric
class(x0)
[1] “numeric"
#確認x0的結構
str(x0)
num [1:117421] NA NA NA 8.84 14.92 …
#列出x0的統計摘要
summary(x0)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 7.20 11.50 13.74 17.90 157.10 13217
#列出x0缺少資料的比例
mean(is.na(x0))
[1] 0.1125608
## 匯入2012年資料
pm1 <- read.table("RD_501_88101_2012-0.txt", comment.char = "#", header = FALSE, sep = "|", na.strings = "", nrow = 1304290)
#把cnames當作pm1的欄位名稱
names(pm1) <- make.names(cnames[[1]])
head(pm1)
dim(pm1)
[1] 1304287 28
#將2012年的sample value匯出至x1
x1 <- pm1$Sample.Value
class(x1)
#比較x1, x0的摘要資料
summary(x1)
summary(x0)
#x1缺少資料的比例
mean(is.na(x1))
[1] 0.05607125
## Make a boxplot of both 1999 and 2012
boxplot(x0, x1)
#將x0, x1 log10
boxplot(log10(x0), log10(x1))
## 理論上x1不應該有負值,但是x1最小值是-10
summary(x1)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-10.00 4.00 7.63 9.14 12.00 909.00 73133
#將x1的負值寫入negative
negative <- x1 < 0
#共有負值
sum(negative, na.rm = T)
[1] 26474
#負值佔的比例
mean(negative, na.rm = T)
[1] 0.0215034
#將pm1$Date匯出成dates
dates <- pm1$Date
#dates是int
str(dates)
int [1:1304287] 20120101 20120104 20120107 20120110 20120113 20120116 20120119 20120122 20120125 20120128 ...
#將dates轉換為日期格式,才可以做為時間上的分析
dates <- as.Date(as.character(dates), "%Y%m%d")
str(dates)
Date[1:1304287], format: "2012-01-01" "2012-01-04" "2012-01-07" …
hist(dates, "month") ## Check what's going on in months 1--6
# 使用subset選擇紐約的資料中的County.Code和Site.ID, 再用unique讓每筆同樣的組合只出現一次
site0 <- unique(subset(pm0, State.Code == 36, c(County.Code, Site.ID)))
site1 <- unique(subset(pm1, State.Code == 36, c(County.Code, Site.ID)))
#用paste把2個欄位合併
site0 <- paste(site0[,1], site0[,2], sep = ".")
site1 <- paste(site1[,1], site1[,2], sep = ".”)
str(site0)
chr [1:33] "1.5" "1.12" "5.73" "5.80" "5.83" "5.110" "13.11" …
str(site1)
#把site0, site1都有出現的值選出來到both
both <- intersect(site0, site1)
print(both)
[1] "1.5" "1.12" "5.80" "13.11" "29.5" "31.3" "63.2008" "67.1015"
[9] "85.55" "101.3”
#新增一個新的欄位county.site, 合併County.Code和Site.ID資料成一個欄位
pm0$county.site <- with(pm0, paste(County.Code, Site.ID, sep = "."))
pm1$county.site <- with(pm1, paste(County.Code, Site.ID, sep = ".”))
#產生一個新的data frame - cnt0, 放入符合State.Code == 36而且county.site包含both內的資料的資料
cnt0 <- subset(pm0, State.Code == 36 & county.site %in% both)
cnt1 <- subset(pm1, State.Code == 36 & county.site %in% both)
sapply(split(cnt0, cnt0$county.site), nrow)
sapply(split(cnt1, cnt1$county.site), nrow)
## 選擇 county 63 and side ID 2008
pm1sub <- subset(pm1, State.Code == 36 & County.Code == 63 & Site.ID == 2008)
pm0sub <- subset(pm0, State.Code == 36 & County.Code == 63 & Site.ID == 2008)
dim(pm1sub)
dim(pm0sub)
## 繪製2012的圖
#將pm1sub$Date匯至dates1
dates1 <- pm1sub$Date
#將Sample.Value匯至x1sub
x1sub <- pm1sub$Sample.Value
#把日期和sample.value的plot畫出來, 但是date1是int格式, 所以無法依正常時間序呈現
plot(dates1, x1sub)
#將dates1轉換為日期格式
dates1 <- as.Date(as.character(dates1), "%Y%m%d”)
str(dates1)
Date[1:30], format: "2012-01-01" "2012-01-04" "2012-01-07" "2012-01-10" …
#再畫一次,OK了
plot(dates1, x1sub)
## 繪製1999年的圖
dates0 <- pm0sub$Date
dates0 <- as.Date(as.character(dates0), "%Y%m%d")
x0sub <- pm0sub$Sample.Value
plot(dates0, x0sub)
## 把2個年份的資料畫在一起,用mfrow設定一張畫布畫2張圖
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
#先畫1999年資料
plot(dates0, x0sub, pch = 20)
#畫出中位值的直線
abline(h = median(x0sub, na.rm = T))
#畫出2012年的資料
plot(dates1, x1sub, pch = 20)
#但是和1999年的資料範圍不同,無法清楚比較
abline(h = median(x1sub, na.rm = T))
## 抓出x0sub和x1sub共同的range
rng <- range(x0sub, x1sub, na.rm = T)
rng
[1] 3.0 40.1
#再重畫一次,但是將2個圖的ylim都設為剛才抓出的rng
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
plot(dates0, x0sub, pch = 20, ylim = rng)
abline(h = median(x0sub, na.rm = T))
plot(dates1, x1sub, pch = 20, ylim = rng)
abline(h = median(x1sub, na.rm = T))
## 顯示各州的平均值及趨勢
head(pm0)
#用tapply顯示以州分組資料的平均值
mn0 <- with(pm0, tapply(Sample.Value, State.Code, mean, na.rm = T))
str(mn0)
num [1:53(1d)] 19.96 6.67 10.8 15.68 17.66 …
- attr(*, "dimnames")=List of 1
..$ : chr [1:53] "1" "2" "4" "5" …
summary(mn0)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.862 9.519 12.310 12.410 15.640 19.960
mn1 <- with(pm1, tapply(Sample.Value, State.Code, mean, na.rm = T))
str(mn1)
## 將mn0中的和州和平均值匯出成一個新的data.frame
d0 <- data.frame(state = names(mn0), mean = mn0)
d1 <- data.frame(state = names(mn1), mean = mn1)
#將1999年及2012年資料合併成一個新的data.frame
mrg <- merge(d0, d1, by = "state”)
dim(mrg)
[1] 52 3
head(mrg)
state mean.x mean.y
1 1 19.956391 10.126190
2 10 14.492895 11.236059
3 11 15.786507 11.991697
4 12 11.137139 8.239690
5 13 19.943240 11.321364
6 15 4.861821 8.749336
# 把畫布改回一張圖
par(mfrow = c(1, 1))
#用point分別畫出52個州的點
with(mrg, plot(rep(1, 52), mrg[, 2], xlim = c(.5, 2.5)))
with(mrg, points(rep(2, 52), mrg[, 3]))
#將各州1999年和2012年的數據連成線,看出趨勢
segments(rep(1, 52), mrg[, 2], rep(2, 52), mrg[, 3])