2014年10月20日 星期一

Market Basket Analysis 實作

Step 1 – collecting data 
# 要使用Groceries的資料必須先安裝arules套件
install.packages(arules)
library(arules)
data(Groceries)
Step 2 – exploring and preparing the data 
# Groceries資料是由9835筆資料組合,包含169種品項
summary(Groceries)
transactions as itemMatrix in sparse format with
 9835 rows (elements/itemsets/transactions) and
 169 columns (items) and a density of 0.02609146

most frequent items:
      whole milk other vegetables       rolls/buns             soda           yogurt
            2513             1903             1809             1715             1372
         (Other)
           34055

element (itemset/transaction) length distribution:
sizes
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19
2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46   29   14   14
  20   21   22   23   24   26   27   28   29   32
   9   11    4    6    1    1    1    1    3    1

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  1.000   2.000   3.000   4.409   6.000  32.000

includes extended item information - examples:
       labels  level2           level1
1 frankfurter sausage meet and sausage
2     sausage sausage meet and sausage
3  liver loaf sausage meet and sausage

# Groceries的資料包含每一次客戶採購的品項,列出前5筆的購物車項目(用inspect(head(Groceries))也可)
inspect(Groceries[1:5])
  items                    
1 {citrus fruit,           
   semi-finished bread,    
   margarine,              
   ready soups}            
2 {tropical fruit,         
   yogurt,                 
   coffee}                 
3 {whole milk}             
4 {pip fruit,              
   yogurt,                 
   cream cheese ,          
   meat spreads}           
5 {other vegetables,       
   whole milk,             
   condensed milk,         
   long life bakery product}

# itemFrequency可列出每一項品項佔的比例
itemFrequency(Groceries[, 1:3])
frankfurter     sausage  liver loaf
0.058973055 0.093950178 0.005083884 

# 用itemFrequencyPlot繪出產品佔的比例圖,support參數是僅列出此比例的項目,如不使用則會列出所有產品品項
itemFrequencyPlot(Groceries, support = 0.1)

# 若使用topN參數則會列出依產品排名列出
itemFrequencyPlot(Groceries, topN = 20)
# 用image繪出前5筆交易的品項
image(Groceries[1:5])
# 任意sample 100筆交易紀錄的品項
image(sample(Groceries,100))


Step 3 – training a model on the data
# 若使用原本default值的support = 0.1, confidence = 0.8得到的結果是set of 0 rules
apriori(Groceries)
parameter specification:
 confidence minval smax arem  aval originalSupport support minlen maxlen
        0.8    0.1    1 none FALSE            TRUE     0.1      1     10
 target   ext
  rules FALSE

algorithmic control:
 filter tree heap memopt load sort verbose
    0.1 TRUE TRUE  FALSE TRUE    2    TRUE

apriori - find association rules with the apriori algorithm
version 4.21 (2004.05.09)        (c) 1996-2004   Christian Borgelt
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
sorting and recoding items ... [8 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing ... [0 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
set of 0 rules 

# 加入參數為 support = 0.006, confidence = 0.25)
groceryrules <- apriori(Groceries, parameter = list(support = 0.006, confidence = 0.25, minlen = 2))

# 產生set of 463 rules
groceryrules
set of 463 rules 

Step 4 – evaluating model performance 

summary(groceryrules)
set of 463 rules

rule length distribution (lhs + rhs):sizes
  2   3   4
150 297  16

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  2.000   2.000   3.000   2.711   3.000   4.000

summary of quality measures:
    support           confidence          lift      
 Min.   :0.006101   Min.   :0.2500   Min.   :0.9932 
 1st Qu.:0.007117   1st Qu.:0.2971   1st Qu.:1.6229 
 Median :0.008744   Median :0.3554   Median :1.9332 
 Mean   :0.011539   Mean   :0.3786   Mean   :2.0351 
 3rd Qu.:0.012303   3rd Qu.:0.4495   3rd Qu.:2.3565 
 Max.   :0.074835   Max.   :0.6600   Max.   :3.9565 

mining info:
      data ntransactions support confidence
 Groceries          9835   0.006       0.25


# 產生的rules看起來如下
inspect(groceryrules[1:3])
  lhs             rhs                   support confidence     lift
1 {pot plants} => {whole milk}      0.006914082  0.4000000 1.565460
2 {pasta}      => {whole milk}      0.006100661  0.4054054 1.586614
3 {herbs}      => {root vegetables} 0.007015760  0.4312500 3.956477

Step 5 – improving model performance

# 用lift的欄位排序,lift的數據是呈現一個產品和另一個產品可能被同時購買的機率
inspect(sort(groceryrules, by = "lift")[1:5])
  lhs                   rhs                      support confidence     lift
1 {herbs}            => {root vegetables}    0.007015760  0.4312500 3.956477
2 {berries}          => {whipped/sour cream} 0.009049314  0.2721713 3.796886
3 {tropical fruit,                                                         
   other vegetables,                                                       
   whole milk}       => {root vegetables}    0.007015760  0.4107143 3.768074
4 {beef,                                                                   
   other vegetables} => {root vegetables}    0.007930859  0.4020619 3.688692
5 {tropical fruit,                                                         
   other vegetables} => {pip fruit}          0.009456024  0.2634561 3.482649

# 用berries這個產品來產生一個相關聯的rules
berryrules <- subset(groceryrules, items %in% "berries”)

# 可以看出哪些和berries最常在一起被購買
inspect(berryrules)
  lhs          rhs                      support confidence     lift
1 {berries} => {whipped/sour cream} 0.009049314  0.2721713 3.796886
2 {berries} => {yogurt}             0.010574479  0.3180428 2.279848
3 {berries} => {other vegetables}   0.010269446  0.3088685 1.596280
4 {berries} => {whole milk}         0.011794611  0.3547401 1.388328

# 將rules存人竹.csv檔中
write(groceryrules, file = "groceryrules.csv",sep = ",", quote = TRUE, row.names = FALSE)

# 將rules從value轉換成data.frame格式
groceryrules_df <- as(groceryrules, "data.frame”)

str(groceryrules_df)
'data.frame':     463 obs. of  4 variables:
 $ rules     : Factor w/ 463 levels "{baking powder} => {other vegetables}",..: 237 204 128 127 129 238 317 21 89 90 ...
 $ support   : num  0.00691 0.0061 0.00702 0.00773 0.00773 ...
 $ confidence: num  0.4 0.405 0.431 0.475 0.475 ...
 $ lift      : num  1.57 1.59 3.96 2.45 1.86 …

2014年10月6日 星期一

Air Pollution Case Study

這是在Coursera裡Johns Hopkins大學開的Exploratory Data Analysis課程中,使用R比較1999年及2012年空氣品質的case study,練習主要的重點在於資料的ECTL(Extract-Clean-Transform-Load)

# 匯入1990年資料
pm0 <- read.table("RD_501_88101_1999-0.txt", comment.char = "#", header = FALSE, sep = "|", na.strings = "")

#查看pm0的資料筆數及欄位
dim(pm0)
[1] 117421     28

#查看前6筆資料
head(pm0)
  V1 V2 V3 V4 V5    V6 V7 V8  V9 V10      V11   V12    V13  V14 V15
1 RD  I  1 27  1 88101  1  7 105 120 19990103 00:00     NA   AS   3
2 RD  I  1 27  1 88101  1  7 105 120 19990106 00:00     NA   AS   3
3 RD  I  1 27  1 88101  1  7 105 120 19990109 00:00     NA   AS   3
4 RD  I  1 27  1 88101  1  7 105 120 19990112 00:00  8.841 <NA>   3
5 RD  I  1 27  1 88101  1  7 105 120 19990115 00:00 14.920 <NA>   3
6 RD  I  1 27  1 88101  1  7 105 120 19990118 00:00  3.878 <NA>   3
  V16  V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28
1  NA <NA>  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
2  NA <NA>  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
3  NA <NA>  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
4  NA <NA>  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
5  NA <NA>  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
6  NA <NA>  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA

#用匯入資料的第一筆當作欄位名稱
cnames <- readLines("RD_501_88101_1999-0.txt", 1)
print(cnames)
[1] "# RD|Action Code|State Code|County Code|Site ID|Parameter|POC|Sample Duration|Unit|Method|Date|Start Time|Sample Value|Null Data Code|Sampling Frequency|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty”

#將cnames分割成
cnames <- strsplit(cnames, "|", fixed = TRUE)
print(cnames)
[[1]]
 [1] "# RD"                            
 [2] "Action Code"                     
 [3] "State Code"                      
 [4] "County Code"                     
 [5] "Site ID"                         
 [6] "Parameter"                       
 [7] "POC"                             
 [8] "Sample Duration"                 
 [9] "Unit"                            
[10] "Method"                          
[11] "Date"                            
[12] "Start Time"                      
[13] "Sample Value"                    
[14] "Null Data Code"                  
[15] "Sampling Frequency"              
[16] "Monitor Protocol (MP) ID"        
[17] "Qualifier - 1"                   
[18] "Qualifier - 2"                   
[19] "Qualifier - 3"                   
[20] "Qualifier - 4"                   
[21] "Qualifier - 5"                   
[22] "Qualifier - 6"                   
[23] "Qualifier - 7"                   
[24] "Qualifier - 8"                   
[25] "Qualifier - 9"                   
[26] "Qualifier - 10"                  
[27] "Alternate Method Detectable Limit"
[28] “Uncertainty"

#將cnames作為pm0的欄位名稱                      
names(pm0) <- make.names(cnames[[1]])

head(pm0)

#把Sample.Value的資料匯入x0
x0 <- pm0$Sample.Value

#確認x0的class是numeric
class(x0)
[1] “numeric"

#確認x0的結構
str(x0)
num [1:117421] NA NA NA 8.84 14.92 …

#列出x0的統計摘要
summary(x0)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
   0.00    7.20   11.50   13.74   17.90  157.10   13217 

#列出x0缺少資料的比例
mean(is.na(x0)) 
[1] 0.1125608

## 匯入2012年資料
pm1 <- read.table("RD_501_88101_2012-0.txt", comment.char = "#", header = FALSE, sep = "|", na.strings = "", nrow = 1304290)

#把cnames當作pm1的欄位名稱
names(pm1) <- make.names(cnames[[1]])

head(pm1)

dim(pm1)
[1] 1304287      28

#將2012年的sample value匯出至x1
x1 <- pm1$Sample.Value

class(x1)

#比較x1, x0的摘要資料
summary(x1)
summary(x0)

#x1缺少資料的比例
mean(is.na(x1))  
[1] 0.05607125

## Make a boxplot of both 1999 and 2012
boxplot(x0, x1)
#將x0, x1 log10
boxplot(log10(x0), log10(x1))
## 理論上x1不應該有負值,但是x1最小值是-10
summary(x1)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
 -10.00    4.00    7.63    9.14   12.00  909.00   73133 

#將x1的負值寫入negative
negative <- x1 < 0

#共有負值
sum(negative, na.rm = T)
[1] 26474

#負值佔的比例
mean(negative, na.rm = T)
[1] 0.0215034

#將pm1$Date匯出成dates
dates <- pm1$Date

#dates是int
str(dates)
int [1:1304287] 20120101 20120104 20120107 20120110 20120113 20120116 20120119 20120122 20120125 20120128 ...

#將dates轉換為日期格式,才可以做為時間上的分析
dates <- as.Date(as.character(dates), "%Y%m%d")

str(dates)
Date[1:1304287], format: "2012-01-01" "2012-01-04" "2012-01-07" …

hist(dates, "month")  ## Check what's going on in months 1--6

# 使用subset選擇紐約的資料中的County.Code和Site.ID, 再用unique讓每筆同樣的組合只出現一次
site0 <- unique(subset(pm0, State.Code == 36, c(County.Code, Site.ID)))
site1 <- unique(subset(pm1, State.Code == 36, c(County.Code, Site.ID)))

#用paste把2個欄位合併
site0 <- paste(site0[,1], site0[,2], sep = ".")
site1 <- paste(site1[,1], site1[,2], sep = ".”)

str(site0)
chr [1:33] "1.5" "1.12" "5.73" "5.80" "5.83" "5.110" "13.11" …

str(site1)

#把site0, site1都有出現的值選出來到both
both <- intersect(site0, site1)

print(both)
 [1] "1.5"     "1.12"    "5.80"    "13.11"   "29.5"    "31.3"    "63.2008" "67.1015"
 [9] "85.55"   "101.3” 

#新增一個新的欄位county.site, 合併County.Code和Site.ID資料成一個欄位
pm0$county.site <- with(pm0, paste(County.Code, Site.ID, sep = "."))
pm1$county.site <- with(pm1, paste(County.Code, Site.ID, sep = ".”))

#產生一個新的data frame - cnt0, 放入符合State.Code == 36而且county.site包含both內的資料的資料
cnt0 <- subset(pm0, State.Code == 36 & county.site %in% both)
cnt1 <- subset(pm1, State.Code == 36 & county.site %in% both)

sapply(split(cnt0, cnt0$county.site), nrow)
sapply(split(cnt1, cnt1$county.site), nrow)

## 選擇 county 63 and side ID 2008
pm1sub <- subset(pm1, State.Code == 36 & County.Code == 63 & Site.ID == 2008)
pm0sub <- subset(pm0, State.Code == 36 & County.Code == 63 & Site.ID == 2008)

dim(pm1sub)
dim(pm0sub)

## 繪製2012的圖
#將pm1sub$Date匯至dates1
dates1 <- pm1sub$Date

#將Sample.Value匯至x1sub
x1sub <- pm1sub$Sample.Value

#把日期和sample.value的plot畫出來, 但是date1是int格式, 所以無法依正常時間序呈現
plot(dates1, x1sub)

#將dates1轉換為日期格式
dates1 <- as.Date(as.character(dates1), "%Y%m%d”)

str(dates1)
Date[1:30], format: "2012-01-01" "2012-01-04" "2012-01-07" "2012-01-10" …

#再畫一次,OK了
plot(dates1, x1sub)

## 繪製1999年的圖
dates0 <- pm0sub$Date
dates0 <- as.Date(as.character(dates0), "%Y%m%d")
x0sub <- pm0sub$Sample.Value
plot(dates0, x0sub)

## 把2個年份的資料畫在一起,用mfrow設定一張畫布畫2張圖
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))

#先畫1999年資料
plot(dates0, x0sub, pch = 20)

#畫出中位值的直線
abline(h = median(x0sub, na.rm = T))

#畫出2012年的資料
plot(dates1, x1sub, pch = 20) 

#但是和1999年的資料範圍不同,無法清楚比較 
abline(h = median(x1sub, na.rm = T))

## 抓出x0sub和x1sub共同的range
rng <- range(x0sub, x1sub, na.rm = T)

rng
[1]  3.0 40.1

#再重畫一次,但是將2個圖的ylim都設為剛才抓出的rng
par(mfrow = c(1, 2), mar = c(4, 4, 2, 1))
plot(dates0, x0sub, pch = 20, ylim = rng)
abline(h = median(x0sub, na.rm = T))
plot(dates1, x1sub, pch = 20, ylim = rng)
abline(h = median(x1sub, na.rm = T))

## 顯示各州的平均值及趨勢
head(pm0)

#用tapply顯示以州分組資料的平均值
mn0 <- with(pm0, tapply(Sample.Value, State.Code, mean, na.rm = T))

str(mn0)
num [1:53(1d)] 19.96 6.67 10.8 15.68 17.66 …
 - attr(*, "dimnames")=List of 1
  ..$ : chr [1:53] "1" "2" "4" "5" …

summary(mn0)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  4.862   9.519  12.310  12.410  15.640  19.960 

mn1 <- with(pm1, tapply(Sample.Value, State.Code, mean, na.rm = T))
str(mn1)

## 將mn0中的和州和平均值匯出成一個新的data.frame
d0 <- data.frame(state = names(mn0), mean = mn0)
d1 <- data.frame(state = names(mn1), mean = mn1)

#將1999年及2012年資料合併成一個新的data.frame
mrg <- merge(d0, d1, by = "state”)

dim(mrg)
[1] 52  3

head(mrg)
  state    mean.x    mean.y
1     1 19.956391 10.126190
2    10 14.492895 11.236059
3    11 15.786507 11.991697
4    12 11.137139  8.239690
5    13 19.943240 11.321364
6    15  4.861821  8.749336

# 把畫布改回一張圖
par(mfrow = c(1, 1))

#用point分別畫出52個州的點
with(mrg, plot(rep(1, 52), mrg[, 2], xlim = c(.5, 2.5)))
with(mrg, points(rep(2, 52), mrg[, 3]))

#將各州1999年和2012年的數據連成線,看出趨勢
segments(rep(1, 52), mrg[, 2], rep(2, 52), mrg[, 3])