Step 1 – collecting data
# The Groceries data set ships with the arules package, so arules must be
# installed before the data can be loaded. The package name has to be passed
# to install.packages() as a quoted string; the guard skips the download when
# arules is already available.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules")
}
library(arules)
data(Groceries)
Step 2 – exploring and preparing the data
# The Groceries data set consists of 9835 transactions covering 169 distinct items.
summary(Groceries)
transactions as itemMatrix in sparse format with
9835 rows (elements/itemsets/transactions) and
169 columns (items) and a density of 0.02609146
most frequent items:
whole milk other vegetables rolls/buns soda yogurt
2513 1903 1809 1715 1372
(Other)
34055
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46 29 14 14
20 21 22 23 24 26 27 28 29 32
9 11 4 6 1 1 1 1 3 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 3.000 4.409 6.000 32.000
includes extended item information - examples:
labels level2 level1
1 frankfurter sausage meet and sausage
2 sausage sausage meet and sausage
3 liver loaf sausage meet and sausage
# Each Groceries transaction holds the items from one customer purchase; list
# the basket contents of the first 5 transactions (inspect(head(Groceries)) is
# a similar shortcut, though head() defaults to 6 rows).
inspect(Groceries[1:5])
items
1 {citrus fruit,
semi-finished bread,
margarine,
ready soups}
2 {tropical fruit,
yogurt,
coffee}
3 {whole milk}
4 {pip fruit,
yogurt,
cream cheese ,
meat spreads}
5 {other vegetables,
whole milk,
condensed milk,
long life bakery product}
# itemFrequency reports the share of transactions that contain each item
# (shown here for the first 3 items only).
itemFrequency(Groceries[, 1:3])
frankfurter sausage liver loaf
0.058973055 0.093950178 0.005083884
# Bar chart of item frequencies. `support = 0.1` restricts the chart to items
# that reach that share of transactions; leave it out to draw every item.
itemFrequencyPlot(Groceries, support = 0.1)
# Alternatively, `topN` draws only the N most frequent items, in rank order.
itemFrequencyPlot(Groceries, topN = 20)
# Visualize which items appear in each of the first 5 transactions.
image(Groceries[seq_len(5)])
# Same view for a random sample of 100 transactions.
image(sample(Groceries, 100))
Step 3 – training a model on the data
# With the default thresholds (support = 0.1, confidence = 0.8) apriori finds
# nothing: the result is a set of 0 rules.
apriori(Groceries)
parameter specification:
confidence minval smax arem aval originalSupport support minlen maxlen
0.8 0.1 1 none FALSE TRUE 0.1 1 10
target ext
rules FALSE
algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
apriori - find association rules with the apriori algorithm
version 4.21 (2004.05.09) (c) 1996-2004 Christian Borgelt
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
sorting and recoding items ... [8 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing ... [0 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
set of 0 rules
# Lower the thresholds (support = 0.006, confidence = 0.25) and set minlen = 2
# so that every rule contains at least two items.
groceryrules <- apriori(
  Groceries,
  parameter = list(support = 0.006, confidence = 0.25, minlen = 2)
)
# Printing the result reports a set of 463 rules.
groceryrules
set of 463 rules
Step 4 – evaluating model performance
# Summarize the mined rules: rule-length distribution plus the support,
# confidence and lift quality measures.
summary(groceryrules)
set of 463 rules
rule length distribution (lhs + rhs):sizes
2 3 4
150 297 16
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.000 2.000 3.000 2.711 3.000 4.000
summary of quality measures:
support confidence lift
Min. :0.006101 Min. :0.2500 Min. :0.9932
1st Qu.:0.007117 1st Qu.:0.2971 1st Qu.:1.6229
Median :0.008744 Median :0.3554 Median :1.9332
Mean :0.011539 Mean :0.3786 Mean :2.0351
3rd Qu.:0.012303 3rd Qu.:0.4495 3rd Qu.:2.3565
Max. :0.074835 Max. :0.6600 Max. :3.9565
mining info:
data ntransactions support confidence
Groceries 9835 0.006 0.25
# Show what the generated rules look like (first three rules).
inspect(groceryrules[1:3])
lhs rhs support confidence lift
1 {pot plants} => {whole milk} 0.006914082 0.4000000 1.565460
2 {pasta} => {whole milk} 0.006100661 0.4054054 1.586614
3 {herbs} => {root vegetables} 0.007015760 0.4312500 3.956477
Step 5 – improving model performance
# Sort the rules by lift and show the top 5. Lift is a ratio, not a
# probability: it compares how often the items are bought together with how
# often that would be expected if they were purchased independently, so values
# above 1 indicate a positive association.
inspect(sort(groceryrules, by = "lift")[1:5])
lhs rhs support confidence lift
1 {herbs} => {root vegetables} 0.007015760 0.4312500 3.956477
2 {berries} => {whipped/sour cream} 0.009049314 0.2721713 3.796886
3 {tropical fruit,
other vegetables,
whole milk} => {root vegetables} 0.007015760 0.4107143 3.768074
4 {beef,
other vegetables} => {root vegetables} 0.007930859 0.4020619 3.688692
5 {tropical fruit,
other vegetables} => {pip fruit} 0.009456024 0.2634561 3.482649
# Extract the subset of rules that mention the item "berries".
# (The closing quote must be a plain ASCII double quote; a typographic quote
# leaves the string unterminated and the line fails to parse.)
berryrules <- subset(groceryrules, items %in% "berries")
# Inspect them to see which items are most often bought together with berries.
inspect(berryrules)
lhs rhs support confidence lift
1 {berries} => {whipped/sour cream} 0.009049314 0.2721713 3.796886
2 {berries} => {yogurt} 0.010574479 0.3180428 2.279848
3 {berries} => {other vegetables} 0.010269446 0.3088685 1.596280
4 {berries} => {whole milk} 0.011794611 0.3547401 1.388328
# Save the rules to a CSV file.
write(
  groceryrules,
  file = "groceryrules.csv",
  sep = ",",
  quote = TRUE,
  row.names = FALSE
)
# Convert the rules object into a data.frame. The class name must be closed
# with a plain ASCII double quote, otherwise the line does not parse.
groceryrules_df <- as(groceryrules, "data.frame")
str(groceryrules_df)
'data.frame': 463 obs. of 4 variables:
$ rules : Factor w/ 463 levels "{baking powder} => {other vegetables}",..: 237 204 128 127 129 238 317 21 89 90 ...
$ support : num 0.00691 0.0061 0.00702 0.00773 0.00773 ...
$ confidence: num 0.4 0.405 0.431 0.475 0.475 ...
$ lift : num 1.57 1.59 3.96 2.45 1.86 ...