This project uses market basket analysis to explore frequent itemsets and association rules.
library(arules)
library(arulesViz)
library(tidyverse)
library(readxl)
library(plyr)
library(knitr)
Itemslist <- read.csv2("Assignment-1_Data.csv", check.names = FALSE)
Itemslist <- Itemslist[complete.cases(Itemslist), ]
head(Itemslist)
## BillNo Itemname Quantity Date Price
## 1 536365 WHITE HANGING HEART T-LIGHT HOLDER 6 01.12.2010 08:26 2.55
## 2 536365 WHITE METAL LANTERN 6 01.12.2010 08:26 3.39
## 3 536365 CREAM CUPID HEARTS COAT HANGER 8 01.12.2010 08:26 2.75
## 4 536365 KNITTED UNION FLAG HOT WATER BOTTLE 6 01.12.2010 08:26 3.39
## 5 536365 RED WOOLLY HOTTIE WHITE HEART. 6 01.12.2010 08:26 3.39
## 6 536365 SET 7 BABUSHKA NESTING BOXES 2 01.12.2010 08:26 7.65
## CustomerID Country
## 1 17850 United Kingdom
## 2 17850 United Kingdom
## 3 17850 United Kingdom
## 4 17850 United Kingdom
## 5 17850 United Kingdom
## 6 17850 United Kingdom
# Group items into full transactions
transactionData <- ddply(Itemslist, c("BillNo","Country"),
function(df1) paste(df1$Itemname, collapse=","))
colnames(transactionData) <- c("BillNo", "Country", "items")
# Create France & Other datasets
franceData <- subset(transactionData, Country == "France")
otherData <- subset(transactionData, Country != "France")
# Remove Country column
franceData$Country <- NULL
otherData$Country <- NULL
colnames(franceData) <- c("BillNo", "items")
colnames(otherData) <- c("BillNo", "items")
# Prepare item-only data frames for Apriori
transactionData_apriori <- data.frame(items = transactionData$items)
france_apriori <- data.frame(items = franceData$items)
other_apriori <- data.frame(items = otherData$items)
# Save CSVs
write.csv(transactionData_apriori, "transactions_basket.csv", quote=FALSE, row.names=FALSE)
write.csv(france_apriori, "france_basket.csv", quote=FALSE, row.names=FALSE)
write.csv(other_apriori, "other_basket.csv", quote=FALSE, row.names=FALSE)
# Load into transaction objects
trans_all <- read.transactions("transactions_basket.csv", format="basket", sep=",")
trans_fra <- read.transactions("france_basket.csv", format="basket", sep=",")
trans_other <- read.transactions("other_basket.csv", format="basket", sep=",")
summary(trans_all)
## transactions as itemMatrix in sparse format with
## 18164 rows (elements/itemsets/transactions) and
## 7699 columns (items) and a density of 0.002293202
##
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER REGENCY CAKESTAND 3 TIER
## 1717 1468
## JUMBO BAG RED RETROSPOT PARTY BUNTING
## 1394 1244
## ASSORTED COLOUR BIRD ORNAMENT (Other)
## 1226 313643
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1541 857 742 741 742 693 642 632 631 565 598 517 494 519 530 509
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 457 429 467 406 385 307 303 267 233 246 226 210 212 208 164 153
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 135 139 131 106 110 87 108 91 87 86 84 62 59 67 59 58
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 57 47 61 39 39 47 41 34 27 37 29 26 27 16 24 25
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 20 27 24 21 14 20 19 13 16 15 11 15 12 6 8 14
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 13 10 8 8 11 10 13 8 6 5 5 11 5 4 4 3
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 5 5 2 4 1 4 4 2 2 2 6 3 4 3 2 1
## 113 114 116 117 118 120 121 122 123 125 126 127 131 132 133 134
## 3 1 3 3 3 1 2 2 1 3 2 2 1 1 2 1
## 140 141 142 143 145 146 147 150 154 157 168 171 177 178 180 202
## 1 2 2 1 1 2 1 1 3 2 2 2 1 1 1 1
## 204 228 236 249 250 285 320 400 419
## 1 1 1 1 1 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 5.00 13.00 17.66 23.00 419.00
##
## includes extended item information - examples:
## labels
## 1 1 HANGER
## 2 10 COLOUR SPACEBOY PEN
## 3 12 COLOURED PARTY BALLOONS
summary(trans_fra)
## transactions as itemMatrix in sparse format with
## 390 rows (elements/itemsets/transactions) and
## 1523 columns (items) and a density of 0.01184067
##
## most frequent items:
## POSTAGE RABBIT NIGHT LIGHT
## 249 67
## RED TOADSTOOL LED NIGHT LIGHT PLASTERS IN TIN CIRCUS PARADE
## 61 55
## ROUND SNACK BOXES SET OF4 WOODLAND (Other)
## 54 6547
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 21 14 7 10 16 10 16 14 6 25 15 12 15 15 7 12 14 12 13 12
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 38 39 40 41
## 10 10 8 10 4 4 5 3 5 6 6 6 4 1 5 2 3 4 3 2
## 42 43 45 46 47 49 52 57 58 61 62 64 66 67 68 87 204
## 1 1 1 1 3 3 1 1 1 2 1 1 1 2 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 8.00 14.00 18.03 23.00 204.00
##
## includes extended item information - examples:
## labels
## 1 10 COLOUR SPACEBOY PEN
## 2 12 COLOURED PARTY BALLOONS
## 3 12 EGG HOUSE PAINTED WOOD
summary(trans_other)
## transactions as itemMatrix in sparse format with
## 17775 rows (elements/itemsets/transactions) and
## 7613 columns (items) and a density of 0.002317895
##
## most frequent items:
## WHITE HANGING HEART T-LIGHT HOLDER REGENCY CAKESTAND 3 TIER
## 1716 1422
## JUMBO BAG RED RETROSPOT PARTY BUNTING
## 1364 1224
## ASSORTED COLOUR BIRD ORNAMENT (Other)
## 1212 306722
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 1521 843 735 731 726 683 626 618 625 540 583 505 479 504 523 497
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
## 443 417 454 394 375 297 295 257 229 242 221 207 207 202 158 147
## 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
## 131 138 126 104 110 84 104 88 85 85 83 62 58 66 56 58
## 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 54 47 61 38 39 47 41 34 26 36 29 26 25 15 24 24
## 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 20 26 22 20 14 20 19 13 16 15 11 15 12 6 8 14
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
## 13 10 8 8 11 10 12 8 6 5 5 11 5 4 4 3
## 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
## 5 5 2 4 1 4 4 2 2 2 6 3 4 3 2 1
## 113 114 116 117 118 120 121 122 123 125 126 127 131 132 133 134
## 3 1 3 3 3 1 2 2 1 3 2 2 1 1 2 1
## 140 141 142 143 145 146 147 150 154 157 168 171 177 178 180 202
## 1 2 2 1 1 2 1 1 3 2 2 2 1 1 1 1
## 228 236 249 250 285 320 400 419
## 1 1 1 1 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 5.00 13.00 17.65 23.00 419.00
##
## includes extended item information - examples:
## labels
## 1 1 HANGER
## 2 10 COLOUR SPACEBOY PEN
## 3 12 COLOURED PARTY BALLOONS
##Everthing above in one dataframe is a tiny 3 row output that shows the printed example of item labels insidet the transaction object ##its purpose is to show a preview of item names so so you know what type of items are inside the transactions ##The bigger R Console data frame like output is a transaction length distribution that says: ## 1521 transactions contain 1 item , 843 transactions contain 2 items , 735 transactions contain 3 items , 419 items is the longest basket in the dataset (very large transaction)
rules_all <- apriori(trans_all,
parameter = list(supp = 0.01, conf = 0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 181
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7699 item(s), 18164 transaction(s)] done [0.17s].
## sorting and recoding items ... [500 item(s)] done [0.01s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [159 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
summary(rules_all)
## set of 159 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 81 74 4
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.516 3.000 4.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01002 Min. :0.5010 Min. :0.01068 Min. : 6.40
## 1st Qu.:0.01082 1st Qu.:0.5568 1st Qu.:0.01715 1st Qu.:11.37
## Median :0.01178 Median :0.6006 Median :0.01993 Median :13.70
## Mean :0.01331 Mean :0.6470 Mean :0.02121 Mean :21.50
## 3rd Qu.:0.01409 3rd Qu.:0.7030 3rd Qu.:0.02310 3rd Qu.:22.74
## Max. :0.02604 Max. :1.0000 Max. :0.04432 Max. :86.50
## count
## Min. :182.0
## 1st Qu.:196.5
## Median :214.0
## Mean :241.7
## 3rd Qu.:256.0
## Max. :473.0
##
## mining info:
## data ntransactions support confidence
## trans_all 18164 0.01 0.5
## call
## apriori(data = trans_all, parameter = list(supp = 0.01, conf = 0.5))
inspect(head(rules_all, 10))
## lhs rhs support confidence coverage lift count
## [1] {SET 3 RETROSPOT TEA} => {SUGAR} 0.01156133 1.0000000 0.01156133 86.49524 210
## [2] {SUGAR} => {SET 3 RETROSPOT TEA} 0.01156133 1.0000000 0.01156133 86.49524 210
## [3] {SET 3 RETROSPOT TEA} => {COFFEE} 0.01156133 1.0000000 0.01156133 65.57401 210
## [4] {COFFEE} => {SET 3 RETROSPOT TEA} 0.01156133 0.7581227 0.01524994 65.57401 210
## [5] {SUGAR} => {COFFEE} 0.01156133 1.0000000 0.01156133 65.57401 210
## [6] {COFFEE} => {SUGAR} 0.01156133 0.7581227 0.01524994 65.57401 210
## [7] {REGENCY TEA PLATE GREEN} => {REGENCY TEA PLATE ROSES} 0.01029509 0.8385650 0.01227703 56.20552 187
## [8] {REGENCY TEA PLATE ROSES} => {REGENCY TEA PLATE GREEN} 0.01029509 0.6900369 0.01491962 56.20552 187
## [9] {ALARM CLOCK BAKELIKE CHOCOLATE} => {ALARM CLOCK BAKELIKE RED} 0.01018498 0.7142857 0.01425897 17.79737 185
## [10] {SET/6 RED SPOTTY PAPER CUPS} => {SET/6 RED SPOTTY PAPER PLATES} 0.01139617 0.8181818 0.01392865 51.78207 207
###Most rules contained 2–3 items, showing that customers tend to purchase small item combinations ###Support
###→ Rules appear in 1%–2% of all transactions.
###Confidence ###→ Many rules are strong because confidence is 0.50 to 1.00 ###(50%–100% probability of RHS given LHS)
###Lift ###→ Lift values range from 6 to 86, meaning: ###lift > 1 = items occur together more than random chance ###lift > 10 = very strong association ###lift ~ 80 = extremely strong association
itemFrequencyPlot(trans_all,
topN = 20,
type = "absolute",
main = "Top 20 Most Frequent Items")
plot(rules_all,
measure = c("support","confidence"),
shading = "lift",
main = "Support vs Confidence (Colored by Lift)")
top20 <- head(sort(rules_all, by="lift"), 20)
plot(top20, method="graph", control=list(type="items"),
main="Top 20 Strongest Association Rules")
## Available control parameters (with default values):
## layout = stress
## circular = FALSE
## ggraphdots = NULL
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
rules_fra <- apriori(trans_fra,
parameter = list(supp = 0.01, conf = 0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 3
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1523 item(s), 390 transaction(s)] done [0.00s].
## sorting and recoding items ... [471 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 done [0.01s].
## writing ... [34463 rule(s)] done [0.00s].
## creating S4 object ... done [0.01s].
summary(rules_fra)
## set of 34463 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7 8 9
## 1 898 7810 11507 8213 4121 1527 350 36
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 4.377 5.000 9.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01026 Min. :0.5000 Min. :0.01026 Min. : 0.7831
## 1st Qu.:0.01026 1st Qu.:0.7143 1st Qu.:0.01026 1st Qu.: 6.1905
## Median :0.01026 Median :0.9167 Median :0.01282 Median : 8.4783
## Mean :0.01197 Mean :0.8523 Mean :0.01485 Mean :10.4364
## 3rd Qu.:0.01282 3rd Qu.:1.0000 3rd Qu.:0.01795 3rd Qu.:12.5806
## Max. :0.63846 Max. :1.0000 Max. :1.00000 Max. :97.5000
## count
## Min. : 4.000
## 1st Qu.: 4.000
## Median : 4.000
## Mean : 4.666
## 3rd Qu.: 5.000
## Max. :249.000
##
## mining info:
## data ntransactions support confidence
## trans_fra 390 0.01 0.5
## call
## apriori(data = trans_fra, parameter = list(supp = 0.01, conf = 0.5))
inspect(head(rules_fra, 10))
## lhs rhs support confidence coverage lift count
## [1] {} => {POSTAGE} 0.63846154 0.6384615 1.00000000 1.000000 249
## [2] {PAPER CHAIN KIT RETROSPOT} => {POSTAGE} 0.01025641 0.8000000 0.01282051 1.253012 4
## [3] {LIGHT PINK} => {FEATHER PEN} 0.01025641 1.0000000 0.01025641 97.500000 4
## [4] {FEATHER PEN} => {LIGHT PINK} 0.01025641 1.0000000 0.01025641 97.500000 4
## [5] {TRADITIONAL NAUGHTS & CROSSES} => {RABBIT NIGHT LIGHT} 0.01025641 1.0000000 0.01025641 5.820896 4
## [6] {HAIRCLIPS FORTIES FABRIC ASSORTED} => {POSTAGE} 0.01025641 1.0000000 0.01025641 1.566265 4
## [7] {ZINC FOLKART SLEIGH BELLS} => {POSTAGE} 0.01025641 1.0000000 0.01025641 1.566265 4
## [8] {SET 3 PAPER VINTAGE CHICK PAPER EGG} => {POSTAGE} 0.01282051 0.8333333 0.01538462 1.305221 5
## [9] {SET 12 COLOUR PENCILS DOLLY GIRL} => {SET 12 COLOUR PENCILS SPACEBOY} 0.01025641 0.8000000 0.01282051 39.000000 4
## [10] {SET 12 COLOUR PENCILS SPACEBOY} => {SET 12 COLOUR PENCILS DOLLY GIRL} 0.01025641 0.5000000 0.02051282 39.000000 4
rules_other <- apriori(trans_other,
parameter = list(supp = 0.01, conf = 0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 177
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7613 item(s), 17775 transaction(s)] done [0.14s].
## sorting and recoding items ... [502 item(s)] done [0.00s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 done [0.01s].
## writing ... [157 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(rules_other)
## set of 157 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 82 71 4
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 2.000 2.503 3.000 4.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01007 Min. :0.5053 Min. :0.01086 Min. : 6.479
## 1st Qu.:0.01103 1st Qu.:0.5583 1st Qu.:0.01722 1st Qu.:11.635
## Median :0.01181 Median :0.6018 Median :0.02008 Median :14.262
## Mean :0.01342 Mean :0.6465 Mean :0.02139 Mean :21.444
## 3rd Qu.:0.01406 3rd Qu.:0.6992 3rd Qu.:0.02363 3rd Qu.:23.166
## Max. :0.02633 Max. :1.0000 Max. :0.04439 Max. :85.457
## count
## Min. :179.0
## 1st Qu.:196.0
## Median :210.0
## Mean :238.5
## 3rd Qu.:250.0
## Max. :468.0
##
## mining info:
## data ntransactions support confidence
## trans_other 17775 0.01 0.5
## call
## apriori(data = trans_other, parameter = list(supp = 0.01, conf = 0.5))
inspect(head(rules_other, 10))
## lhs rhs support confidence coverage lift count
## [1] {SET 3 RETROSPOT TEA} => {SUGAR} 0.01170183 1.0000000 0.01170183 85.45673 208
## [2] {SUGAR} => {SET 3 RETROSPOT TEA} 0.01170183 1.0000000 0.01170183 85.45673 208
## [3] {SET 3 RETROSPOT TEA} => {COFFEE} 0.01170183 1.0000000 0.01170183 64.63636 208
## [4] {COFFEE} => {SET 3 RETROSPOT TEA} 0.01170183 0.7563636 0.01547117 64.63636 208
## [5] {SUGAR} => {COFFEE} 0.01170183 1.0000000 0.01170183 64.63636 208
## [6] {COFFEE} => {SUGAR} 0.01170183 0.7563636 0.01547117 64.63636 208
## [7] {REGENCY TEA PLATE GREEN} => {REGENCY TEA PLATE ROSES} 0.01046414 0.8454545 0.01237693 56.49607 186
## [8] {REGENCY TEA PLATE ROSES} => {REGENCY TEA PLATE GREEN} 0.01046414 0.6992481 0.01496484 56.49607 186
## [9] {ALARM CLOCK BAKELIKE CHOCOLATE} => {ALARM CLOCK BAKELIKE RED} 0.01007032 0.7103175 0.01417722 18.03699 179
## [10] {BACK DOOR} => {KEY FOB} 0.01085795 1.0000000 0.01085795 58.47039 193
plot(rules_fra,
measure = c("support","confidence"),
shading = "lift",
main="France: Support vs Confidence")
plot(rules_other,
measure = c("support","confidence"),
shading = "lift",
main="Other Regions: Support vs Confidence")
top_fra <- head(sort(rules_fra, by="lift"), 10)
plot(top_fra, method="graph", control=list(type="items"),
main="France: Top 10 Strongest Rules")
## Available control parameters (with default values):
## layout = stress
## circular = FALSE
## ggraphdots = NULL
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
top_other <- head(sort(rules_other, by="lift"), 10)
plot(top_other, method="graph", control=list(type="items"),
main="Other Countries: Top 10 Strongest Rules")
## Available control parameters (with default values):
## layout = stress
## circular = FALSE
## ggraphdots = NULL
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE