-
Notifications
You must be signed in to change notification settings - Fork 2k
/
Copy pathrecommender_systems_association_rules.py
315 lines (269 loc) · 11.5 KB
/
recommender_systems_association_rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# %%
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from mlxtend.frequent_patterns import apriori, association_rules
from collections import Counter
# %%
# dataset = pd.read_csv("data.csv",encoding= 'unicode_escape')
dataset = pd.read_excel("Online Retail.xlsx")
dataset.head()
# %%
dataset.shape
# %%
## Verify missing value
dataset.isnull().sum().sort_values(ascending=False)
# %%
## Remove missing values
dataset1 = dataset.dropna()
dataset1.describe()
# %%
#selecting data where quantity > 0
dataset1= dataset1[dataset1.Quantity > 0]
dataset1.describe()
# %%
# Creating a new feature 'Amount' which is the product of Quantity and its Unit Price
dataset1['Amount'] = dataset1['Quantity'] * dataset1['UnitPrice']
# to highlight the Customers with most no. of orders (invoices) with groupby function
orders = dataset1.groupby(by=['CustomerID','Country'], as_index=False)['InvoiceNo'].count()
print('The TOP 5 loyal customers with most number of orders...')
orders.sort_values(by='InvoiceNo', ascending=False).head()
# %%
# Creating a subplot of size 15x6
plt.subplots(figsize=(15,6))
# Using the style bmh for better visualization
plt.style.use('bmh')
# X axis will denote the customer ID, Y axis will denote the number of orders
plt.plot(orders.CustomerID, orders.InvoiceNo)
# Labelling the X axis
plt.xlabel('Customers ID')
# Labelling the Y axis
plt.ylabel('Number of Orders')
# Title to the plot
plt.title('Number of Orders by different Customers')
plt.show()
# %%
#Using groupby function to highlight the Customers with highest spent amount (invoices)
money = dataset1.groupby(by=['CustomerID','Country'], as_index=False)['Amount'].sum()
print('The TOP 5 profitable customers with highest money spent...')
money.sort_values(by='Amount', ascending=False).head()
# %%
# Creating a subplot of size 15*6
plt.subplots(figsize=(15,6))
# X axis will denote the customer ID, Y axis will denote the amount spent
plt.plot(money.CustomerID, money.Amount)
# Using bmh style for better visualization
plt.style.use('bmh')
# Labelling the X-axis
plt.xlabel('Customers ID')
# Labelling the Y-axis
plt.ylabel('Money spent')
# Giving a suitable title to the plot
plt.title('Money Spent by different Customers')
plt.show()
# %%
# Convert InvoiceDate from object to datetime
dataset1['InvoiceDate'] = pd.to_datetime(dataset.InvoiceDate, format='%m/%d/%Y %H:%M')
# Creating a new feature called year_month, such that December 2010 will be denoted as 201012
dataset1.insert(loc=2, column='year_month', value=dataset1['InvoiceDate'].map(lambda x: 100*x.year + x.month))
# Creating a new feature for Month
dataset1.insert(loc=3, column='month', value=dataset1.InvoiceDate.dt.month)
# Creating a new feature for Day
# +1 to make Monday=1.....until Sunday=7
dataset1.insert(loc=4, column='day', value=(dataset1.InvoiceDate.dt.dayofweek)+1)
# Creating a new feature for Hour
dataset1.insert(loc=5, column='hour', value=dataset1.InvoiceDate.dt.hour)
# %%
# Using bmh style for better visualization
plt.style.use('bmh')
# Using groupby to extract No. of Invoices year-monthwise
ax = dataset1.groupby('InvoiceNo')['year_month'].unique().value_counts().sort_index().plot(kind='bar',figsize=(15,6))
# Labelling the X axis
ax.set_xlabel('Month',fontsize=15)
# Labelling the Y-axis
ax.set_ylabel('Number of Orders',fontsize=15)
# Giving suitable title to the plot
ax.set_title('Number of orders for different Months (Dec 2010 - Dec 2011)',fontsize=15)
# Providing with X tick labels
ax.set_xticklabels(('Dec_10','Jan_11','Feb_11','Mar_11','Apr_11','May_11','Jun_11','July_11','Aug_11','Sep_11','Oct_11','Nov_11','Dec_11'), rotation='horizontal', fontsize=13)
plt.show()
# %%
# Day = 6 is Saturday.no orders placed
dataset1[dataset1['day']==6]
# %%
# Using groupby to count no. of Invoices daywise
ax = dataset1.groupby('InvoiceNo')['day'].unique().value_counts().sort_index().plot(kind='bar',figsize=(15,6))
# Labelling X axis
ax.set_xlabel('Day',fontsize=15)
# Labelling Y axis
ax.set_ylabel('Number of Orders',fontsize=15)
# Giving suitable title to the plot
ax.set_title('Number of orders for different Days',fontsize=15)
# Providing with X tick labels
# Since there are no orders placed on Saturdays, we are excluding Sat from xticklabels
ax.set_xticklabels(('Mon','Tue','Wed','Thur','Fri','Sun'), rotation='horizontal', fontsize=15)
plt.show()
# %%
# Using groupby to count the no. of Invoices hourwise
ax = dataset1.groupby('InvoiceNo')['hour'].unique().value_counts().iloc[:-2].sort_index().plot(kind='bar',figsize=(15,6))
# Labelling X axis
ax.set_xlabel('Hour',fontsize=15)
# Labelling Y axis
ax.set_ylabel('Number of Orders',fontsize=15)
# Giving suitable title to the plot
ax.set_title('Number of orders for different Hours', fontsize=15)
# Providing with X tick lables ( all orders are placed between 6 and 20 hour )
ax.set_xticklabels(range(6,21), rotation='horizontal', fontsize=15)
plt.show()
# %%
dataset1.UnitPrice.describe()
# %%
# checking the distribution of unit price
plt.subplots(figsize=(12,6))
# Using darkgrid style for better visualization
sns.set_style('darkgrid')
# Applying boxplot visualization on Unit Price
sns.boxplot(dataset1.UnitPrice)
plt.show()
# %%
# Creating a new df of free items
freeproducts = dataset1[dataset1['UnitPrice'] == 0]
freeproducts.head()
# %%
# Counting how many free items were given out year-month wise
freeproducts.year_month.value_counts().sort_index()
# %%
# Counting how many free items were given out year-month wise
ax = freeproducts.year_month.value_counts().sort_index().plot(kind='bar',figsize=(12,6))
# Labelling X-axis
ax.set_xlabel('Month',fontsize=15)
# Labelling Y-axis
ax.set_ylabel('Frequency',fontsize=15)
# Giving suitable title to the plot
ax.set_title('Frequency for different Months (Dec 2010 - Dec 2011)',fontsize=15)
# Providing X tick labels
# Since there are 0 free items in June 2011, we are excluding it
ax.set_xticklabels(('Dec_10','Jan_11','Feb_11','Mar_11','Apr_11','May_11','July_11','Aug_11','Sep_11','Oct_11','Nov_11'), rotation='horizontal', fontsize=13)
plt.show()
# %%
plt.style.use('bmh')
# Using groupby to sum the amount spent year-month wise
ax = dataset1.groupby('year_month')['Amount'].sum().sort_index().plot(kind='bar',figsize=(15,6))
# Labelling X axis
ax.set_xlabel('Month',fontsize=15)
# Labelling Y axis
ax.set_ylabel('Amount',fontsize=15)
# Giving suitable title to the plot
ax.set_title('Revenue Generated for different Months (Dec 2010 - Dec 2011)',fontsize=15)
# Providing with X tick labels
ax.set_xticklabels(('Dec_10','Jan_11','Feb_11','Mar_11','Apr_11','May_11','Jun_11','July_11','Aug_11','Sep_11','Oct_11','Nov_11','Dec_11'), rotation='horizontal', fontsize=13)
plt.show()
# %%
# Creating a new pivot table which sums the Quantity ordered for each item
most_sold= dataset1.pivot_table(index=['StockCode','Description'], values='Quantity', aggfunc='sum').sort_values(by='Quantity', ascending=False)
most_sold.reset_index(inplace=True)
sns.set_style('white')
# Creating a bar plot of Description ( or the item ) on the Y axis and the sum of Quantity on the X axis
# We are plotting only the 10 most ordered items
sns.barplot(y='Description', x='Quantity', data=most_sold.head(10))
# Giving suitable title to the plot
plt.title('Top 10 Items based on No. of Sales', fontsize=14)
plt.ylabel('Item')
# %%
# choosing WHITE HANGING HEART T-LIGHT HOLDER as a sample
d_white = dataset1[dataset1['Description']=='WHITE HANGING HEART T-LIGHT HOLDER']
# %%
# WHITE HANGING HEART T-LIGHT HOLDER has been ordered 2028 times
d_white.shape
# %%
# WHITE HANGING HEART T-LIGHT HOLDER has been ordered by 856 customers
len(d_white.CustomerID.unique())
# %%
# Creating a pivot table that displays the sum of unique Customers who bought particular item
most_customers = dataset1.pivot_table(index=['StockCode','Description'], values='CustomerID', aggfunc=lambda x: len(x.unique())).sort_values(by='CustomerID', ascending=False)
most_customers
# Since the count for WHITE HANGING HEART T-LIGHT HOLDER matches above length 856, the pivot table looks correct for all items
# %%
most_customers.reset_index(inplace=True)
sns.set_style('white')
# Creating a bar plot of Description ( or the item ) on the Y axis and the sum of unique Customers on the X axis
# We are plotting only the 10 most bought items
sns.barplot(y='Description', x='CustomerID', data=most_customers.head(10))
# Giving suitable title to the plot
plt.title('Top 10 Items bought by Most no. of Customers', fontsize=14)
plt.ylabel('Item')
# %%
# Storing all the invoice numbers into a list y
y = dataset1['InvoiceNo']
y = y.to_list()
# Using set function to find unique invoice numbers only and storing them in invoices list
invoices = list(set(y))
# Creating empty list first_choices
firstchoices = []
# looping into list of unique invoice numbers
for i in invoices:
# the first item (index = 0) of every invoice is the first purchase
# extracting the item name for the first purchase
firstpurchase = dataset1[dataset1['InvoiceNo']==i]['items'].reset_index(drop=True)[0]
# Appending the first purchase name into first choices list
firstchoices.append(firstpurchase)
firstchoices[:5]
# %%
# Using counter to count repeating first choices
count = Counter(firstchoices)
# Storing the counter into a datafrane
data_first_choices = pd.DataFrame.from_dict(count, orient='index').reset_index()
# Rename columns as item and count
data_first_choices.rename(columns={'index':'item', 0:'count'},inplace=True)
# Sorting the data based on count
data_first_choices.sort_values(by='count',ascending=False)
# %%
plt.subplots(figsize=(20,10))
sns.set_style('white')
# Creating a bar plot that displays Item name on the Y axis and Count on the X axis
sns.barplot(y='item', x='count', data=data_first_choices.sort_values(by='count',ascending=False).head(10))
# Giving suitable title to the plot
plt.title('Top 10 First Choices', fontsize=14)
plt.ylabel('Item')
# %%
basket = (dataset1.groupby(['InvoiceNo', 'Description'])['Quantity'].sum().unstack().reset_index().fillna(0).set_index('InvoiceNo'))
basket.head(10)
# %%
def encode_u(x):
if x < 1:
return 0
if x >= 1:
return 1
basket = basket.applymap(encode_u)
# everything is encoded into 0 and 1
basket.head(10)
# %%
# trying out on a sample item
wooden_star = basket.loc[basket['WOODEN STAR CHRISTMAS SCANDINAVIAN']==1]
# Using apriori algorithm, creating association rules for the sample item
# Applying apriori algorithm for wooden_star
frequentitemsets = apriori(wooden_star, min_support=0.15, use_colnames=True)
# Storing the association rules into rules
wooden_star_rules = association_rules(frequentitemsets, metric="lift", min_threshold=1)
# Sorting the rules on lift and support
wooden_star_rules.sort_values(['lift','support'],ascending=False).reset_index(drop=True)
# %%
# In other words, it returns the items which are likely to be bought by user because he bought the item passed into function
def frequently_bought_t(item):
# df of item passed
item_d = basket.loc[basket[item]==1]
# Applying apriori algorithm on item df
frequentitemsets = apriori(item_d, min_support=0.15, use_colnames=True)
# Storing association rules
rules = association_rules(frequentitemsets, metric="lift", min_threshold=1)
# Sorting on lift and support
rules.sort_values(['lift','support'],ascending=False).reset_index(drop=True)
print('Items frequently bought together with {0}'.format(item))
# Returning top 6 items with highest lift and support
return rules['consequents'].unique()[:6]
# %%
frequently_bought_t('WOODEN STAR CHRISTMAS SCANDINAVIAN')
# %%
frequently_bought_t('JAM MAKING SET WITH JARS')
# %%