Commit 3af37a2

Merge pull request #3 from Team-Fourth-Dimension/master

Update Notebook

2 parents: 053d3e8 + bcc9415

File tree: 2 files changed (+105 -51 lines)

Notebook.py (+100 -50)
@@ -4,58 +4,108 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 
+def cleaning_data():
 # Importing the datasets
-portfolio = pd.read_json("portfolio.json",lines=True)
-profile = pd.read_json("profile.json",lines=True)
-transcript = pd.read_json("transcript.json",lines=True)
+    portfolio = pd.read_json("portfolio.json",lines=True)
+    profile = pd.read_json("profile.json",lines=True)
+    transcript = pd.read_json("transcript.json",lines=True)
 
-# Data Cleaning of the portfolio dataset
-ohe = {'email':[1,1,1,1,1,1,1,1,1,1],
-       'mobile':[1,1,1,1,0,1,1,1,1,1],
-       'social':[1,1,0,0,0,1,1,1,1,0],
-       'web':[0,1,1,1,1,1,1,0,1,1]}
+    # Data Cleaning of the portfolio dataset
+    ohe = {'email':[1,1,1,1,1,1,1,1,1,1],
+           'mobile':[1,1,1,1,0,1,1,1,1,1],
+           'social':[1,1,0,0,0,1,1,1,1,0],
+           'web':[0,1,1,1,1,1,1,0,1,1]}
+
+    ohx = pd.DataFrame(ohe,columns = ['email','mobile','social','web'])
+
+    cleaned_portfolio = portfolio
+    cleaned_portfolio = pd.concat([portfolio,ohx],axis=1)
+    cleaned_portfolio = cleaned_portfolio.drop(['channels'],axis=1)
+
+    # Converting duration from days to hours for better comparison
+    cleaned_portfolio['duration'] = cleaned_portfolio['duration'] * 24
 
-ohx = pd.DataFrame(ohe,columns = ['email','mobile','social','web'])
+    # One-hot encoding the offer_type column
+    ohe = pd.get_dummies(cleaned_portfolio['offer_type'])
+    cleaned_portfolio = pd.concat([cleaned_portfolio,ohe],axis=1)
+    cleaned_portfolio = cleaned_portfolio.drop(['offer_type'],axis=1)
+
+    # Renaming the id column to offer_id
+    cleaned_portfolio = cleaned_portfolio.rename(columns={'id':'offer_id'})
+
+    # Data Cleaning of the profile dataset
 
-cleaned_portfolio = portfolio
-cleaned_portfolio = pd.concat([portfolio,ohx],axis=1)
+    # To check the number of NULL values in each column:
+    # profile.isnull().sum()
+    '''
+    gender              2175
+    age                    0
+    id                     0
+    became_member_on       0
+    income              2175
+    '''
+    # On checking the age column against all the points having gender and income
+    # as NULL, we find that the corresponding age value is 118, which is quite
+    # unusual. So in order to cleanse the data we drop all such points.
+
+    # Dropping NULL values
+    cleaned_profile = profile
+    cleaned_profile = cleaned_profile.dropna()
 
-# Data Cleaning of the profile dataset
-
-# To check the number of NULL values in each column:
-# profile.isnull().sum()
-'''
-gender              2175
-age                    0
-id                     0
-became_member_on       0
-income              2175
-'''
-# On checking the age column against all the points having gender and income
-# as NULL, we find that the corresponding age value is 118, which is quite
-# unusual. So in order to cleanse the data we drop all such points.
-
-# Dropping NULL values
-cleaned_profile = profile
-cleaned_profile = cleaned_profile.dropna()
-
-# One-hot encoding the gender column
-ohe = pd.get_dummies(cleaned_profile['gender'])
-cleaned_profile = pd.concat([cleaned_profile,ohe],axis=1)
-cleaned_profile = cleaned_profile.drop(['gender'],axis=1)
-
-# Converting became_member_on to a date-time stamp, since the model cannot
-# interpret dates stored in integer form.
-cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d').dt.date
-
-# Adding a today's-date reference column to calculate the number of days the customer has been a member of Starbucks
-cleaned_profile['today_date'] = pd.to_datetime('20200828',format='%Y%m%d')
-cleaned_profile['today_date'] = pd.to_datetime(cleaned_profile['today_date'],format='%Y%m%d').dt.date
-cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(cleaned_profile['became_member_on'], axis=0)
-
-# Dividing the timedelta by one day to convert it into a number of days
-cleaned_profile['days_of_membership'] = cleaned_profile['days_of_membership'] / np.timedelta64(1, 'D')
-cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year
-
-# Then we drop the reference column because it is not useful for further analysis
-cleaned_profile = cleaned_profile.drop(['today_date'],axis=1)
+    # Renaming the id column to person_id
+    cleaned_profile = cleaned_profile.rename(columns={'id':'person_id'})
+
+    # One-hot encoding the gender column
+    ohe = pd.get_dummies(cleaned_profile['gender'])
+    cleaned_profile = pd.concat([cleaned_profile,ohe],axis=1)
+    cleaned_profile = cleaned_profile.drop(['gender'],axis=1)
+
+    # Converting became_member_on to a date-time stamp, since the model cannot
+    # interpret dates stored in integer form.
+    cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d').dt.date
+
+    # Adding a today's-date reference column to calculate the number of days the customer has been a member of Starbucks
+    cleaned_profile['today_date'] = pd.to_datetime('20200828',format='%Y%m%d')
+    cleaned_profile['today_date'] = pd.to_datetime(cleaned_profile['today_date'],format='%Y%m%d').dt.date
+    cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(cleaned_profile['became_member_on'], axis=0)
+
+    # Dividing the timedelta by one day to convert it into a number of days
+    cleaned_profile['days_of_membership'] = cleaned_profile['days_of_membership'] / np.timedelta64(1, 'D')
+    cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year
+
+    # Then we drop the reference column because it is not useful for further analysis
+    cleaned_profile = cleaned_profile.drop(['today_date'],axis=1)
+    cleaned_profile['age_by_decade'] = pd.cut(cleaned_profile['age'], bins=range(10,120,10),right=False, labels=['10s','20s', '30s', '40s', '50s','60s', '70s', '80s', '90s', '100s'])
+    cleaned_profile['income_range'] = pd.cut(cleaned_profile['income'], bins=range(0,120001,10000),right=False, labels=['10k','20k', '30k', '40k', '50k','60k', '70k', '80k', '90k', '100k', '110k', '120k'])
+
+    # Data Cleaning of transcript.json
+    cleaned_transcript = transcript
+
+    # One-hot encoding the event column
+    ohy = pd.get_dummies(cleaned_transcript['event'])
+    cleaned_transcript = pd.concat([cleaned_transcript,ohy],axis=1)
+    cleaned_transcript = cleaned_transcript.drop(['event'],axis=1)
+
+    # Deleting all the information of the people with NULL values which we previously dropped
+    profile118 = profile[profile['age']==118]
+    id118 = profile118['id']
+
+    cleaned_transcript = cleaned_transcript[~cleaned_transcript['person'].isin(id118)]
+
+    cleaned_transcript['record'] = cleaned_transcript.value.apply(lambda x: list(x.keys())[0])
+    cleaned_transcript['record_value'] = cleaned_transcript.value.apply(lambda x: list(x.values())[0])
+    cleaned_transcript.drop(['value'], axis=1, inplace=True)
+
+    transactions = cleaned_transcript[cleaned_transcript.transaction == 1]
+    offers = cleaned_transcript[cleaned_transcript.transaction != 1]
+
+    # Cleaning transactions
+    transactions = transactions.drop(['offer completed','offer viewed','offer received'],axis=1)
+    transactions = transactions.drop(['transaction','record'],axis=1)
+    transactions = transactions.rename(columns={'record_value':'amount'})
+
+    # Cleaning offers
+    offers = offers.drop(['transaction','record'],axis=1)
+    offers = offers.rename(columns={'record_value':'offer_id'})
+
+    return cleaned_portfolio, cleaned_profile, offers, transactions
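
For reference, a minimal usage sketch of the merged function (not part of this commit): it assumes Notebook.py begins with import numpy as np and import pandas as pd, and that portfolio.json, profile.json and transcript.json sit in the working directory.

# Hypothetical usage; names below mirror the function's return order.
cleaned_portfolio, cleaned_profile, offers, transactions = cleaning_data()

# Quick sanity checks on the returned frames
print(cleaned_portfolio.shape)                       # 10 offers with one-hot channel columns
print(cleaned_profile['days_of_membership'].head())  # membership length in days
print(transactions['amount'].describe())             # spend per transaction
print(offers['offer_id'].nunique())                  # distinct offer ids in the event log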

README.md (+5 -1)
@@ -13,7 +13,11 @@
 - [x] Repositories forked and pull request created for variable identification.
 - 28th August, 2020.
 - [x] Data imported.
-- [x] Data cleaning in process.
+- [x] Data cleaning started.
+- [x] Landing page of our website created.
+- 29th August, 2020.
+- [x] Data cleaning completed.
+- [x] Start Exploratory Data Analysis.
 
 ## 📄 Abstract
 The data simulates how people make purchasing decisions and how those decisions are influenced by promotional offers.
