import matplotlib.pyplot as plt
import seaborn as sns

+def cleaning_data():
# Importing the datasets
-portfolio = pd.read_json("portfolio.json",lines=True)
-profile = pd.read_json("profile.json",lines=True)
-transcript = pd.read_json("transcript.json",lines=True)
+    portfolio = pd.read_json("portfolio.json", lines=True)
+    profile = pd.read_json("profile.json", lines=True)
+    transcript = pd.read_json("transcript.json", lines=True)

-# Data Cleaning of portfolio dataset
-ohe = {'email':[1,1,1,1,1,1,1,1,1,1],
-    'mobile':[1,1,1,1,0,1,1,1,1,1],
-    'social':[1,1,0,0,0,1,1,1,1,0],
-    'web':[0,1,1,1,1,1,1,0,1,1]}
+    # Data cleaning of the portfolio dataset
+    # Hand-built one-hot encoding of the channels column, one row per offer,
+    # in the same order as the rows of portfolio
+    ohe = {'email':  [1,1,1,1,1,1,1,1,1,1],
+           'mobile': [1,1,1,1,0,1,1,1,1,1],
+           'social': [1,1,0,0,0,1,1,1,1,0],
+           'web':    [0,1,1,1,1,1,1,0,1,1]}
+
+    ohx = pd.DataFrame(ohe, columns=['email','mobile','social','web'])
+
+    cleaned_portfolio = pd.concat([portfolio, ohx], axis=1)
+    cleaned_portfolio = cleaned_portfolio.drop(['channels'], axis=1)
+
+    # Converting duration from days to hours for better comparison
+    cleaned_portfolio['duration'] = cleaned_portfolio['duration'] * 24

-ohx = pd.DataFrame(ohe,columns = ['email','mobile','social','web'])
+    # One-hot encoding the offer_type column
+    ohe = pd.get_dummies(cleaned_portfolio['offer_type'])
+    cleaned_portfolio = pd.concat([cleaned_portfolio, ohe], axis=1)
+    cleaned_portfolio = cleaned_portfolio.drop(['offer_type'], axis=1)
+
+    # Renaming the id column to offer_id
+    cleaned_portfolio = cleaned_portfolio.rename(columns={'id':'offer_id'})
+
+    # Data cleaning of the profile dataset

-cleaned_portfolio = portfolio
-cleaned_portfolio = pd.concat([portfolio,ohx],axis=1)
+    # To check the number of NULL values in each column:
+    # profile.isnull().sum()
+    '''
+    gender              2175
+    age                    0
+    id                     0
+    became_member_on       0
+    income              2175
+    '''
+    # Checking the age column for all the rows where gender and income are
+    # NULL shows that the corresponding age is always 118, an obvious
+    # placeholder value, so we drop all such rows.
+
+    # Dropping NULL values
+    cleaned_profile = profile.dropna()

-# Data Cleaning of profile dataset
-
-# To check the number of NULL values in each column
-# profile.isnull().sum()
-'''
-gender 2175
-age 0
-id 0
-became_member_on 0
-income 2175
-'''
-# Also on checking the age column against all the pts having gender and income
-# as Null we find that the corresponding age value is 118 which is quite
-# unusual. So in order to cleanse the data we drop all such points.
-
-# Dropping NULL values
-cleaned_profile = profile
-cleaned_profile = cleaned_profile.dropna()
-
-# OneHotEncoding the gender column
-ohe = pd.get_dummies(cleaned_profile['gender'])
-cleaned_profile = pd.concat([cleaned_profile,ohe],axis=1)
-cleaned_profile = cleaned_profile.drop(['gender'],axis=1)
-
-# To convert the became_member_on to date-time stamp because the machine will not
-# understand data corresponding to date in integer form.
-cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d').dt.date
-
-# We added a column today's date in the dataframe for refereence to calculate the no of days the customer has been a member of Starbucks
-cleaned_profile['today_date'] = pd.to_datetime('20200828',format='%Y%m%d')
-cleaned_profile['today_date'] = pd.to_datetime(cleaned_profile['today_date'],format='%Y%m%d').dt.date
-cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(cleaned_profile['became_member_on'], axis=0)
-
-# Taking a ratio of the subtracted dates to convert it into no.of.days
-cleaned_profile['days_of_membership'] = cleaned_profile['days_of_membership'] / np.timedelta64(1, 'D')
-cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year
-
-# Then we drop the reference column because it is not useful to us further analysis
-cleaned_profile = cleaned_profile.drop(['today_date'],axis=1)
+    # Renaming the id column to person_id
+    cleaned_profile = cleaned_profile.rename(columns={'id':'person_id'})
+
+    # One-hot encoding the gender column
+    ohe = pd.get_dummies(cleaned_profile['gender'])
+    cleaned_profile = pd.concat([cleaned_profile, ohe], axis=1)
+    cleaned_profile = cleaned_profile.drop(['gender'], axis=1)
+
+    # Converting became_member_on from an integer in YYYYMMDD form to a real
+    # date, since the raw integer encoding is not meaningful for analysis
+    cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d').dt.date
+
+    # Adding a reference column with today's date so we can calculate the
+    # number of days the customer has been a member of Starbucks
+    cleaned_profile['today_date'] = pd.to_datetime('20200828', format='%Y%m%d').date()
+    cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(cleaned_profile['became_member_on'], axis=0)
+
+    # Dividing the timedelta by one day to convert it into a number of days
+    cleaned_profile['days_of_membership'] = cleaned_profile['days_of_membership'] / np.timedelta64(1, 'D')
+    # Keeping only the membership year in became_member_on
+    cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year
+
+    # Dropping the reference column, which is of no use in further analysis
+    cleaned_profile = cleaned_profile.drop(['today_date'], axis=1)
+
+    # Binning age into decades and income into 10k-wide ranges
+    cleaned_profile['age_by_decade'] = pd.cut(cleaned_profile['age'], bins=range(10,120,10), right=False, labels=['10s','20s','30s','40s','50s','60s','70s','80s','90s','100s'])
+    cleaned_profile['income_range'] = pd.cut(cleaned_profile['income'], bins=range(0,120001,10000), right=False, labels=['10k','20k','30k','40k','50k','60k','70k','80k','90k','100k','110k','120k'])
+
+    # Data cleaning of transcript.json
+    cleaned_transcript = transcript
+
+    # One-hot encoding the event column
+    ohy = pd.get_dummies(cleaned_transcript['event'])
+    cleaned_transcript = pd.concat([cleaned_transcript, ohy], axis=1)
+    cleaned_transcript = cleaned_transcript.drop(['event'], axis=1)
+
+    # Removing all records of the people with NULL values which we previously
+    # dropped from profile (identifiable by age == 118)
+    profile118 = profile[profile['age'] == 118]
+    id118 = profile118['id']
+
+    cleaned_transcript = cleaned_transcript[~cleaned_transcript['person'].isin(id118)]
+
+    # Each entry of the value column holds a dict (e.g. {'offer id': ...} or
+    # {'amount': ...}); split its first key and value into separate columns
+    cleaned_transcript['record'] = cleaned_transcript.value.apply(lambda x: list(x.keys())[0])
+    cleaned_transcript['record_value'] = cleaned_transcript.value.apply(lambda x: list(x.values())[0])
+    cleaned_transcript.drop(['value'], axis=1, inplace=True)
+
+    # Splitting the transcript into monetary transactions and offer events
+    transactions = cleaned_transcript[cleaned_transcript.transaction == 1]
+    offers = cleaned_transcript[cleaned_transcript.transaction != 1]
+
+    # Cleaning transactions
+    transactions = transactions.drop(['offer completed','offer viewed','offer received'], axis=1)
+    transactions = transactions.drop(['transaction','record'], axis=1)
+    transactions = transactions.rename(columns={'record_value':'amount'})
+
+    # Cleaning offers
+    offers = offers.drop(['transaction','record'], axis=1)
+    offers = offers.rename(columns={'record_value':'offer_id'})
+
+    return cleaned_portfolio, cleaned_profile, offers, transactions
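
A minimal usage sketch, not part of the commit: it assumes the three JSON files sit in the working directory and that pandas/numpy are imported at the top of the script, as in the file above.

# Hypothetical driver code for cleaning_data()
cleaned_portfolio, cleaned_profile, offers, transactions = cleaning_data()

print(cleaned_portfolio.shape)   # one row per offer, with one-hot channel and offer_type columns
print(cleaned_profile[['person_id', 'days_of_membership', 'age_by_decade', 'income_range']].head())
print(offers.head())             # offer events keyed by person and offer_id
print(transactions.head())       # purchases keyed by person, with amount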
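The encode-concat-drop pattern applied above to offer_type, gender and event can be checked on a toy frame; this is an illustration only, with made-up values.

import pandas as pd

# Toy frame with one categorical column
df = pd.DataFrame({'offer_type': ['bogo', 'discount', 'informational']})

# One-hot encode, append the dummy columns, drop the original column
dummies = pd.get_dummies(df['offer_type'])
df = pd.concat([df, dummies], axis=1).drop(['offer_type'], axis=1)

print(df)
#    bogo  discount  informational
# 0     1         0              0
# 1     0         1              0
# 2     0         0              1
# (recent pandas versions print True/False instead of 1/0)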