|
19 | 19 |
|
20 | 20 | cleaned_portfolio = portfolio
|
21 | 21 | cleaned_portfolio = pd.concat([portfolio,ohx],axis=1)
|
| 22 | + |
| 23 | +# Data cleaning of the profile dataset |
| 24 | + |
| 25 | +# Number of NULL values in each column, as reported by: |
| 26 | +# profile.isnull().sum() |
| 27 | +''' |
| 28 | +gender 2175 |
| 29 | +age 0 |
| 30 | +id 0 |
| 31 | +became_member_on 0 |
| 32 | +income 2175 |
| 33 | +''' |
| 34 | +# Checking the age column for the rows whose gender and income are NULL shows |
| 35 | +# that the corresponding age value is 118, which is implausible. To cleanse |
| 36 | +# the data we therefore drop all such rows. |
| 37 | + |
| 38 | +# Drop every row that contains a NULL value |
| 39 | +cleaned_profile = profile |
| 40 | +cleaned_profile = cleaned_profile.dropna() |
| 41 | + |
| 42 | +# One-hot encode the gender column and replace it with the resulting dummy columns |
| 43 | +ohe = pd.get_dummies(cleaned_profile['gender']) |
| 44 | +cleaned_profile = pd.concat([cleaned_profile,ohe],axis=1) |
| 45 | +cleaned_profile = cleaned_profile.drop(['gender'],axis=1) |
| 46 | + |
| 47 | +# Parse became_member_on from its integer YYYYMMDD form into date objects so |
| 48 | +# that date arithmetic (the membership length below) can be done on it. |
| 49 | +cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y%m%d').dt.date |
| 50 | + |
| 51 | +# Add a reference-date column (hard-coded to 2020-08-28, not the actual current date) used to calculate the number of days the customer has been a member of Starbucks |
| 52 | +cleaned_profile['today_date'] = pd.to_datetime('20200828',format='%Y%m%d') |
| 53 | +cleaned_profile['today_date'] = pd.to_datetime(cleaned_profile['today_date'],format='%Y%m%d').dt.date |
| 54 | +cleaned_profile['days_of_membership'] = cleaned_profile['today_date'].sub(cleaned_profile['became_member_on'], axis=0) |
| 55 | + |
| 56 | +# Divide the date difference by one day to convert it into a number of days, then keep only the joining year in became_member_on |
| 57 | +cleaned_profile['days_of_membership'] = cleaned_profile['days_of_membership'] / np.timedelta64(1, 'D') |
| 58 | +cleaned_profile['became_member_on'] = pd.to_datetime(cleaned_profile['became_member_on'], format='%Y-%m-%d').dt.year |
| 59 | + |
| 60 | +# Drop the reference column because it is not needed for further analysis |
| 61 | +cleaned_profile = cleaned_profile.drop(['today_date'],axis=1) |
0 commit comments