-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxgboost_classifier_titanic.py
76 lines (63 loc) · 3.61 KB
/
xgboost_classifier_titanic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pandas as pd
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 891 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
Embarked 891 non-null object
FamilySize 891 non-null int64
IsAlone 891 non-null int64
Title 891 non-null object
FareBin 891 non-null category
AgeBin 891 non-null category
Sex_Code 891 non-null int64
Embarked_Code 891 non-null int64
Title_Code 891 non-null int64
AgeBin_Code 891 non-null int64
FareBin_Code 891 non-null int64
dtypes: category(2), float64(2), int64(11), object(4)
memory usage: 120.3+ KB
None
"""
from xgboost_classifier_class import XGBoostClassifier
class XGBoostClassifierTitanic(XGBoostClassifier):
    """Titanic-specific cleaning and feature engineering for ``self.df``.

    ``self.df`` is treated as a single pandas ``DataFrame``: both methods
    index it by column label and ``clean_data`` drops columns from it.
    Both methods modify ``self.df`` in place and return ``None``.
    """

    def clean_data(self):
        """Fill missing values and drop columns that are not used as features.

        * ``Age`` and ``Fare`` gaps are filled with the column median.
        * ``Embarked`` gaps are filled with the most frequent value.
        * ``PassengerId``, ``Cabin`` and ``Ticket`` are removed.

        Raises:
            KeyError: if any of the referenced columns is absent (for
                example when the method is called a second time, after the
                columns have already been dropped).
        """
        df = self.df

        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: in-place mutation of a
        # selected column is chained assignment, which newer pandas versions
        # warn about and which has no effect under Copy-on-Write.
        df['Age'] = df['Age'].fillna(df['Age'].median())
        df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
        df['Fare'] = df['Fare'].fillna(df['Fare'].median())

        # Drop in place so that ``self.df`` stays the very same object.
        drop_column = ['PassengerId', 'Cabin', 'Ticket']
        df.drop(drop_column, axis=1, inplace=True)

    def feature_engineering(self):
        """Derive the extra feature columns on ``self.df``.

        Columns added:

        * ``FamilySize`` -- ``SibSp + Parch + 1`` (the passenger included).
        * ``IsAlone``    -- 1, or 0 where ``FamilySize`` is greater than 1.
        * ``Title``      -- text between ``", "`` and the first ``"."`` in
          ``Name``; titles occurring fewer than 10 times become ``'Misc'``.
        * ``FareBin``    -- ``Fare`` split into 4 quantile bins (``pd.qcut``).
        * ``AgeBin``     -- ``Age`` split into 5 equal-width bins (``pd.cut``).

        ``Age`` must already be free of missing values because it is cast to
        ``int`` before binning; ``clean_data`` takes care of that.
        """
        df = self.df

        # Discrete variables.
        df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
        df['IsAlone'] = 1  # start from "alone" ...
        # ... and clear the flag with a single .loc call; the chained form
        # ``df['IsAlone'].loc[mask] = 0`` may write to a temporary copy.
        df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0

        # Names look like "Surname, Title. Given names": take the part after
        # the comma, then the part before the first period.
        after_comma = df['Name'].str.split(", ", expand=True)[1]
        df['Title'] = after_comma.str.split(".", expand=True)[0]

        # Continuous variables: frequency (quantile) bins for the fare,
        # value (equal-width) bins for the age.
        df['FareBin'] = pd.qcut(df['Fare'], 4)
        df['AgeBin'] = pd.cut(df['Age'].astype(int), 5)

        # Collapse rare titles into one bucket. 10 is an arbitrary but
        # commonly used minimum sample size.
        stat_min = 10
        title_counts = df['Title'].value_counts()
        rare_titles = title_counts.index[title_counts < stat_min]
        # Vectorised replacement; rows with a missing title are left as-is.
        df.loc[df['Title'].isin(rare_titles), 'Title'] = 'Misc'