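"""Model training, inference, and evaluation utilities.

Trains a scikit-learn RandomForestClassifier, runs inference, computes
precision/recall/F-beta, evaluates the model on random data slices, and
writes per-value metrics for a fixed categorical feature to a text file.
"""
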
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import fbeta_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from ml.data import process_data


# Optional: implement hyperparameter tuning.
def train_model(X_train, y_train, random_state=42):
    """
    Trains a machine learning model and returns it.

    Inputs
    ------
    X_train : np.array
        Training data.
    y_train : np.array
        Labels.
    random_state : int
        Seed passed to the random forest for reproducibility.

    Returns
    -------
    model
        Trained machine learning model.
    """
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    return model


def compute_model_metrics(y, preds):
    """
    Validates the trained machine learning model using precision, recall, and F1.

    Inputs
    ------
    y : np.array
        Known labels, binarized.
    preds : np.array
        Predicted labels, binarized.

    Returns
    -------
    precision : float
    recall : float
    fbeta : float
    """
    fbeta = fbeta_score(y, preds, beta=1, zero_division=1)
    precision = precision_score(y, preds, zero_division=1)
    recall = recall_score(y, preds, zero_division=1)
    return precision, recall, fbeta


def inference(model, X):
    """
    Run model inferences and return the predictions.

    Inputs
    ------
    model : sklearn.ensemble.RandomForestClassifier
        Trained machine learning model.
    X : np.array
        Data used for prediction.

    Returns
    -------
    preds : np.array
        Predictions from the model.
    """
    preds = model.predict(X)
    return preds
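

# Minimal usage sketch for the three helpers above (illustrative only;
# X_train, y_train, X_test, and y_test are assumed to come from
# ml.data.process_data):
#
#     model = train_model(X_train, y_train)
#     preds = inference(model, X_test)
#     precision, recall, fbeta = compute_model_metrics(y_test, preds)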


def train_and_test_on_slices(train, test, test_size_def=0.2):
    """
    Evaluates model performance on several random train/test slices of the data.

    Inputs
    ------
    train : pd.DataFrame
        The features.
    test : pd.Series
        The labels.
    test_size_def : float
        Fraction of the data held out for testing in each slice.

    Returns
    -------
    metrics_df : pd.DataFrame
        Per-slice metrics.
    metrics_mean : pd.Series
        Mean of each metric across the slices.
    """
    metrics = []
    for i in range(10):
        print(f"slice {i}:")
        train_data, test_data, train_label, test_label = train_test_split(
            train,
            test,
            test_size=test_size_def,
            random_state=i,
        )
        model = train_model(train_data, train_label)
        predictions = model.predict(test_data)
        precision, recall, fbeta = compute_model_metrics(test_label, predictions)
        print_metrics(precision, recall, fbeta, model.score(test_data, test_label))
        metrics.append(
            [
                int(i),
                test_size_def,
                precision,
                recall,
                fbeta,
                model.score(test_data, test_label),
            ]
        )
    metrics_df = pd.DataFrame(
        metrics,
        columns=["random_state", "test_size", "precision", "recall", "f1", "accuracy"],
    )
    metrics_mean = metrics_df.mean()
    return metrics_df, metrics_mean
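
# Illustrative call (X and y assumed to be the processed features and labels):
#     metrics_df, metrics_mean = train_and_test_on_slices(X, y, test_size_def=0.2)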


def print_metrics(precision, recall, fbeta, accuracy):
    """
    Prints the evaluation metrics.

    Inputs
    ------
    precision : float
    recall : float
    fbeta : float
    accuracy : float
    """
    print(f"Precision: {round(precision, 2)}")
    print(f"Recall: {round(recall, 2)}")
    print(f"F1: {round(fbeta, 2)}")
    print(f"Accuracy: {round(accuracy, 2)}")


def evaluate_with_feature_fixed(
    model, train_data, fixed_metric, cat_features, encoder, label_binarizer
):
    """
    Computes the performance metrics when the value of a given feature is held
    fixed, writing the results for each value to a text file.

    Inputs
    ------
    model : ML model
        Trained machine learning model.
    train_data : pd.DataFrame
        The data to be used for evaluation.
    fixed_metric : str
        The name of the feature to be held fixed.
    cat_features : list
        The names of the categorical features.
    encoder : sklearn.preprocessing.OneHotEncoder
        The encoder used to encode the categorical features.
    label_binarizer : sklearn.preprocessing.LabelBinarizer
        The label binarizer used to binarize the labels.
    """
    unique_values = train_data[fixed_metric].unique()
    with open(f"Sliced_output_{fixed_metric}.txt", "w", encoding="utf-8") as file:
        file.write(f"Performance metrics for {fixed_metric}")
        file.write("\n")
        file.write("-" * 10)
        file.write("\n")
        file.write("-" * 10)
        file.write("\n")
        for fixed_slice in unique_values:
            file.write(str(fixed_slice))
            file.write("\n")
            metric_fixed_df = train_data.loc[train_data.loc[:, fixed_metric] == fixed_slice, :]
            data_processed, labels_processed, encoder, label_binarizer = process_data(
                metric_fixed_df,
                categorical_features=cat_features,
                label="salary",
                training=False,
                encoder=encoder,
                lb=label_binarizer,
            )
            predictions = inference(model, data_processed)
            precision, recall, fbeta = compute_model_metrics(labels_processed, predictions)
            file.write(f"Precision: {precision}\n")
            file.write(f"Recall: {recall}\n")
            file.write(f"fbeta: {fbeta}\n")
            file.write(f"Accuracy: {model.score(data_processed, labels_processed)}\n")
            file.write("-" * 10)
            file.write("\n")
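

# ---------------------------------------------------------------------------
# Minimal end-to-end sketch, only run when the module is executed directly.
# Everything below is illustrative: the CSV path "data/census.csv", the label
# column "salary", the cat_features list, and the "education" slice feature
# are assumptions about the surrounding project, and process_data is assumed
# to fit and return a fresh encoder/label binarizer when training=True.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    data = pd.read_csv("data/census.csv")  # assumed dataset location

    # Assumed categorical columns; replace with the project's actual list.
    cat_features = [
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "relationship",
        "race",
        "sex",
        "native-country",
    ]

    train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

    # Fit the encoder and label binarizer on the training split.
    X_train, y_train, encoder, lb = process_data(
        train_df, categorical_features=cat_features, label="salary", training=True
    )
    # Reuse them to transform the test split.
    X_test, y_test, _, _ = process_data(
        test_df,
        categorical_features=cat_features,
        label="salary",
        training=False,
        encoder=encoder,
        lb=lb,
    )

    model = train_model(X_train, y_train)
    preds = inference(model, X_test)
    precision, recall, fbeta = compute_model_metrics(y_test, preds)
    print_metrics(precision, recall, fbeta, model.score(X_test, y_test))

    # Write per-value metrics for one categorical feature to a text file.
    evaluate_with_feature_fixed(model, test_df, "education", cat_features, encoder, lb)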