Skip to content

Commit d708fe8

Browse files
committed
added business impact hyperparams
1 parent ef0ab17 commit d708fe8

File tree

2 files changed

+3837
-2171
lines changed

2 files changed

+3837
-2171
lines changed

auto_ph.py

+169-15
Original file line numberDiff line numberDiff line change
@@ -154,13 +154,14 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
154154
:param monotone_constraints_: Dictionary of monotonicity constraints (optional).
155155
:param hyper_params_: Dictionary of hyperparameters over which to search (optional).
156156
:param search_criteria_: Dictionary of criterion for grid search (optional).
157-
:return: List of H2O GBM models trained in forward selection; list containing
158-
a coef_frame for each model.
157+
:return: Dictionary of: list of H2O GBM models trained in forward selection, list
158+
containing a coef_frame for each model, list of Shapley values for each model.
159159
"""
160160

161161
# init empty parallel lists to store results
162162
model_list = []
163163
coef_list = []
164+
shap_list = []
164165

165166
# init loop var
166167
selected = orig_x_names
@@ -198,6 +199,7 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
198199
monotone_constraints_=mc, hyper_params_=hyper_params_,
199200
search_criteria_=search_criteria_))
200201
shap_values = model_list[j].predict_contributions(hvalid).as_data_frame().values[:, :-1]
202+
shap_list.append(shap_values)
201203

202204
# update coef_frame with current model Shapley values
203205
# update coef_list
@@ -217,7 +219,7 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
217219

218220
print('Done.')
219221

220-
return model_list, coef_list
222+
return {'MODELS': model_list, 'GLOBAL_COEFS': coef_list, 'LOCAL_COEFS': shap_list}
221223

222224

223225
def plot_coefs(coef_list, model_list, title_model, column_order):
@@ -327,7 +329,7 @@ def cv_model_rank(valid, seed_, model_name_list, nfolds=5):
327329
return eval_frame
328330

329331

330-
def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
332+
def cv_model_rank_select(valid, seed_, train_results, model_prefix,
331333
compare_model_ids, nfolds=5):
332334

333335
""" Performs CV ranking for models in model_list, as compared
@@ -336,10 +338,11 @@ def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
336338
337339
:param valid: Pandas validation frame.
338340
:param seed_: Random seed for better reproducibility.
339-
:param coef_list: List containing global var. imp. coefficients
340-
for models in model list (tightly coupled to frame schemas).
341-
:param model_list: List of H2O GBM models trained in forward selection.
342-
:param model_prefix:
341+
:param train_results: Dict created by gbm_forward_select_train
342+
containing a list of models, a list of
343+
global coefficients, and a list of local
344+
coefficients.
345+
:param model_prefix: String prefix for generated model_id's.
343346
:param compare_model_ids: A list of H2O model_ids.
344347
:param nfolds: Number of folds over which to evaluate model rankings.
345348
@@ -352,13 +355,13 @@ def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
352355
rank = len(compare_model_ids) + 1
353356
best_model_frame = None
354357

355-
for i in range(0, len(model_list)):
358+
for i in range(0, len(train_results['MODELS'])):
356359

357360
# assign model_ids correctly
358361
# so models can be accessed by model_id
359362
# in cv_model_rank
360363
model_id = model_prefix + str(i+1)
361-
model_list[i].model_id = model_id
364+
train_results['MODELS'][i].model_id = model_id
362365
model_name_list = compare_model_ids + [model_id]
363366

364367
# perform CV rank eval for
@@ -374,20 +377,26 @@ def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
374377
if new_rank < rank:
375378
best_idx = i
376379
best_model_frame = eval_frame
377-
print('Evaluated model %i/%i with rank: %.2f* ...' % (i + 1, len(model_list), new_rank))
380+
print('Evaluated model %i/%i with rank: %.2f* ...' % (i + 1, len(train_results['MODELS']),
381+
new_rank))
378382
rank = new_rank
379383
else:
380-
print('Evaluated model %i/%i with rank: %.2f ...' % (i + 1, len(model_list), new_rank))
384+
print('Evaluated model %i/%i with rank: %.2f ...' % (i + 1, len(train_results['MODELS']),
385+
new_rank))
381386

382387
# select model and coefficients
383-
best_model = model_list[best_idx]
384-
best_coefs = coef_list[best_idx]
388+
best_model = train_results['MODELS'][best_idx]
389+
best_shap = train_results['LOCAL_COEFS'][best_idx]
390+
best_coefs = train_results['GLOBAL_COEFS'][best_idx]
385391

386392
print('Done.')
387393

388394
# return best model, its associated coefficients
389395
# and its CV ranking frame
390-
return best_model, best_coefs, best_model_frame
396+
return {'BEST_MODEL': best_model,
397+
'BEST_LOCAL_COEFS': best_shap,
398+
'BEST_GLOBAL_COEFS': best_coefs,
399+
'METRICS': best_model_frame}
391400

392401

393402
def pd_ice(x_name, valid, model, resolution=20, bins=None):
@@ -519,6 +528,16 @@ def plot_pd_ice(x_name, par_dep_frame, ax=None):
519528

520529
def hist_mean_pd_ice_plot(x_name, y_name, valid, pd_ice_dict):
521530

531+
""" Plots diagnostic plot of histogram with mean line overlay
532+
side-by-side with partial dependence and ICE.
533+
534+
:param x_name: Name of variable for which to plot ICE and partial dependence.
535+
:param y_name: Name of target variable.
536+
:param valid: Pandas validation frame.
537+
:param pd_ice_dict: Dict of Pandas DataFrames containing partial dependence
538+
and ICE values.
539+
"""
540+
522541
# initialize figure and axis
523542
fig, (ax, ax2) = plt.subplots(ncols=2, sharey=False)
524543
plt.tight_layout()
@@ -561,3 +580,138 @@ def hist_mean_pd_ice_plot(x_name, y_name, valid, pd_ice_dict):
561580
_ = ax2.legend(bbox_to_anchor=(1.05, 0),
562581
loc=3,
563582
borderaxespad=0.)
583+
584+
585+
def get_confusion_matrix(valid, y_name, yhat_name, by=None, level=None, cutoff=0.5):

    """ Creates confusion matrix from pandas DataFrame of y and yhat values, can be sliced
        by a variable and level.

        :param valid: Validation DataFrame of actual (y) and predicted (yhat) values.
        :param y_name: Name of actual value column.
        :param yhat_name: Name of predicted value column.
        :param by: By variable to slice frame before creating confusion matrix, default None.
        :param level: Value of by variable to slice frame before creating confusion matrix, default None.
        :param cutoff: Cutoff threshold for confusion matrix, default 0.5.

        :return: Confusion matrix as pandas DataFrame; rows are predicted levels,
                 columns are actual levels, both sorted descending.
    """

    # determine levels of target (y) variable
    # sort for consistency
    level_list = list(valid[y_name].unique())
    level_list.sort(reverse=True)

    # init confusion matrix
    cm_frame = pd.DataFrame(columns=['actual: ' + str(i) for i in level_list],
                            index=['predicted: ' + str(i) for i in level_list])

    # don't destroy original data
    frame_ = valid.copy(deep=True)

    # convert numeric predictions to binary decisions using cutoff
    dname = 'd_' + str(y_name)
    frame_[dname] = np.where(frame_[yhat_name] > cutoff, 1, 0)

    # slice frame
    # use logical `and` (not bitwise `&`) for the scalar None checks, and
    # build the mask from the working copy rather than the original frame
    if (by is not None) and (level is not None):
        frame_ = frame_[frame_[by] == level]

    # calculate size of each confusion matrix value
    # rows (j) index the predicted decision, columns (i) the actual level
    for i, lev_i in enumerate(level_list):
        for j, lev_j in enumerate(level_list):
            cm_frame.iat[j, i] = frame_[(frame_[y_name] == lev_i) & (frame_[dname] == lev_j)].shape[0]

    return cm_frame
627+
628+
629+
def air(cm_dict, reference, protected):

    """ Calculates the adverse impact ratio as a quotient between protected and
        reference group acceptance rates: protected_prop/reference_prop.
        Prints intermediate values. Tightly coupled to cm_dict.

        :param cm_dict: Dict of confusion matrices containing information
                        about reference and protected groups.
        :param reference: Name of reference group in cm_dict as a string.
        :param protected: Name of protected group in cm_dict as a string.
        :return: AIR value.
    """

    def _acceptance_rate(group):
        # second matrix row holds the accepted (predicted 0) counts
        cm = cm_dict[group]
        accepted = float(cm.iat[1, 0] + cm.iat[1, 1])
        total = float(cm.sum().sum())
        rate = accepted / total
        print(group.title() + ' proportion accepted: %.3f' % rate)
        return rate

    # summarize each group, printing as we go
    reference_prop = _acceptance_rate(reference)
    protected_prop = _acceptance_rate(protected)

    # return adverse impact ratio
    return protected_prop / reference_prop
656+
657+
658+
def marginal_effect(cm_dict, reference, protected):

    """ Calculates the marginal effect as a percentage difference between a reference and
        a protected group: reference_percent - protected_percent. Prints intermediate values.
        Tightly coupled to cm_dict.

        :param cm_dict: Dict of confusion matrices containing information
                        about reference and protected groups.
        :param reference: Name of reference group in cm_dict as a string.
        :param protected: Name of protected group in cm_dict as a string.
        :return: Marginal effect value.
    """

    def _accepted_percent(group):
        # second matrix row holds the accepted (predicted 0) counts
        cm = cm_dict[group]
        accepted = float(cm.iat[1, 0] + cm.iat[1, 1])
        total = float(cm.sum().sum())
        pct = 100 * (accepted / total)
        print(group.title() + ' accepted: %.2f%%' % pct)
        return pct

    # summarize each group, printing as we go
    reference_percent = _accepted_percent(reference)
    protected_percent = _accepted_percent(protected)

    # return marginal effect
    return reference_percent - protected_percent
686+
687+
688+
def smd(valid, x_name, yhat_name, reference, protected):

    """ Calculates standardized mean difference between a protected and reference group:
        (mean(yhat | x_j=protected) - mean(yhat | x_j=reference))/sigma(yhat).
        Prints intermediate values.

        :param valid: Pandas DataFrame containing demographic and predicted (yhat) values.
        :param x_name: Name of demographic column containing reference and protected group labels.
        :param yhat_name: Name of predicted value column.
        :param reference: Name of reference group in x_name.
        :param protected: Name of protected group in x_name.

        :return: Standardized mean difference as a float.
    """

    # yhat mean for j=reference
    reference_yhat_mean = valid[valid[x_name] == reference][yhat_name].mean()
    print(reference.title() + ' mean yhat: %.2f' % reference_yhat_mean)

    # yhat mean for j=protected
    protected_yhat_mean = valid[valid[x_name] == protected][yhat_name].mean()
    print(protected.title() + ' mean yhat: %.2f' % protected_yhat_mean)

    # std for yhat, taken over the entire frame (pandas sample std., ddof=1)
    sigma = valid[yhat_name].std()
    print(yhat_name.title() + ' std. dev.: %.2f' % sigma)

    return (protected_yhat_mean - reference_yhat_mean) / sigma

0 commit comments

Comments
 (0)