@@ -154,13 +154,14 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
154
154
:param monotone_constraints_: Dictionary of monotonicity constraints (optional).
155
155
:param hyper_params_: Dictionary of hyperparameters over which to search (optional).
156
156
:param search_criteria_: Dictionary of criterion for grid search (optional).
157
- :return: List of H2O GBM models trained in forward selection; list containing
158
- a coef_frame for each model.
157
+ :return: Dictionary of: list of H2O GBM models trained in forward selection, list
158
+ containing a coef_frame for each model, list of Shapley values for each model.
159
159
"""
160
160
161
161
# init empty parallel lists to store results
162
162
model_list = []
163
163
coef_list = []
164
+ shap_list = []
164
165
165
166
# init loop var
166
167
selected = orig_x_names
@@ -198,6 +199,7 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
198
199
monotone_constraints_ = mc , hyper_params_ = hyper_params_ ,
199
200
search_criteria_ = search_criteria_ ))
200
201
shap_values = model_list [j ].predict_contributions (hvalid ).as_data_frame ().values [:, :- 1 ]
202
+ shap_list .append (shap_values )
201
203
202
204
# update coef_frame with current model Shapley values
203
205
# update coef_list
@@ -217,7 +219,7 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
217
219
218
220
print ('Done.' )
219
221
220
- return model_list , coef_list
222
+ return { 'MODELS' : model_list , 'GLOBAL_COEFS' : coef_list , 'LOCAL_COEFS' : shap_list }
221
223
222
224
223
225
def plot_coefs (coef_list , model_list , title_model , column_order ):
@@ -327,7 +329,7 @@ def cv_model_rank(valid, seed_, model_name_list, nfolds=5):
327
329
return eval_frame
328
330
329
331
330
- def cv_model_rank_select (valid , seed_ , coef_list , model_list , model_prefix ,
332
+ def cv_model_rank_select (valid , seed_ , train_results , model_prefix ,
331
333
compare_model_ids , nfolds = 5 ):
332
334
333
335
""" Performs CV ranking for models in model_list, as compared
@@ -336,10 +338,11 @@ def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
336
338
337
339
:param valid: Pandas validation frame.
338
340
:param seed_: Random seed for better reproducibility.
339
- :param coef_list: List containing global var. imp. coefficients
340
- for models in model list (tightly coupled to frame schemas).
341
- :param model_list: List of H2O GBM models trained in forward selection.
342
- :param model_prefix:
341
+ :param train_results: Dict created by gbm_forward_select_train
342
+ containing a list of models, a list of
343
+ global coefficients, and a list of local
344
+ coefficients.
345
+ :param model_prefix: String prefix for generated model_id's.
343
346
:param compare_model_ids: A list of H2O model_ids.
344
347
:param nfolds: Number of folds over which to evaluate model rankings.
345
348
@@ -352,13 +355,13 @@ def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
352
355
rank = len (compare_model_ids ) + 1
353
356
best_model_frame = None
354
357
355
- for i in range (0 , len (model_list )):
358
+ for i in range (0 , len (train_results [ 'MODELS' ] )):
356
359
357
360
# assign model_ids correctly
358
361
# so models can be accessed by model_id
359
362
# in cv_model_rank
360
363
model_id = model_prefix + str (i + 1 )
361
- model_list [i ].model_id = model_id
364
+ train_results [ 'MODELS' ] [i ].model_id = model_id
362
365
model_name_list = compare_model_ids + [model_id ]
363
366
364
367
# perform CV rank eval for
@@ -374,20 +377,26 @@ def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
374
377
if new_rank < rank :
375
378
best_idx = i
376
379
best_model_frame = eval_frame
377
- print ('Evaluated model %i/%i with rank: %.2f* ...' % (i + 1 , len (model_list ), new_rank ))
380
+ print ('Evaluated model %i/%i with rank: %.2f* ...' % (i + 1 , len (train_results ['MODELS' ]),
381
+ new_rank ))
378
382
rank = new_rank
379
383
else :
380
- print ('Evaluated model %i/%i with rank: %.2f ...' % (i + 1 , len (model_list ), new_rank ))
384
+ print ('Evaluated model %i/%i with rank: %.2f ...' % (i + 1 , len (train_results ['MODELS' ]),
385
+ new_rank ))
381
386
382
387
# select model and coefficients
383
- best_model = model_list [best_idx ]
384
- best_coefs = coef_list [best_idx ]
388
+ best_model = train_results ['MODELS' ][best_idx ]
389
+ best_shap = train_results ['LOCAL_COEFS' ][best_idx ]
390
+ best_coefs = train_results ['GLOBAL_COEFS' ][best_idx ]
385
391
386
392
print ('Done.' )
387
393
388
394
# return best model, its associated coefficients
389
395
# and its CV ranking frame
390
- return best_model , best_coefs , best_model_frame
396
+ return {'BEST_MODEL' : best_model ,
397
+ 'BEST_LOCAL_COEFS' : best_shap ,
398
+ 'BEST_GLOBAL_COEFS' : best_coefs ,
399
+ 'METRICS' : best_model_frame }
391
400
392
401
393
402
def pd_ice (x_name , valid , model , resolution = 20 , bins = None ):
@@ -519,6 +528,16 @@ def plot_pd_ice(x_name, par_dep_frame, ax=None):
519
528
520
529
def hist_mean_pd_ice_plot (x_name , y_name , valid , pd_ice_dict ):
521
530
531
+ """ Plots diagnostic plot of histogram with mean line overlay
532
+ side-by-side with partial dependence and ICE.
533
+
534
+ :param x_name: Name of variable for which to plot ICE and partial dependence.
535
+ :param y_name: Name of target variable.
536
+ :param valid: Pandas validation frame.
537
+ :param pd_ice_dict: Dict of Pandas DataFrames containing partial dependence
538
+ and ICE values.
539
+ """
540
+
522
541
# initialize figure and axis
523
542
fig , (ax , ax2 ) = plt .subplots (ncols = 2 , sharey = False )
524
543
plt .tight_layout ()
@@ -561,3 +580,138 @@ def hist_mean_pd_ice_plot(x_name, y_name, valid, pd_ice_dict):
561
580
_ = ax2 .legend (bbox_to_anchor = (1.05 , 0 ),
562
581
loc = 3 ,
563
582
borderaxespad = 0. )
583
+
584
+
585
def get_confusion_matrix(valid, y_name, yhat_name, by=None, level=None, cutoff=0.5):

    """ Creates confusion matrix from pandas DataFrame of y and yhat values, can be sliced
        by a variable and level.

    :param valid: Validation DataFrame of actual (y) and predicted (yhat) values.
    :param y_name: Name of actual value column.
    :param yhat_name: Name of predicted value column.
    :param by: By variable to slice frame before creating confusion matrix, default None.
    :param level: Value of by variable to slice frame before creating confusion matrix, default None.
    :param cutoff: Cutoff threshold for confusion matrix, default 0.5.

    :return: Confusion matrix as pandas DataFrame.
    """

    # determine levels of target (y) variable
    # sort for consistency
    level_list = list(valid[y_name].unique())
    level_list.sort(reverse=True)

    # init confusion matrix
    cm_frame = pd.DataFrame(columns=['actual: ' + str(i) for i in level_list],
                            index=['predicted: ' + str(i) for i in level_list])

    # don't destroy original data
    frame_ = valid.copy(deep=True)

    # convert numeric predictions to binary decisions using cutoff
    dname = 'd_' + str(y_name)
    frame_[dname] = np.where(frame_[yhat_name] > cutoff, 1, 0)

    # slice frame
    # use logical `and` (not bitwise &) for plain boolean tests, and mask
    # with frame_ itself so the slice stays correct even if frame_'s index
    # ever diverges from valid's
    if by is not None and level is not None:
        frame_ = frame_[frame_[by] == level]

    # calculate size of each confusion matrix value
    for i, lev_i in enumerate(level_list):
        for j, lev_j in enumerate(level_list):
            cm_frame.iat[j, i] = frame_[(frame_[y_name] == lev_i) & (frame_[dname] == lev_j)].shape[0]

    return cm_frame
627
+
628
+
629
def air(cm_dict, reference, protected):

    """ Calculates the adverse impact ratio as a quotient between protected and
        reference group acceptance rates: protected_prop/reference_prop.
        Prints intermediate values. Tightly coupled to cm_dict.

    :param cm_dict: Dict of confusion matrices containing information
                    about reference and protected groups.
    :param reference: Name of reference group in cm_dict as a string.
    :param protected: Name of protected group in cm_dict as a string.
    :return: AIR value.
    """

    def _acceptance_rate(group):
        # acceptance rate = predicted 0's over all predictions for the group
        cm = cm_dict[group]
        accepted = float(cm.iat[1, 0] + cm.iat[1, 1])  # predicted 0's
        return accepted / float(cm.sum().sum())

    # reference group summary
    reference_prop = _acceptance_rate(reference)
    print(reference.title() + ' proportion accepted: %.3f' % reference_prop)

    # protected group summary
    protected_prop = _acceptance_rate(protected)
    print(protected.title() + ' proportion accepted: %.3f' % protected_prop)

    # return adverse impact ratio
    return protected_prop / reference_prop
656
+
657
+
658
def marginal_effect(cm_dict, reference, protected):

    """ Calculates the marginal effect as a percentage difference between a reference and
        a protected group: reference_percent - protected_percent. Prints intermediate values.
        Tightly coupled to cm_dict.

    :param cm_dict: Dict of confusion matrices containing information
                    about reference and protected groups.
    :param reference: Name of reference group in cm_dict as a string.
    :param protected: Name of protected group in cm_dict as a string.
    :return: Marginal effect value.
    """

    def _percent_accepted(group):
        # percentage accepted = predicted 0's over all predictions, times 100
        cm = cm_dict[group]
        accepted = float(cm.iat[1, 0] + cm.iat[1, 1])  # predicted 0's
        return 100 * (accepted / float(cm.sum().sum()))

    # reference group summary
    reference_percent = _percent_accepted(reference)
    print(reference.title() + ' accepted: %.2f%%' % reference_percent)

    # protected group summary
    protected_percent = _percent_accepted(protected)
    print(protected.title() + ' accepted: %.2f%%' % protected_percent)

    # return marginal effect
    return reference_percent - protected_percent
686
+
687
+
688
def smd(valid, x_name, yhat_name, reference, protected):

    """ Calculates standardized mean difference between a protected and reference group:
        (mean(yhat | x_j=protected) - mean(yhat | x_j=reference))/sigma(yhat).
        Prints intermediate values.

    :param valid: Pandas DataFrame containing demographic (x_name) and predicted (yhat) values.
    :param x_name: Name of demographic column containing reference and protected group labels.
    :param yhat_name: Name of predicted value column.
    :param reference: Name of reference group in x_name.
    :param protected: Name of protected group in x_name.
    :return: Standardized mean difference as a float.
    """

    # yhat mean for x_j=reference
    reference_yhat_mean = valid[valid[x_name] == reference][yhat_name].mean()
    print(reference.title() + ' mean yhat: %.2f' % reference_yhat_mean)

    # yhat mean for x_j=protected
    protected_yhat_mean = valid[valid[x_name] == protected][yhat_name].mean()
    print(protected.title() + ' mean yhat: %.2f' % protected_yhat_mean)

    # std. dev. for yhat over the whole frame (pandas default, ddof=1)
    sigma = valid[yhat_name].std()
    print(yhat_name.title() + ' std. dev.: %.2f' % sigma)

    return (protected_yhat_mean - reference_yhat_mean) / sigma
0 commit comments