@@ -165,7 +165,7 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
165
165
# init loop var
166
166
selected = orig_x_names
167
167
168
- for j , name in enumerate ( next_list ):
168
+ for j in range ( 0 , len ( next_list ) + 1 ):
169
169
170
170
# init or clear local dict of monotone constraints
171
171
mc = None
@@ -190,6 +190,10 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
190
190
hvalid = h2o .H2OFrame (valid [selected + [y_name ]])
191
191
192
192
# train model and calculate Shapley values
193
+ print ('Starting grid search %i/%i ...' % (j + 1 , len (next_list )+ 1 ))
194
+ print ('Input features =' , selected )
195
+ if mc is not None :
196
+ print ('Monotone constraints =' , mc )
193
197
model_list .append (gbm_grid (selected , y_name , htrain , hvalid , seed_ ,
194
198
monotone_constraints_ = mc , hyper_params_ = hyper_params_ ,
195
199
search_criteria_ = search_criteria_ ))
@@ -203,11 +207,13 @@ def gbm_forward_select_train(orig_x_names, y_name, train, valid, seed_, next_lis
203
207
204
208
# retrieve AUC and update progress
205
209
auc_ = model_list [j ].auc (valid = True )
206
- print ('Completed grid search %i/%i with AUC: %.2f ...' % (j + 1 , len (next_list ), auc_ ))
210
+ print ('Completed grid search %i/%i with AUC: %.2f ...' % (j + 1 , len (next_list )+ 1 , auc_ ))
211
+ print ('--------------------------------------------------------------------------------' )
207
212
208
213
# add the next most y-correlated feature
209
214
# for the next modeling iteration
210
- selected = selected + [next_list [j ]]
215
+ if j < len (next_list ):
216
+ selected = selected + [next_list [j ]]
211
217
212
218
print ('Done.' )
213
219
@@ -283,7 +289,8 @@ def cv_model_rank(valid, seed_, model_name_list, nfolds=5):
283
289
# dynamically generate and run code statements
284
290
# to calculate metrics for each fold and model
285
291
for model in sorted (model_name_list ):
286
- code = 'h2o.get_model("%s").model_performance(h2o.H2OFrame(temp_df[temp_df["fold"] == %d])).%s()' % (model , fold , metric )
292
+ code = 'h2o.get_model("%s").model_performance(h2o.H2OFrame(temp_df[temp_df["fold"] == %d])).%s()' \
293
+ % (model , fold , metric )
287
294
key_ = model + ' Value'
288
295
val_ = eval (code )
289
296
@@ -343,6 +350,7 @@ def cv_model_rank_select(valid, seed_, coef_list, model_list, model_prefix,
343
350
344
351
best_idx = 0
345
352
rank = len (compare_model_ids ) + 1
353
+ best_model_frame = None
346
354
347
355
for i in range (0 , len (model_list )):
348
356
@@ -453,9 +461,7 @@ def get_percentile_dict(yhat_name, valid, id_):
453
461
sort_df .reset_index (inplace = True )
454
462
455
463
# find top and bottom percentiles
456
- percentiles_dict = {}
457
- percentiles_dict [0 ] = sort_df .loc [0 , id_ ]
458
- percentiles_dict [99 ] = sort_df .loc [sort_df .shape [0 ] - 1 , id_ ]
464
+ percentiles_dict = {0 : sort_df .loc [0 , id_ ], 99 : sort_df .loc [sort_df .shape [0 ] - 1 , id_ ]}
459
465
460
466
# find 10th-90th percentiles
461
467
inc = sort_df .shape [0 ] // 10
@@ -498,9 +504,9 @@ def plot_pd_ice(x_name, par_dep_frame, ax=None):
498
504
else :
499
505
500
506
# plot ICE
501
- par_dep_frame .plot (x = x_name ,
502
- colormap = 'gnuplot' ,
503
- ax = ax )
507
+ par_dep_frame .drop ( 'partial_dependence' , axis = 1 ). plot (x = x_name ,
508
+ colormap = 'gnuplot' ,
509
+ ax = ax )
504
510
505
511
# overlay partial dependence, annotate plot
506
512
par_dep_frame .plot (title = 'Partial Dependence with ICE: ' + x_name ,
0 commit comments