16
16
def parse_args():
    """Parse command-line options for the fANOVA hyperparameter-importance script.

    Returns:
        argparse.Namespace with the parsed options:
        - n_trees: number of trees for the fANOVA random forest.
        - openml_ids: OpenML task ids to analyze.
        - workflow_name: fully-qualified LCDB workflow class name.
        - openml_taskid_name: column holding the OpenML task id.
        - output_directory: where plots/CSVs are written.
        - output_filetype: plot format, 'pdf' or 'png'.
        - max_load: cap on the number of result frames to load (None = all).
        - anchor_values: learning-curve anchors to evaluate; -1 selects the
          last available anchor of each curve.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--n_trees', type=int, default=16)
    parser.add_argument('--openml_ids', type=int, nargs='+', default=[3, 6])
    parser.add_argument('--workflow_name', type=str,
                        default="lcdb.workflow.sklearn.LibLinearWorkflow")
    parser.add_argument('--openml_taskid_name', type=str, default="m:openmlid")
    parser.add_argument('--output_directory', type=str,
                        default=os.path.expanduser('~/experiments/lcdb'))
    parser.add_argument('--output_filetype', type=str,
                        choices=['pdf', 'png'], default='png')
    parser.add_argument('--max_load', type=int, default=None)
    # -1 is a sentinel meaning "use the final value of each learning curve".
    parser.add_argument('--anchor_values', type=int, nargs='+',
                        default=[128, 512, 2048, -1])
    return parser.parse_args()
27
27
28
28
@@ -44,7 +44,7 @@ def numeric_encode(df, config_space):
44
44
return result
45
45
46
46
47
- def fanova_on_task (task_results , performance_column_name , config_space , n_trees ):
47
+ def fanova_on_task (task_results , performance_column_name , current_anchor_value , config_space , n_trees ):
48
48
fanova_results = []
49
49
50
50
evaluator = fanova .fanova .fANOVA (
@@ -61,14 +61,15 @@ def fanova_on_task(task_results, performance_column_name, config_space, n_trees)
61
61
62
62
fanova_results .append ({
63
63
"hyperparameter" : pname ,
64
- "fanova" : importance [(idx ,)]["individual importance" ],
64
+ "anchor" : current_anchor_value ,
65
+ "variance_contribution" : importance [(idx ,)]["individual importance" ],
65
66
})
66
67
return fanova_results
67
68
68
69
69
70
def run (args ):
70
71
fanova_all_results = []
71
- performance_column = "objective"
72
+ performance_column = "final_objective" # make sure to give this a unique name (not same as the " objective" field)
72
73
anchor_size_column = "anchor_sizes"
73
74
learning_curve_column = "learning_curve_data"
74
75
@@ -95,50 +96,54 @@ def run(args):
95
96
# job_ids = frame_workflow_job_task['job_id'].unique()
96
97
if len (workflow_ids ) > 1 or len (openml_task_ids ) > 1 :
97
98
raise ValueError ('Should not happen. %s %s' % (str (workflow_ids ), str (openml_task_ids )))
98
- if (workflow_ids [0 ], openml_task_ids [0 ]) not in id_results :
99
- id_results [(workflow_ids [0 ], openml_task_ids [0 ])] = list ()
100
-
101
- performance_values_new = list ()
102
- for index , row in frame_workflow_job_task .iterrows ():
103
- anchor_sizes = row [anchor_size_column ]
104
- performance_value_at_anchor = np .nan
105
- if args .anchor_value is not None :
106
- if args .anchor_value not in anchor_sizes :
107
- logging .warning ('Anchor %d not available in task %d workflow %s'
108
- % (args .anchor_value , openml_task_ids [0 ], workflow_ids [0 ])
109
- )
99
+
100
+ for current_anchor_value in args .anchor_values :
101
+ if (workflow_ids [0 ], openml_task_ids [0 ], current_anchor_value ) not in id_results :
102
+ id_results [(workflow_ids [0 ], openml_task_ids [0 ], current_anchor_value )] = list ()
103
+
104
+ performance_values_new = list ()
105
+ for index , row in frame_workflow_job_task .iterrows ():
106
+ anchor_sizes = row [anchor_size_column ]
107
+ performance_value_at_anchor = np .nan
108
+ if current_anchor_value != - 1 :
109
+ if current_anchor_value not in anchor_sizes :
110
+ logging .warning ('Anchor %d not available in task %d workflow %s'
111
+ % (current_anchor_value , openml_task_ids [0 ], workflow_ids [0 ])
112
+ )
113
+ else :
114
+ anchor_index = anchor_sizes .index (current_anchor_value )
115
+ performance_value_at_anchor = row [learning_curve_column ][anchor_index ]
110
116
else :
111
- anchor_index = anchor_sizes .index (args .anchor_value )
112
- performance_value_at_anchor = row [learning_curve_column ][anchor_index ]
113
- else :
114
- performance_value_at_anchor = row [learning_curve_column ][- 1 ]
115
- performance_values_new .append (performance_value_at_anchor )
116
- performance_values_new = np .array (performance_values_new , dtype = float )
117
- frame_workflow_job_task [performance_column ] = pd .Series (performance_values_new )
117
+ performance_value_at_anchor = row [learning_curve_column ][- 1 ]
118
+ performance_values_new .append (performance_value_at_anchor )
119
+ performance_values_new = np .array (performance_values_new , dtype = float )
120
+ frame_workflow_job_task [performance_column ] = pd .Series (performance_values_new )
118
121
119
- id_results [(workflow_ids [0 ], openml_task_ids [0 ])].append (frame_workflow_job_task )
122
+ id_results [(workflow_ids [0 ], openml_task_ids [0 ], current_anchor_value )].append (frame_workflow_job_task )
120
123
121
- load_count += 1
122
- if args .max_load and load_count >= args .max_load :
123
- break
124
+ load_count += 1
125
+ if args .max_load and load_count >= args .max_load :
126
+ break
124
127
125
128
task_ids = set ()
126
- for idx , (workflow_name , task_id ) in enumerate (id_results ):
129
+ for idx , (workflow_name , task_id , current_anchor_value ) in enumerate (id_results ):
127
130
task_ids .add (task_id )
128
- task_results = pd .concat (id_results [(workflow_name , task_id )])
131
+ task_results = pd .concat (id_results [(workflow_name , task_id , current_anchor_value )])
129
132
task_results = task_results .rename (workflow_hyperparameter_mapping , axis = 1 )
130
133
relevant_columns = list (workflow_hyperparameter_mapping .values ()) + [performance_column ]
131
134
task_results = task_results [relevant_columns ]
132
135
133
- logging .info ("Starting with task %d (%d/%d)" % (task_id , idx + 1 , len (id_results )))
134
- fanova_task_results = fanova_on_task (task_results , performance_column , config_space , args .n_trees )
136
+ logging .info ("Starting with task %d anchor %d (%d/%d)" % (task_id , current_anchor_value , idx + 1 , len (id_results )))
137
+ fanova_task_results = fanova_on_task (
138
+ task_results , performance_column , current_anchor_value , config_space , args .n_trees
139
+ )
135
140
fanova_all_results .extend (fanova_task_results )
136
141
137
142
fanova_all_results = pd .DataFrame (fanova_all_results )
138
143
139
144
# generate plot
140
145
fig , ax = plt .subplots (figsize = (16 , 9 ))
141
- sns .boxplot (x = "hyperparameter" , y = "fanova " , data = fanova_all_results , ax = ax )
146
+ sns .boxplot (x = "hyperparameter" , y = "variance_contribution" , hue = "anchor " , data = fanova_all_results , ax = ax )
142
147
ax .set_xticklabels (ax .get_xticklabels (), rotation = 45 , ha = "right" )
143
148
ax .set_ylabel ("Variance Contribution" )
144
149
ax .set_xlabel (None )
@@ -147,12 +152,14 @@ def run(args):
147
152
148
153
# save plot to file
149
154
filename_suffix = ""
150
- if args .anchor_value is not None :
151
- filename_suffix = "_anchor_%d " % args .anchor_value
152
- output_file = args .output_directory + '/fanova_%s%s.%s ' % (args .workflow_name , filename_suffix , args . output_filetype )
155
+ if args .anchor_values is not None :
156
+ filename_suffix = "_anchor_%s " % str ( args .anchor_values )
157
+ output_file_base = args .output_directory + '/fanova_%s%s' % (args .workflow_name , filename_suffix )
153
158
os .makedirs (args .output_directory , exist_ok = True )
154
- plt .savefig (output_file )
155
- logging .info ('saved to %s' % output_file )
159
+ fanova_all_results .to_csv (output_file_base + '.csv' )
160
+ plt .savefig (output_file_base + '.' + args .output_filetype )
161
+ logging .info ('saved plot to %s.%s' % (output_file_base , args .output_filetype ))
162
+ logging .info ('saved csv to %s.csv' % output_file_base )
156
163
157
164
158
165
if __name__ == '__main__' :
0 commit comments