
Commit 8dc2efb
Update hyperparameter_importance.py
1 parent b9b1313

1 file changed: +45 -38 lines changed

publications/2023-neurips/experiments/surf/snellius/analysis/hyperparameter_importance.py (+45 -38)
@@ -16,13 +16,13 @@
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument('--n_trees', type=int, default=16)
-    parser.add_argument('--openml_ids', type=int, nargs='+', default=[3])
+    parser.add_argument('--openml_ids', type=int, nargs='+', default=[3, 6])
     parser.add_argument('--workflow_name', type=str, default="lcdb.workflow.sklearn.LibLinearWorkflow")
     parser.add_argument('--openml_taskid_name', type=str, default="m:openmlid")
     parser.add_argument('--output_directory', type=str, default=os.path.expanduser('~/experiments/lcdb'))
     parser.add_argument('--output_filetype', type=str, choices=['pdf', 'png'], default='png')
     parser.add_argument('--max_load', type=int, default=None)
-    parser.add_argument('--anchor_value', type=int, default=2048)
+    parser.add_argument('--anchor_values', type=int, nargs='+', default=[128, 512, 2048, -1])
     return parser.parse_args()
 
 
@@ -44,7 +44,7 @@ def numeric_encode(df, config_space):
     return result
 
 
-def fanova_on_task(task_results, performance_column_name, config_space, n_trees):
+def fanova_on_task(task_results, performance_column_name, current_anchor_value, config_space, n_trees):
     fanova_results = []
 
     evaluator = fanova.fanova.fANOVA(
@@ -61,14 +61,15 @@ def fanova_on_task(task_results, performance_column_name, config_space, n_trees)
 
         fanova_results.append({
             "hyperparameter": pname,
-            "fanova": importance[(idx,)]["individual importance"],
+            "anchor": current_anchor_value,
+            "variance_contribution": importance[(idx,)]["individual importance"],
         })
     return fanova_results
 
 
 def run(args):
     fanova_all_results = []
-    performance_column = "objective"
+    performance_column = "final_objective"  # make sure to give this a unique name (not same as the "objective" field)
     anchor_size_column = "anchor_sizes"
     learning_curve_column = "learning_curve_data"
 
@@ -95,50 +96,54 @@ def run(args):
         # job_ids = frame_workflow_job_task['job_id'].unique()
         if len(workflow_ids) > 1 or len(openml_task_ids) > 1:
             raise ValueError('Should not happen. %s %s' % (str(workflow_ids), str(openml_task_ids)))
-        if (workflow_ids[0], openml_task_ids[0]) not in id_results:
-            id_results[(workflow_ids[0], openml_task_ids[0])] = list()
-
-        performance_values_new = list()
-        for index, row in frame_workflow_job_task.iterrows():
-            anchor_sizes = row[anchor_size_column]
-            performance_value_at_anchor = np.nan
-            if args.anchor_value is not None:
-                if args.anchor_value not in anchor_sizes:
-                    logging.warning('Anchor %d not available in task %d workflow %s'
-                                    % (args.anchor_value, openml_task_ids[0], workflow_ids[0])
-                                    )
+
+        for current_anchor_value in args.anchor_values:
+            if (workflow_ids[0], openml_task_ids[0], current_anchor_value) not in id_results:
+                id_results[(workflow_ids[0], openml_task_ids[0], current_anchor_value)] = list()
+
+            performance_values_new = list()
+            for index, row in frame_workflow_job_task.iterrows():
+                anchor_sizes = row[anchor_size_column]
+                performance_value_at_anchor = np.nan
+                if current_anchor_value != -1:
+                    if current_anchor_value not in anchor_sizes:
+                        logging.warning('Anchor %d not available in task %d workflow %s'
+                                        % (current_anchor_value, openml_task_ids[0], workflow_ids[0])
+                                        )
+                    else:
+                        anchor_index = anchor_sizes.index(current_anchor_value)
+                        performance_value_at_anchor = row[learning_curve_column][anchor_index]
                 else:
-                    anchor_index = anchor_sizes.index(args.anchor_value)
-                    performance_value_at_anchor = row[learning_curve_column][anchor_index]
-            else:
-                performance_value_at_anchor = row[learning_curve_column][-1]
-            performance_values_new.append(performance_value_at_anchor)
-        performance_values_new = np.array(performance_values_new, dtype=float)
-        frame_workflow_job_task[performance_column] = pd.Series(performance_values_new)
+                    performance_value_at_anchor = row[learning_curve_column][-1]
+                performance_values_new.append(performance_value_at_anchor)
+            performance_values_new = np.array(performance_values_new, dtype=float)
+            frame_workflow_job_task[performance_column] = pd.Series(performance_values_new)
 
-        id_results[(workflow_ids[0], openml_task_ids[0])].append(frame_workflow_job_task)
+            id_results[(workflow_ids[0], openml_task_ids[0], current_anchor_value)].append(frame_workflow_job_task)
 
-        load_count += 1
-        if args.max_load and load_count >= args.max_load:
-            break
+            load_count += 1
+            if args.max_load and load_count >= args.max_load:
+                break
 
     task_ids = set()
-    for idx, (workflow_name, task_id) in enumerate(id_results):
+    for idx, (workflow_name, task_id, current_anchor_value) in enumerate(id_results):
         task_ids.add(task_id)
-        task_results = pd.concat(id_results[(workflow_name, task_id)])
+        task_results = pd.concat(id_results[(workflow_name, task_id, current_anchor_value)])
         task_results = task_results.rename(workflow_hyperparameter_mapping, axis=1)
         relevant_columns = list(workflow_hyperparameter_mapping.values()) + [performance_column]
         task_results = task_results[relevant_columns]
 
-        logging.info("Starting with task %d (%d/%d)" % (task_id, idx + 1, len(id_results)))
-        fanova_task_results = fanova_on_task(task_results, performance_column, config_space, args.n_trees)
+        logging.info("Starting with task %d anchor %d (%d/%d)" % (task_id, current_anchor_value, idx + 1, len(id_results)))
+        fanova_task_results = fanova_on_task(
+            task_results, performance_column, current_anchor_value, config_space, args.n_trees
+        )
         fanova_all_results.extend(fanova_task_results)
 
     fanova_all_results = pd.DataFrame(fanova_all_results)
 
     # generate plot
     fig, ax = plt.subplots(figsize=(16, 9))
-    sns.boxplot(x="hyperparameter", y="fanova", data=fanova_all_results, ax=ax)
+    sns.boxplot(x="hyperparameter", y="variance_contribution", hue="anchor", data=fanova_all_results, ax=ax)
     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
     ax.set_ylabel("Variance Contribution")
     ax.set_xlabel(None)
@@ -147,12 +152,14 @@ def run(args):
 
     # save plot to file
     filename_suffix = ""
-    if args.anchor_value is not None:
-        filename_suffix = "_anchor_%d" % args.anchor_value
-    output_file = args.output_directory + '/fanova_%s%s.%s' % (args.workflow_name, filename_suffix, args.output_filetype)
+    if args.anchor_values is not None:
+        filename_suffix = "_anchor_%s" % str(args.anchor_values)
+    output_file_base = args.output_directory + '/fanova_%s%s' % (args.workflow_name, filename_suffix)
     os.makedirs(args.output_directory, exist_ok=True)
-    plt.savefig(output_file)
-    logging.info('saved to %s' % output_file)
+    fanova_all_results.to_csv(output_file_base + '.csv')
+    plt.savefig(output_file_base + '.' + args.output_filetype)
+    logging.info('saved plot to %s.%s' % (output_file_base, args.output_filetype))
+    logging.info('saved csv to %s.csv' % output_file_base)
 
 
 if __name__ == '__main__':
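Note on the behavioural change, for readers skimming the diff: the single --anchor_value argument becomes a list --anchor_values, and the sentinel -1 now selects the last point of each learning curve. Below is a minimal, self-contained sketch of the per-row lookup the new loop performs; the helper name and the sample data are illustrative and not taken from the script.

import numpy as np

def performance_at_anchor(anchor_sizes, learning_curve, anchor_value):
    # -1 means "use the last observed point of the learning curve"
    if anchor_value == -1:
        return learning_curve[-1]
    # a missing anchor yields NaN; the script additionally logs a warning
    if anchor_value not in anchor_sizes:
        return np.nan
    return learning_curve[anchor_sizes.index(anchor_value)]

anchor_sizes = [128, 512, 2048]
learning_curve = [0.31, 0.24, 0.19]
print([performance_at_anchor(anchor_sizes, learning_curve, a) for a in [128, 512, 2048, -1]])
# [0.31, 0.24, 0.19, 0.19]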

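Each record emitted by fanova_on_task now carries the anchor alongside the renamed variance_contribution field, which is what lets the final plot group boxes per anchor via hue. A small sketch of that layout and the resulting call, using synthetic records (the hyperparameter names and numbers are made up for illustration):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

records = [
    {"hyperparameter": "C", "anchor": 128, "variance_contribution": 0.42},
    {"hyperparameter": "C", "anchor": 2048, "variance_contribution": 0.35},
    {"hyperparameter": "tol", "anchor": 128, "variance_contribution": 0.08},
    {"hyperparameter": "tol", "anchor": 2048, "variance_contribution": 0.12},
]
df = pd.DataFrame(records)

fig, ax = plt.subplots(figsize=(16, 9))
sns.boxplot(x="hyperparameter", y="variance_contribution", hue="anchor", data=df, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_ylabel("Variance Contribution")
fig.savefig("fanova_example.png")

One box is drawn per (hyperparameter, anchor) pair; with a single value per pair, as here, each box collapses to a line, but with many configurations per task the spread becomes visible.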
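For context, the "individual importance" value that ends up in the variance_contribution column comes from querying the fANOVA evaluator per hyperparameter index. A toy end-to-end sketch, assuming the automl/fanova package and the quantify_importance access pattern used above; the ConfigSpace, data, and hyperparameter name here are illustrative, not taken from the script.

import numpy as np
import ConfigSpace
import fanova.fanova

config_space = ConfigSpace.ConfigurationSpace()
config_space.add_hyperparameter(ConfigSpace.UniformFloatHyperparameter("C", 0.01, 10.0))

X = np.random.uniform(0.01, 10.0, size=(100, 1))  # numerically encoded configurations
Y = np.random.rand(100)                           # e.g. the "final_objective" values

evaluator = fanova.fanova.fANOVA(X=X, Y=Y, config_space=config_space, n_trees=16)
importance = evaluator.quantify_importance((0,))
print(importance[(0,)]["individual importance"])  # the value stored per hyperparameter and anchor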