visualization_helpers.py
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder


def save_and_show_plot(fig, output_dir, filename):
    """
    Saves the plot to a specified directory and shows it.

    Parameters:
        fig (matplotlib.figure.Figure): The figure to save and show.
        output_dir (str): Directory where the figure should be saved.
        filename (str): The name of the file (including extension) to save the figure as.
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure the output directory exists
    file_path = os.path.join(output_dir, filename)
    fig.savefig(file_path)
    print(f"Figure saved to {file_path}")
    plt.show()  # Display the figure


def visualize_trends(df, x_axis, hue, y_axis='FinalObjectiveValue', title='Trend Analysis',
                     output_dir=".", filename="trend_analysis.png"):
    """
    Visualizes trends in the DataFrame for better insight.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the data to be visualized.
        x_axis (str): The column to be used for the x-axis (e.g., 'loss_function' or 'optimizer').
        hue (str): The column to use for coloring the categories (e.g., 'optimizer' or 'loss_function').
        y_axis (str): The column to use for the y-axis values. Defaults to 'FinalObjectiveValue'.
        title (str): The title of the plot. Defaults to 'Trend Analysis'.
        output_dir (str): Directory to save the plot. Defaults to the current directory.
        filename (str): Name of the file to save the plot. Defaults to 'trend_analysis.png'.
    """
    plt.figure(figsize=(14, 8))
    sns.barplot(data=df, x=x_axis, y=y_axis, hue=hue, dodge=True)
    plt.suptitle(title, fontsize=16)
    plt.xlabel(x_axis.replace('_', ' ').title(), fontsize=12)
    plt.ylabel(y_axis.replace('_', ' ').title(), fontsize=12)
    plt.xticks(rotation=45, fontsize=10)
    plt.legend(title=hue.replace('_', ' ').title(), fontsize=10)
    plt.grid()

    # Save and display the plot
    save_and_show_plot(plt.gcf(), output_dir, filename)
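
# Illustrative usage sketch (not part of the original module): the column names
# below are assumptions, chosen only to mirror how visualize_phase calls this helper.
#
#     runs = pd.DataFrame({
#         "loss_function": ["contrastive", "contrastive", "triplet", "triplet"],
#         "optimizer": ["adam", "sgd", "adam", "sgd"],
#         "FinalObjectiveValue": [0.71, 0.64, 0.69, 0.66],
#     })
#     visualize_trends(runs, x_axis="loss_function", hue="optimizer")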


def plot_objective_metric_progress(df, figtitle="Objective Metric Progress", output_dir=".",
                                   filename="objective_metric_progress.png"):
    """
    Plots the progress of the objective metric over training experiments.

    Parameters:
        df (pd.DataFrame): DataFrame containing the data.
        figtitle (str): Title of the plot.
        output_dir (str): Directory to save the plot. Defaults to the current directory.
        filename (str): Name of the file to save the plot. Defaults to 'objective_metric_progress.png'.
    """
    completed_jobs = df.query("TrainingJobStatus == 'Completed'")
    objective_value_progress = completed_jobs["FinalObjectiveValue"].reset_index(drop=True)

    # Calculate the best objective value seen so far
    best_so_far = objective_value_progress.cummax()

    # Plot results
    plt.figure(figsize=(12, 6))

    # Scatter plot of all completed objective values
    plt.scatter(objective_value_progress.index, objective_value_progress, label="Objective Values")

    # Plot the best objective value so far as a solid line
    plt.plot(best_so_far.index, best_so_far, color='red', linestyle='-', label="Best Objective So Far")

    # Enhance the plot
    plt.grid()
    plt.xlabel("Experiment Index")
    plt.ylabel("Objective Metric Value")
    plt.suptitle(figtitle)
    plt.legend()

    # Save and display the plot
    save_and_show_plot(plt.gcf(), output_dir, filename)


def calculate_feature_importance(df, metric_column="FinalObjectiveValue", features=None,
                                 model_type="random_forest"):
    """
    Calculates feature importance using a tree-based model.

    Parameters:
        df (pd.DataFrame): DataFrame containing hyperparameters and their objective values.
        metric_column (str): The column name for the objective metric.
        features (list, optional): List of hyperparameters to analyze. If None, all columns except
            the metric_column and irrelevant columns will be used.
        model_type (str): Type of tree-based model to use ('random_forest'). Defaults to Random Forest.

    Returns:
        pd.DataFrame: A DataFrame containing features and their calculated importances.
    """
    # Select relevant features (exclude the metric column and irrelevant columns)
    if features is None:
        exclude_columns = [metric_column, "TrainingElapsedTimeSeconds", "TrainingJobStatus"]
        features = [col for col in df.columns if col not in exclude_columns]

    X = df[features].copy()
    y = df[metric_column]

    # Encode categorical features
    for col in X.select_dtypes(include=["object", "category"]).columns:
        X[col] = LabelEncoder().fit_transform(X[col])

    # Handle missing values
    X = X.fillna(0)          # Replace missing values in features
    y = y.fillna(y.mean())   # Replace missing target values

    # Train a tree-based model
    if model_type == "random_forest":
        model = RandomForestRegressor(n_estimators=100, random_state=42)
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")
    model.fit(X, y)

    # Extract feature importances
    importances = model.feature_importances_

    # Create a DataFrame of feature importances, normalized to percentages
    importance_df = pd.DataFrame({
        "Feature": X.columns,
        "Importance (%)": (importances / importances.sum()) * 100
    }).sort_values(by="Importance (%)", ascending=False)

    return importance_df
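
# Illustrative usage sketch (not part of the original module): the hyperparameter
# columns below are assumptions for a small synthetic tuning-results DataFrame.
#
#     results = pd.DataFrame({
#         "learning_rate": [1e-3, 1e-4, 5e-4, 1e-3],
#         "optimizer": ["adam", "sgd", "adam", "sgd"],
#         "TrainingJobStatus": ["Completed"] * 4,
#         "TrainingElapsedTimeSeconds": [120, 135, 128, 140],
#         "FinalObjectiveValue": [0.71, 0.64, 0.69, 0.66],
#     })
#     print(calculate_feature_importance(results))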


def plot_relative_importance(df,
                             metric_column="FinalObjectiveValue",
                             features=None,
                             bar_height=0.2,
                             importance_threshold=2,
                             figtitle="Relative Importance of Hyperparameters",
                             output_dir=".",
                             filename="relative_importance.png"):
    """
    Plots the relative importance of all hyperparameters based on their impact on the objective metric
    as a horizontal bar chart with values displayed on the bars.

    Parameters:
        df (pd.DataFrame): DataFrame containing hyperparameters and their objective values.
        metric_column (str): The column name for the objective metric.
        features (list, optional): List of hyperparameters to analyze. If None, all columns except
            the metric_column and other irrelevant columns will be used.
        bar_height (float, optional): Height of each bar in the plot. Defaults to 0.2.
        importance_threshold (float, optional): Features whose importance (%) falls below this value
            are grouped into a single "Others" bar. Defaults to 2.
        figtitle (str): Title of the plot.
        output_dir (str): Directory to save the plot. Defaults to the current directory.
        filename (str): Name of the file to save the plot. Defaults to 'relative_importance.png'.
    """
    # Calculate feature importance
    importance_df = calculate_feature_importance(df, metric_column, features)

    # Split into features above and below the importance threshold
    above_threshold = importance_df[importance_df["Importance (%)"] >= importance_threshold]
    below_threshold = importance_df[importance_df["Importance (%)"] < importance_threshold]

    # Add an "Others" row for the below-threshold features
    if not below_threshold.empty:
        others_importance = below_threshold["Importance (%)"].sum()
        others_row = pd.DataFrame({"Feature": ["Others"], "Importance (%)": [others_importance]})
        truncated_features = below_threshold["Feature"].tolist()
        importance_df = pd.concat([above_threshold, others_row], ignore_index=True)
    else:
        truncated_features = []

    # Dynamically adjust the figure height
    num_features = len(above_threshold)
    fig_height = max(5, num_features)  # Minimum height of 5 inches
    fig_width = 10

    # Plot relative importance as a horizontal bar plot
    plt.figure(figsize=(fig_width, fig_height))
    ax = sns.barplot(
        y="Feature",
        x="Importance (%)",
        data=importance_df,
        orient="h"
    )
    plt.suptitle(figtitle, fontsize=16)
    plt.xlabel("Relative Importance (%)")
    plt.ylabel("Hyperparameters")
    plt.grid(axis="x", linestyle="--", alpha=0.7)  # Add gridlines for the x-axis

    # Add text annotations at the end of each bar
    for container in ax.containers:  # Iterate through the bar containers
        for bar in container:
            width = bar.get_width()  # The bar width equals the importance value
            y = bar.get_y() + bar.get_height() / 2  # Center of the bar height
            ax.text(
                width,
                y,
                f"{width:.2f}%",
                ha="left",
                va="center",
                color="red",
                fontsize=13
            )

    # Add a text box listing the features grouped as "Others"
    if truncated_features:
        # Arrange the truncated feature names in two columns
        truncated_text = "\n".join([
            f"{truncated_features[i]:<20}{truncated_features[i + 1]:<20}" if i + 1 < len(truncated_features)
            else f"{truncated_features[i]}"
            for i in range(0, len(truncated_features), 2)
        ])
        # Add the text box
        plt.gcf().text(
            0.95,
            0.5,
            f"Features grouped as 'Others':\n{truncated_text}",
            ha="left",
            va="center",
            bbox=dict(boxstyle="round", facecolor="lightgrey", edgecolor="black")
        )

    # Save and display the plot
    save_and_show_plot(plt.gcf(), output_dir, filename)


def visualize_phase(df, filtered_df, phase_number, output_dir="."):
    """
    Visualizes trends and metrics for a given phase.

    Parameters:
        df (pd.DataFrame): The full DataFrame containing all data.
        filtered_df (pd.DataFrame): The filtered DataFrame containing data for the specific phase.
        phase_number (int): The phase number.
        output_dir (str): Directory to save the plots. Defaults to the current directory.
    """
    # Create a phase-specific subfolder
    phase_dir = os.path.join(output_dir, f"phase_{phase_number}")
    os.makedirs(phase_dir, exist_ok=True)

    # Fig1: Final Objective Value by Loss Function / Optimizer (only generated for phases before 3)
    if phase_number < 3:
        visualize_trends(
            filtered_df,
            x_axis='loss_function',
            hue='optimizer',
            title=f'Final Objective Value by Loss Function with Optimizer as Dimension - Phase {phase_number}',
            output_dir=phase_dir,
            filename=f"loss_function_vs_optimizer_phase_{phase_number}_dim_opt.png"
        )
        visualize_trends(
            filtered_df,
            x_axis='optimizer',
            hue='loss_function',
            title=f'Final Objective Value by Optimizer with Loss Function as Dimension - Phase {phase_number}',
            output_dir=phase_dir,
            filename=f"loss_function_vs_optimizer_phase_{phase_number}_dim_loss.png"
        )

    # Fig2: Objective Metric Progress
    plot_objective_metric_progress(
        df,
        figtitle=f"Objective Metric Progress - Phase {phase_number}",
        output_dir=phase_dir,
        filename=f"objective_metric_progress_phase_{phase_number}.png"
    )

    # Fig3: Variable Importance
    plot_relative_importance(
        df=filtered_df,
        figtitle=f"Relative Importance of Hyperparameters - Phase {phase_number}",
        output_dir=phase_dir,
        filename=f"relative_importance_phase_{phase_number}.png"
    )
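
# Illustrative usage sketch (not part of the original module): the 'phase' column
# and the output directory name are assumptions about how results might be organised.
#
#     phase_1_runs = runs[runs["phase"] == 1]
#     visualize_phase(runs, phase_1_runs, phase_number=1, output_dir="plots")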


def plot_metrics_with_best_epoch(
    metric_dataframes,
    metrics_to_plot=["ValidationTxtR1", "ValidationImgR1", "ValidationZS1"],
    objective_metric="ObjectiveValue",
    best_epoch_line=True,
    figsize=(15, 5)
):
    """
    Plot multiple metrics with an indication of the best epoch.

    Parameters:
        metric_dataframes (dict): Dictionary of DataFrames for each metric.
        metrics_to_plot (list): List of metrics to plot.
        objective_metric (str): Name of the objective metric.
        best_epoch_line (bool): Whether to add vertical and horizontal lines for the best epoch.
        figsize (tuple): Size of the figure.
    """
    plt.figure(figsize=figsize)

    # Plot each metric with dashed lines
    for metric in metrics_to_plot:
        plt.plot(
            metric_dataframes[metric].index,
            metric_dataframes[metric]['value'],
            label=metric,
            linestyle='--'
        )

    # Plot the objective metric with a solid line
    plt.plot(
        metric_dataframes[objective_metric].index,
        metric_dataframes[objective_metric]['value'],
        label=f"{objective_metric} (Average)",
        linewidth=2
    )

    # Add vertical and horizontal lines for the best epoch
    if best_epoch_line:
        # Find the best epoch based on the maximum value of the objective metric
        best_epoch_idx = metric_dataframes[objective_metric]['value'].idxmax()
        best_epoch_value = metric_dataframes[objective_metric].loc[best_epoch_idx, 'value']

        # Vertical line at the best epoch
        plt.axvline(
            x=best_epoch_idx,
            color='red',
            linestyle='-.',
            label=f'Best Epoch: {best_epoch_idx}'
        )
        # Horizontal line at the best objective value
        plt.axhline(
            y=best_epoch_value,
            color='green',
            linestyle='-.',
            label=f'Best Value: {best_epoch_value:.2f}'
        )

    # Add labels, title, and grid
    plt.xlabel('Epoch')
    plt.ylabel('Metric Value')
    plt.title('Validation Metrics During Training')
    plt.grid(True)

    # Place the legend outside the plot area
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

    # Adjust layout to make space for the legend
    plt.tight_layout()

    # Show the plot
    plt.show()
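

# -----------------------------------------------------------------------------
# Illustrative demo (not part of the original helpers): running the module
# directly plots synthetic validation curves with plot_metrics_with_best_epoch.
# The metric names and values are assumptions chosen only to match the expected
# input shape: a dict of per-metric DataFrames indexed by epoch with a single
# 'value' column.
# -----------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy as np

    epochs = range(1, 11)
    rng = np.random.default_rng(0)

    # Synthetic, slowly improving metrics with a little noise (assumed values).
    metric_dataframes = {
        name: pd.DataFrame(
            {"value": np.linspace(start, start + 0.3, 10) + rng.normal(0, 0.01, 10)},
            index=epochs,
        )
        for name, start in [
            ("ValidationTxtR1", 0.40),
            ("ValidationImgR1", 0.35),
            ("ValidationZS1", 0.30),
            ("ObjectiveValue", 0.35),
        ]
    }

    plot_metrics_with_best_epoch(metric_dataframes)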