Skip to content

Commit 00f5302

Browse files
committed
[BE] dump compile trace to CI output for debugging, and reduce CI workload
ghstack-source-id: fe1076b
Pull Request resolved: #739
1 parent 5ce8a0c commit 00f5302

File tree

2 files changed: 9 additions and 12 deletions.

test_runner.py

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,10 @@ def build_test_list():
 47  47       integration_tests_flavors["debug_model.toml"] = [
 48  48           OverrideDefinitions(
 49  49               [
 50     -                 [],
     50 +                 [
     51 +                     "--profiling.enable_profiling",
     52 +                     "--metrics.enable_tensorboard",
     53 +                 ],
 51  54               ],
 52  55               "default",
 53  56               "default",
@@ -138,7 +141,6 @@ def build_test_list():
138 141           OverrideDefinitions(
139 142               [
140 143                   [
141     -                     "--checkpoint.enable_checkpoint",
142 144                       "--experimental.pipeline_parallel_degree 4",
143 145                       "--experimental.pipeline_parallel_schedule InterleavedZeroBubble",
144 146                   ],
@@ -150,7 +152,6 @@ def build_test_list():
150 152           OverrideDefinitions(
151 153               [
152 154                   [
153     -                     "--checkpoint.enable_checkpoint",
154 155                       "--experimental.pipeline_parallel_degree 2",
155 156                       "--experimental.pipeline_parallel_schedule 1F1B",
156 157                       "--training.data_parallel_shard_degree 1",
@@ -163,7 +164,6 @@ def build_test_list():
163 164           OverrideDefinitions(
164 165               [
165 166                   [
166     -                     "--checkpoint.enable_checkpoint",
167 167                       "--experimental.pipeline_parallel_degree 2",
168 168                       "--experimental.pipeline_parallel_schedule GPipe",
169 169                       "--training.data_parallel_shard_degree 1",
@@ -176,7 +176,6 @@ def build_test_list():
176 176           OverrideDefinitions(
177 177               [
178 178                   [
179     -                     "--checkpoint.enable_checkpoint",
180 179                       "--experimental.pipeline_parallel_degree 2",
181 180                       "--experimental.pipeline_parallel_schedule 1F1B",
182 181                       "--training.data_parallel_shard_degree 2",
@@ -188,7 +187,6 @@ def build_test_list():
188 187           OverrideDefinitions(
189 188               [
190 189                   [
191     -                     "--checkpoint.enable_checkpoint",
192 190                       "--experimental.pipeline_parallel_degree 2",
193 191                       "--experimental.pipeline_parallel_schedule GPipe",
194 192                       "--training.data_parallel_shard_degree 2",
@@ -200,7 +198,6 @@ def build_test_list():
200 198           OverrideDefinitions(
201 199               [
202 200                   [
203     -                     "--checkpoint.enable_checkpoint",
204 201                       "--experimental.pipeline_parallel_degree 2",
205 202                       "--training.tensor_parallel_degree 2",
206 203                   ],
@@ -244,7 +241,6 @@ def build_test_list():
244 241           OverrideDefinitions(
245 242               [
246 243                   [
247     -                     "--checkpoint.enable_checkpoint",
248 244                       "--experimental.pipeline_parallel_degree 4",
249 245                       "--experimental.pipeline_parallel_schedule Interleaved1F1B",
250 246                   ],
@@ -256,7 +252,6 @@ def build_test_list():
256 252           OverrideDefinitions(
257 253               [
258 254                   [
259     -                     "--checkpoint.enable_checkpoint",
260 255                       "--experimental.pipeline_parallel_degree 2",
261 256                       "--experimental.pipeline_parallel_schedule PipelineScheduleMulti",
262 257                       "--experimental.pipeline_parallel_schedule_csv ./test/assets/custom_schedule.csv",
@@ -413,6 +408,8 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
413 408
414 409       for override_arg in test_flavor.override_args:
415 410           cmd = f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} ./run_llama_train.sh"
    411 +         # dump compile trace for debugging purpose
    412 +         cmd = f'TORCH_TRACE="{output_dir}/{test_name}/compile_trace" ' + cmd
416 413           if test_name == "fsdp2_memory_estimation":
417 414               cmd = (
418 415                   f"CONFIG_FILE={full_path} NGPU={test_flavor.ngpu} LOG_RANK={all_ranks} "

train_configs/debug_model.toml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ description = "Llama 3 debug training"
  6   6   use_for_integration_test = true
  7   7
  8   8   [profiling]
  9     - enable_profiling = true
      9 + enable_profiling = false
 10  10   save_traces_folder = "profile_trace"
 11  11   profile_freq = 10
 12  12   enable_memory_snapshot = false
@@ -15,7 +15,7 @@ save_memory_snapshot_folder = "memory_snapshot"
 15  15   [metrics]
 16  16   log_freq = 1
 17  17   enable_color_printing = true
 18     - enable_tensorboard = true
     18 + enable_tensorboard = false
 19  19   save_tb_folder = "tb"
 20  20   enable_wandb = false
 21  21
@@ -51,7 +51,7 @@ enable_async_tensor_parallel = false
 51  51   enable_checkpoint = false
 52  52   folder = "checkpoint"
 53  53   interval_type = "steps"
 54     - interval = 5
     54 + interval = 10
 55  55   model_weights_only = false
 56  56   export_dtype = "float32"
 57  57   async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]

0 commit comments

Comments
 (0)