@@ -47,7 +47,10 @@ def build_test_list():
47
47
integration_tests_flavors ["debug_model.toml" ] = [
48
48
OverrideDefinitions (
49
49
[
50
- [],
50
+ [
51
+ "--profiling.enable_profiling" ,
52
+ "--metrics.enable_tensorboard" ,
53
+ ],
51
54
],
52
55
"default" ,
53
56
"default" ,
@@ -138,7 +141,6 @@ def build_test_list():
138
141
OverrideDefinitions (
139
142
[
140
143
[
141
- "--checkpoint.enable_checkpoint" ,
142
144
"--experimental.pipeline_parallel_degree 4" ,
143
145
"--experimental.pipeline_parallel_schedule InterleavedZeroBubble" ,
144
146
],
@@ -150,7 +152,6 @@ def build_test_list():
150
152
OverrideDefinitions (
151
153
[
152
154
[
153
- "--checkpoint.enable_checkpoint" ,
154
155
"--experimental.pipeline_parallel_degree 2" ,
155
156
"--experimental.pipeline_parallel_schedule 1F1B" ,
156
157
"--training.data_parallel_shard_degree 1" ,
@@ -163,7 +164,6 @@ def build_test_list():
163
164
OverrideDefinitions (
164
165
[
165
166
[
166
- "--checkpoint.enable_checkpoint" ,
167
167
"--experimental.pipeline_parallel_degree 2" ,
168
168
"--experimental.pipeline_parallel_schedule GPipe" ,
169
169
"--training.data_parallel_shard_degree 1" ,
@@ -176,7 +176,6 @@ def build_test_list():
176
176
OverrideDefinitions (
177
177
[
178
178
[
179
- "--checkpoint.enable_checkpoint" ,
180
179
"--experimental.pipeline_parallel_degree 2" ,
181
180
"--experimental.pipeline_parallel_schedule 1F1B" ,
182
181
"--training.data_parallel_shard_degree 2" ,
@@ -188,7 +187,6 @@ def build_test_list():
188
187
OverrideDefinitions (
189
188
[
190
189
[
191
- "--checkpoint.enable_checkpoint" ,
192
190
"--experimental.pipeline_parallel_degree 2" ,
193
191
"--experimental.pipeline_parallel_schedule GPipe" ,
194
192
"--training.data_parallel_shard_degree 2" ,
@@ -200,7 +198,6 @@ def build_test_list():
200
198
OverrideDefinitions (
201
199
[
202
200
[
203
- "--checkpoint.enable_checkpoint" ,
204
201
"--experimental.pipeline_parallel_degree 2" ,
205
202
"--training.tensor_parallel_degree 2" ,
206
203
],
@@ -244,7 +241,6 @@ def build_test_list():
244
241
OverrideDefinitions (
245
242
[
246
243
[
247
- "--checkpoint.enable_checkpoint" ,
248
244
"--experimental.pipeline_parallel_degree 4" ,
249
245
"--experimental.pipeline_parallel_schedule Interleaved1F1B" ,
250
246
],
@@ -256,7 +252,6 @@ def build_test_list():
256
252
OverrideDefinitions (
257
253
[
258
254
[
259
- "--checkpoint.enable_checkpoint" ,
260
255
"--experimental.pipeline_parallel_degree 2" ,
261
256
"--experimental.pipeline_parallel_schedule PipelineScheduleMulti" ,
262
257
"--experimental.pipeline_parallel_schedule_csv ./test/assets/custom_schedule.csv" ,
@@ -413,6 +408,8 @@ def run_test(test_flavor: OverrideDefinitions, full_path: str, output_dir: str):
413
408
414
409
for override_arg in test_flavor .override_args :
415
410
cmd = f"CONFIG_FILE={ full_path } NGPU={ test_flavor .ngpu } LOG_RANK={ all_ranks } ./run_llama_train.sh"
411
+ # dump compile trace for debugging purposes
412
+ cmd = f'TORCH_TRACE="{ output_dir } /{ test_name } /compile_trace" ' + cmd
416
413
if test_name == "fsdp2_memory_estimation" :
417
414
cmd = (
418
415
f"CONFIG_FILE={ full_path } NGPU={ test_flavor .ngpu } LOG_RANK={ all_ranks } "
0 commit comments