Commit 9b37408
add torch.compile + FSDP2 float8 all-gather in CI (#468)
fixed my bug in float8_experimental (meta-pytorch/float8_experimental#321). now we can torch.compile transformer blocks with FSDP float8 all-gather.
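For context, a minimal sketch of the pattern being exercised here, assuming a torchtitan-style model where `model.layers` holds the transformer blocks (the float8 linear swap itself is done by float8_experimental beforehand and is not shown):

```python
# Sketch, not the actual torchtitan code: compile each transformer block
# individually, then apply FSDP2 (fully_shard) per block so the float8
# all-gather operates on that compiled block's parameters.
import torch
from torch.distributed._composable.fsdp import fully_shard

def apply_compile_and_fsdp2(model: torch.nn.Module) -> torch.nn.Module:
    for name, block in list(model.layers.named_children()):
        block = torch.compile(block)   # one torch.compile region per block
        fully_shard(block)             # FSDP2 shards this block; float8 all-gather at use time
        model.layers.register_module(name, block)
    fully_shard(model)                 # root wrap for embeddings / norm / output
    return model
```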
local test:

```bash
CONFIG_FILE="./train_configs/debug_model.toml" ./run_llama_train.sh \
  --training.enable_float8_linear \
  --training.enable_fsdp_float8_all_gather \
  --training.precompute_float8_dynamic_scale_for_fsdp \
  --training.compile
```
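The `--training.precompute_float8_dynamic_scale_for_fsdp` flag corresponds to a hook that runs after each optimizer step. A rough sketch, under the assumption that float8_experimental exposes `precompute_float8_dynamic_scale_for_fsdp` in its FSDP utilities (not the exact torchtitan call site):

```python
# Sketch: after the optimizer step, precompute dynamic float8 scales for all
# FSDP-managed float8 weights in one pass, so the scales are ready before the
# next forward's float8 all-gather. The import path is an assumption.
from float8_experimental.fsdp_utils import precompute_float8_dynamic_scale_for_fsdp

def training_step(model, optimizer, loss_fn, batch, labels):
    loss = loss_fn(model(batch), labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    precompute_float8_dynamic_scale_for_fsdp(model)  # amortized scale computation across params
    return loss
```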
profiler traces: I can see the compiled region in the CPU thread and the float8 matmul `sm90_xmma_gemm_e4m3bf16...` in the CUDA stream
<img width="1468" alt="Screenshot 2024-07-18 at 4 22 17 PM"
src="https://github.com/user-attachments/assets/0cf58dee-aae1-4582-a3f1-b8aa48b45129">1 parent 2937167 commit 9b37408
1 file changed: 12 insertions (+), 0 deletions (−); the additions span lines 308–319 of the changed file.