Commit 8c6bf93

Remove flex_attn configs from JobConfig (#1111)
There are conflicts between JobConfig and ModelArgs. Specifically, if ModelArgs fields are exposed through JobConfig, users have to control them via toml files or command-line arguments. For some flex_attn settings this requirement doesn't make sense, since each model flavor already knows the configuration it needs. This PR removes these options from JobConfig and uses the model flavor to control whether flex_attn is enabled.
1 parent 738000f commit 8c6bf93
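
As an illustration of the new pattern, here is a minimal, simplified sketch (not the actual torchtitan code; the dataclass below is a stand-in for the much larger TransformerModelArgs) showing how a flavor now carries its own attention settings, mirroring the debugmodel_flex_attn flavor added in torchtitan/models/llama3/__init__.py below:

from dataclasses import dataclass


@dataclass
class TransformerModelArgs:
    # Simplified stand-in; the real class has many more fields.
    dim: int = 256
    n_layers: int = 6
    n_heads: int = 16
    rope_theta: float = 500000
    # FlexAttention is now a model-level default, off unless a flavor opts in.
    use_flex_attn: bool = False
    attn_mask_type: str = "causal"


# Each flavor bakes in the attention settings it needs, instead of reading
# them from JobConfig (toml / command-line arguments).
llama3_configs = {
    "debugmodel": TransformerModelArgs(),
    "debugmodel_flex_attn": TransformerModelArgs(
        use_flex_attn=True,
        attn_mask_type="block_causal",
    ),
}

# Users select a flavor, e.g. --model.flavor=debugmodel_flex_attn, rather
# than passing --model.use_flex_attn / --model.attn_mask_type directly.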

File tree

9 files changed, +49 -51 lines changed

tests/integration_tests.py

+1 -2

@@ -322,8 +322,7 @@ def build_test_list():
             [
                 "--parallelism.data_parallel_shard_degree=4",
                 "--activation_checkpoint.mode='full'",
-                "--model.use_flex_attn",
-                "--model.attn_mask_type='block_causal'",
+                "--model.flavor=debugmodel_flex_attn",
             ]
         ],
         "FSDP+FLEX_ATTN",

torchtitan/config_manager.py

-19

@@ -193,25 +193,6 @@ def __init__(self):
             choices=["layernorm", "np_layernorm", "rmsnorm"],
             help="Type of layer normalization to use [layernorm, np_layernorm, rmsnorm]",
         )
-        self.parser.add_argument(
-            "--model.use_flex_attn",
-            action="store_true",
-            help="""
-                Whether to use Flex Attention.
-                Mixed usage of SDPA and FlexAttention is not upported yet.
-            """,
-        )
-        self.parser.add_argument(
-            "--model.attn_mask_type",
-            type=str,
-            default="causal",
-            choices=["causal", "block_causal"],
-            help="""
-                Specifies the type of bias/mask used for attention. If SDPA is used,
-                only the causal mask is supported by default. If FlexAttention is used,
-                both causal and block_causal masks are supported.
-            """,
-        )
         self.parser.add_argument(
             "--model.tokenizer_path",
             type=str,

torchtitan/experiments/llama4/__init__.py

+14 -2

@@ -29,8 +29,6 @@
         n_layers=6,
         n_heads=16,
         rope_theta=500000,
-        every_n_layers_nope=4,
-        fixed_attn_block_size=256,
     ),
     "17bx16e": TransformerModelArgs(
         dim=5120,
@@ -53,6 +51,16 @@
         rope_theta=500000,
         num_experts=128,
     ),
+    "debugmodel_irope": TransformerModelArgs(
+        dim=256,
+        n_layers=6,
+        n_heads=16,
+        rope_theta=500000,
+        every_n_layers_nope=4,
+        fixed_attn_block_size=256,
+        use_flex_attn=True,
+        attn_mask_type="block_causal",
+    ),
     "17bx16e_irope": TransformerModelArgs(
         dim=5120,
         n_layers=48,
@@ -64,6 +72,8 @@
         num_experts=16,
         interleave_moe_layer_step=1,
         every_n_layers_nope=4,
+        use_flex_attn=True,
+        attn_mask_type="block_causal",
     ),
     "17bx128e_irope": TransformerModelArgs(
         dim=5120,
@@ -75,6 +85,8 @@
         rope_theta=500000,
         num_experts=128,
         every_n_layers_nope=4,
+        use_flex_attn=True,
+        attn_mask_type="block_causal",
     ),
 }

torchtitan/experiments/llama4/infra/parallelize_llama.py

-8

@@ -65,14 +65,6 @@ def parallelize_llama(
         apply_moe_tp(model, world_mesh["tp"])

     if job_config.activation_checkpoint.mode != "none":
-        if (
-            job_config.activation_checkpoint.mode == "selective"
-            and job_config.model.use_flex_attn
-        ):
-            raise ValueError(
-                "FlexAttention is not compatible with selective AC yet. "
-                "See https://github.com/pytorch/pytorch/issues/147879"
-            )
         apply_ac(model, job_config.activation_checkpoint)

     # turn on per-TransformerBlock compile after AC wrapping and before FSDP

torchtitan/experiments/llama4/model/args.py

+14 -3

@@ -34,8 +34,8 @@ class TransformerModelArgs(BaseModelArgs):
     depth_init: bool = True
     norm_type: str = "rmsnorm"

-    use_flex_attn: bool = True
-    attn_mask_type: str = "block_causal"
+    use_flex_attn: bool = False
+    attn_mask_type: str = "causal"
     eos_id: int = 0
     # iRoPE settings
     # When ``every_n_layers_nope`` is specified, NoPE (no positional embedding) is
@@ -62,13 +62,24 @@ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
         self.norm_type = job_config.model.norm_type
         self.vocab_size = tokenizer.n_words
         self.max_seq_len = job_config.training.seq_len
-        self.use_flex_attn = job_config.model.use_flex_attn
         if self.use_grouped_mm and not has_cuda_capability(9, 0):
             logger.warning(
                 "Failed to use grouped mm, which is only supported on SM90 or later",
             )
             self.use_grouped_mm = False

+        if job_config.activation_checkpoint.mode == "selective" and self.use_flex_attn:
+            raise ValueError(
+                "FlexAttention is not compatible with selective AC yet. "
+                "See https://github.com/pytorch/pytorch/issues/147879"
+            )
+
+        if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn:
+            raise ValueError(
+                "FlexAttention is not compatible with CP yet. "
+                "We are still working on this."
+            )
+
     def get_nparams_and_flops(
         self, model: nn.Module, seq_len: int
     ) -> tuple[int, float]:

torchtitan/experiments/llama4/train_configs/debug_model.toml

-2

@@ -25,8 +25,6 @@ norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
 # test tokenizer.model, for debug purpose only
 tokenizer_path = "./tests/assets/test_tiktoken.model"
 # converters = "float8"
-use_flex_attn = false
-attn_mask_type = "causal" # causal / block_causal

 [optimizer]
 # TODO: currently grouped mm in MoE doesn't work with AdamW, need to investigate

torchtitan/models/llama3/__init__.py

+8

@@ -30,6 +30,14 @@
     "debugmodel": TransformerModelArgs(
         dim=256, n_layers=6, n_heads=16, rope_theta=500000
     ),
+    "debugmodel_flex_attn": TransformerModelArgs(
+        dim=256,
+        n_layers=6,
+        n_heads=16,
+        rope_theta=500000,
+        use_flex_attn=True,
+        attn_mask_type="block_causal",
+    ),
     "8B": TransformerModelArgs(
         dim=4096,
         n_layers=32,

torchtitan/models/llama3/model.py

+12 -2

@@ -47,8 +47,18 @@ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
         self.norm_type = job_config.model.norm_type
         self.vocab_size = tokenizer.n_words
         self.max_seq_len = job_config.training.seq_len
-        self.use_flex_attn = job_config.model.use_flex_attn
-        self.attn_mask_type = job_config.model.attn_mask_type
+
+        if job_config.activation_checkpoint.mode == "selective" and self.use_flex_attn:
+            raise ValueError(
+                "FlexAttention is not compatible with selective AC yet. "
+                "See https://github.com/pytorch/pytorch/issues/147879"
+            )
+
+        if job_config.parallelism.context_parallel_degree > 1 and self.use_flex_attn:
+            raise ValueError(
+                "FlexAttention is not compatible with CP yet. "
+                "We are still working on this."
+            )

     def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
         nparams = sum(p.numel() for p in model.parameters())

torchtitan/models/llama3/parallelize_llama.py

-13

@@ -72,19 +72,6 @@ def parallelize_llama(
             enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
         )

-    if job_config.model.use_flex_attn:
-        if job_config.activation_checkpoint.mode == "selective":
-            raise ValueError(
-                "FlexAttention is not compatible with selective AC yet. "
-                "See https://github.com/pytorch/pytorch/issues/147879"
-            )
-
-        if parallel_dims.cp_enabled:
-            raise ValueError(
-                "FlexAttention is not compatible with CP yet. "
-                "We are still working on this."
-            )
-
     if job_config.activation_checkpoint.mode != "none":
         apply_ac(model, job_config.activation_checkpoint)
