Commit b59e5c9

BambaFlashAttentionKwargs
1 parent a47588a commit b59e5c9

File tree

2 files changed (+76, -117 lines):

    src/transformers/models/bamba/modeling_bamba.py
    src/transformers/models/bamba/modular_bamba.py


src/transformers/models/bamba/modeling_bamba.py

Lines changed: 35 additions & 58 deletions
@@ -24,7 +24,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Callable, Optional, Tuple, Union
+from functools import partial
+from typing import Callable, Optional, Tuple, TypedDict, Union
 
 import torch
 from torch import nn
@@ -61,6 +62,31 @@
 logger = logging.get_logger(__name__)
 
 
+class BambaFlashAttentionKwargs(TypedDict, total=False):
+    """
+    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
+    Use cases include padding-free training and fewer `torch.compile` graph breaks.
+
+    Attributes:
+        cu_seq_lens_q (`torch.LongTensor`):
+            Cumulative sequence lengths for the query state.
+        cu_seq_lens_k (`torch.LongTensor`):
+            Cumulative sequence lengths for the key state.
+        max_length_q (`int`):
+            Maximum sequence length for the query state.
+        max_length_k (`int`):
+            Maximum sequence length for the key state.
+        seq_idx (`torch.IntTensor`):
+            Index of each packed sequence.
+    """
+
+    cu_seq_lens_q: torch.LongTensor
+    cu_seq_lens_k: torch.LongTensor
+    max_length_q: int
+    max_length_k: int
+    seq_idx: torch.IntTensor
+
+
 # Adapted from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache for the v2 mixer
 class HybridMambaAttentionDynamicCache(modeling_jamba.HybridMambaAttentionDynamicCache):
     """
@@ -866,6 +892,7 @@ def forward(
         cache_position: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         seq_idx: Optional[torch.IntTensor] = None,
+        **kwargs,
     ):
         if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
             return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask, seq_idx)
@@ -946,12 +973,7 @@ def forward(
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        cu_seq_lens_q: Optional[torch.LongTensor] = None,
-        cu_seq_lens_k: Optional[torch.LongTensor] = None,
-        max_length_q: Optional[int] = None,
-        max_length_k: Optional[int] = None,
-        seq_idx: Optional[torch.IntTensor] = None,
-        **kwargs,
+        **kwargs: Unpack[BambaFlashAttentionKwargs],
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
@@ -970,20 +992,9 @@ def forward(
             position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                 Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                 with `head_dim` being the embedding dimension of each attention head.
-            cu_seq_lens_q (`torch.LongTensor`, *optional*):
-                Cumulative query sequence lengths. For padding-free training with flash attention.
-            cu_seq_lens_k (`torch.LongTensor`, *optional*):
-                Cumulative key sequence lengths. For padding-free training with flash attention.
-            max_length_q (`int`, *optional*):
-                Maximum query length. For padding-free training with flash attention.
-            max_length_k (`int`, *optional*):
-                Maximum key length. For padding-free training with flash attention.
-            seq_idx (`torch.IntTensor`, *optional*):
-                Sequence index of each packed example. For padding-free training with fast mamba_ssm and causal_conv1d
-                kernels.
             kwargs (`dict`, *optional*):
-                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
-                into the model
+                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
+                padding-free training and/or improve torch.compile performance.
         """
 
         residual = hidden_states
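The `**kwargs: Unpack[...]` annotation only affects static type checking: at runtime the kwargs remain an ordinary dict, but a type checker can verify the keyword names and value types against the TypedDict (PEP 692). A small self-contained sketch of the pattern, using a hypothetical `ExampleKwargs` unrelated to the Bamba code:

    from typing_extensions import TypedDict, Unpack  # Unpack also lives in typing on newer Pythons


    class ExampleKwargs(TypedDict, total=False):
        max_length_q: int
        max_length_k: int


    def forward_stub(**kwargs: Unpack[ExampleKwargs]) -> None:
        # At runtime this is a plain dict; the annotation documents which
        # keyword arguments a type checker should accept.
        print(kwargs.get("max_length_q"))


    forward_stub(max_length_q=128)      # accepted
    forward_stub(max_length_q="oops")   # still runs, but flagged by a type checker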
@@ -997,7 +1008,7 @@ def forward(
                 cache_params=past_key_value,
                 cache_position=cache_position,
                 attention_mask=attention_mask,
-                seq_idx=seq_idx,
+                **kwargs,
             )
             self_attn_weights = None
         elif self.layer_type == "attention":
@@ -1010,10 +1021,6 @@ def forward(
                 use_cache=use_cache,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
-                cu_seq_lens_q=cu_seq_lens_q,
-                cu_seq_lens_k=cu_seq_lens_k,
-                max_length_q=max_length_q,
-                max_length_k=max_length_k,
                 **kwargs,
             )
 
@@ -1104,12 +1111,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        cu_seq_lens_q: Optional[torch.LongTensor] = None,
-        cu_seq_lens_k: Optional[torch.LongTensor] = None,
-        max_length_q: Optional[int] = None,
-        max_length_k: Optional[int] = None,
-        seq_idx: Optional[torch.IntTensor] = None,
-        **kwargs,  # NOOP kwargs, for now
+        **kwargs: Unpack[BambaFlashAttentionKwargs],
     ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1120,22 +1122,6 @@ def forward(
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
-        padding_free_kwargs = (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx)
-        num_padding_free_kwargs_used = sum(k is not None for k in padding_free_kwargs)
-        if num_padding_free_kwargs_used:
-            if num_padding_free_kwargs_used != len(padding_free_kwargs):
-                raise ValueError(
-                    "All of (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx) must be specified for padding-free training."
-                )
-            if position_ids is None:
-                raise ValueError(
-                    "position_ids must also be specified when (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx) are provided for padding-free training."
-                )
-            if attention_mask is not None:
-                raise ValueError(
-                    "attention_mask must be None when providing (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx, position_ids) for padding-free training."
-                )
-
         if self.gradient_checkpointing and self.training and use_cache:
             logger.warning_once(
                 "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
@@ -1177,7 +1163,7 @@ def forward(
 
         if self.gradient_checkpointing and self.training:
             layer_outputs = self._gradient_checkpointing_func(
-                decoder_layer.__call__,
+                partial(decoder_layer.__call__, **kwargs),
                 hidden_states,
                 layer_mask,
                 position_ids,
@@ -1186,11 +1172,6 @@ def forward(
                     use_cache,
                     cache_position,
                     position_embeddings,
-                    cu_seq_lens_q,
-                    cu_seq_lens_k,
-                    max_length_q,
-                    max_length_k,
-                    seq_idx,
                 )
             else:
                 layer_outputs = decoder_layer(
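The switch to `partial(decoder_layer.__call__, **kwargs)` works because the gradient-checkpointing wrapper forwards the positional arguments it is given; binding the keyword-only extras up front lets them reach the layer on both the forward and the recomputation pass. A standalone sketch of the same idea with plain torch.utils.checkpoint (the layer function here is a stand-in, not the Bamba decoder layer):

    from functools import partial

    import torch
    from torch.utils.checkpoint import checkpoint


    def layer_fn(hidden_states, scale=1.0):
        # Stand-in for a decoder layer; `scale` plays the role of the extra kwargs.
        return hidden_states * scale


    x = torch.randn(2, 4, requires_grad=True)

    # Bind the keyword argument first, then hand a positional-only callable
    # to the checkpoint wrapper, mirroring partial(decoder_layer.__call__, **kwargs).
    out = checkpoint(partial(layer_fn, scale=0.5), x, use_reentrant=False)
    out.sum().backward()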
@@ -1202,11 +1183,7 @@ def forward(
                     use_cache=use_cache,
                     cache_position=cache_position,
                     position_embeddings=position_embeddings,
-                    cu_seq_lens_q=cu_seq_lens_q,
-                    cu_seq_lens_k=cu_seq_lens_k,
-                    max_length_q=max_length_q,
-                    max_length_k=max_length_k,
-                    seq_idx=seq_idx,
+                    **kwargs,
                 )
 
             hidden_states = layer_outputs[0]

src/transformers/models/bamba/modular_bamba.py

Lines changed: 41 additions & 59 deletions
@@ -19,7 +19,8 @@
 # limitations under the License.
 """PyTorch Bamba model."""
 
-from typing import Optional, Tuple, Union
+from functools import partial
+from typing import Optional, Tuple, TypedDict, Union
 
 import torch
 import torch.utils.checkpoint
@@ -46,7 +47,12 @@
 from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from ...modeling_utils import PreTrainedModel
-from ...utils import auto_docstring, can_return_tuple, logging
+from ...processing_utils import Unpack
+from ...utils import (
+    auto_docstring,
+    can_return_tuple,
+    logging,
+)
 from ...utils.import_utils import is_causal_conv1d_available, is_flash_attn_2_available, is_mamba_2_ssm_available
 from .configuration_bamba import BambaConfig
 
@@ -71,6 +77,31 @@
 logger = logging.get_logger(__name__)
 
 
+class BambaFlashAttentionKwargs(TypedDict, total=False):
+    """
+    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
+    Use cases include padding-free training and fewer `torch.compile` graph breaks.
+
+    Attributes:
+        cu_seq_lens_q (`torch.LongTensor`):
+            Cumulative sequence lengths for the query state.
+        cu_seq_lens_k (`torch.LongTensor`):
+            Cumulative sequence lengths for the key state.
+        max_length_q (`int`):
+            Maximum sequence length for the query state.
+        max_length_k (`int`):
+            Maximum sequence length for the key state.
+        seq_idx (`torch.IntTensor`):
+            Index of each packed sequence.
+    """
+
+    cu_seq_lens_q: torch.LongTensor
+    cu_seq_lens_k: torch.LongTensor
+    max_length_q: int
+    max_length_k: int
+    seq_idx: torch.IntTensor
+
+
 # Adapted from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache for the v2 mixer
 class HybridMambaAttentionDynamicCache(modeling_jamba.HybridMambaAttentionDynamicCache):
     """
@@ -657,6 +688,7 @@ def forward(
         cache_position: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         seq_idx: Optional[torch.IntTensor] = None,
+        **kwargs,
     ):
         if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
             return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask, seq_idx)
@@ -708,12 +740,7 @@ def forward(
         use_cache: Optional[bool] = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        cu_seq_lens_q: Optional[torch.LongTensor] = None,
-        cu_seq_lens_k: Optional[torch.LongTensor] = None,
-        max_length_q: Optional[int] = None,
-        max_length_k: Optional[int] = None,
-        seq_idx: Optional[torch.IntTensor] = None,
-        **kwargs,
+        **kwargs: Unpack[BambaFlashAttentionKwargs],
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
@@ -732,20 +759,9 @@ def forward(
             position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                 Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                 with `head_dim` being the embedding dimension of each attention head.
-            cu_seq_lens_q (`torch.LongTensor`, *optional*):
-                Cumulative query sequence lengths. For padding-free training with flash attention.
-            cu_seq_lens_k (`torch.LongTensor`, *optional*):
-                Cumulative key sequence lengths. For padding-free training with flash attention.
-            max_length_q (`int`, *optional*):
-                Maximum query length. For padding-free training with flash attention.
-            max_length_k (`int`, *optional*):
-                Maximum key length. For padding-free training with flash attention.
-            seq_idx (`torch.IntTensor`, *optional*):
-                Sequence index of each packed example. For padding-free training with fast mamba_ssm and causal_conv1d
-                kernels.
             kwargs (`dict`, *optional*):
-                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
-                into the model
+                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
+                padding-free training and/or improve torch.compile performance.
         """
 
         residual = hidden_states
@@ -759,7 +775,7 @@ def forward(
                 cache_params=past_key_value,
                 cache_position=cache_position,
                 attention_mask=attention_mask,
-                seq_idx=seq_idx,
+                **kwargs,
             )
             self_attn_weights = None
         elif self.layer_type == "attention":
@@ -772,10 +788,6 @@ def forward(
                 use_cache=use_cache,
                 cache_position=cache_position,
                 position_embeddings=position_embeddings,
-                cu_seq_lens_q=cu_seq_lens_q,
-                cu_seq_lens_k=cu_seq_lens_k,
-                max_length_q=max_length_q,
-                max_length_k=max_length_k,
                 **kwargs,
             )
 
@@ -866,12 +878,7 @@ def forward(
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
-        cu_seq_lens_q: Optional[torch.LongTensor] = None,
-        cu_seq_lens_k: Optional[torch.LongTensor] = None,
-        max_length_q: Optional[int] = None,
-        max_length_k: Optional[int] = None,
-        seq_idx: Optional[torch.IntTensor] = None,
-        **kwargs,  # NOOP kwargs, for now
+        **kwargs: Unpack[BambaFlashAttentionKwargs],
     ) -> BaseModelOutputWithPast:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -882,22 +889,6 @@ def forward(
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
-        padding_free_kwargs = (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx)
-        num_padding_free_kwargs_used = sum(k is not None for k in padding_free_kwargs)
-        if num_padding_free_kwargs_used:
-            if num_padding_free_kwargs_used != len(padding_free_kwargs):
-                raise ValueError(
-                    "All of (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx) must be specified for padding-free training."
-                )
-            if position_ids is None:
-                raise ValueError(
-                    "position_ids must also be specified when (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx) are provided for padding-free training."
-                )
-            if attention_mask is not None:
-                raise ValueError(
-                    "attention_mask must be None when providing (cu_seq_lens_q, cu_seq_lens_k, max_length_q, max_length_k, seq_idx, position_ids) for padding-free training."
-                )
-
         if self.gradient_checkpointing and self.training and use_cache:
             logger.warning_once(
                 "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
@@ -939,7 +930,7 @@ def forward(
 
         if self.gradient_checkpointing and self.training:
             layer_outputs = self._gradient_checkpointing_func(
-                decoder_layer.__call__,
+                partial(decoder_layer.__call__, **kwargs),
                 hidden_states,
                 layer_mask,
                 position_ids,
@@ -948,11 +939,6 @@ def forward(
                     use_cache,
                     cache_position,
                     position_embeddings,
-                    cu_seq_lens_q,
-                    cu_seq_lens_k,
-                    max_length_q,
-                    max_length_k,
-                    seq_idx,
                 )
             else:
                 layer_outputs = decoder_layer(
@@ -964,11 +950,7 @@ def forward(
                     use_cache=use_cache,
                     cache_position=cache_position,
                     position_embeddings=position_embeddings,
-                    cu_seq_lens_q=cu_seq_lens_q,
-                    cu_seq_lens_k=cu_seq_lens_k,
-                    max_length_q=max_length_q,
-                    max_length_k=max_length_k,
-                    seq_idx=seq_idx,
+                    **kwargs,
                 )
 
             hidden_states = layer_outputs[0]
