
Commit 1b03423

davidberard98 authored and pytorchmergebot committed
[meta registration] fix _efficient_attention_forward for jagged inputs (pytorch#118657)
Fixes the meta registration for the logsumexp output, whose shape should be defined by the size of the offsets tensor when it exists. See https://github.com/pytorch/pytorch/blob/644f64f2d112b7c0b758b044821cf3972c0c17e9/aten/src/ATen/native/transformers/cuda/attention.cu#L1045

Differential Revision: [D53234217](https://our.internmc.facebook.com/intern/diff/D53234217)
Pull Request resolved: pytorch#118657
Approved by: https://github.com/YuqingJ
1 parent 6fa162e commit 1b03423
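
For context, here is a minimal sketch of the shape rule this fix implements (not the actual meta kernel; the helper name and example numbers are illustrative): when `cu_seqlens_q` offsets are present, the logsumexp batch dimension is the number of packed sequences, `cu_seqlens_q.size(0) - 1`, rather than the dense batch size `B`.

```python
import math
import torch

def logsumexp_shape(B, M, num_heads, compute_log_sumexp, cu_seqlens_q=None):
    # Illustrative helper, not from the PR: mirrors the meta registration's locals.
    # With jagged inputs, cu_seqlens_q holds cumulative offsets, so the number of
    # sequences is size(0) - 1; otherwise the dense batch size B is used.
    batch_dim = cu_seqlens_q.size(0) - 1 if cu_seqlens_q is not None else B
    logsumexp_dim = math.ceil(M / 32) * 32 if compute_log_sumexp else 0
    return (batch_dim, num_heads, logsumexp_dim)

# Dense case: batch dim is B.
print(logsumexp_shape(B=4, M=64, num_heads=8, compute_log_sumexp=True))  # (4, 8, 64)

# Jagged case: 4 sequences packed into one batch, described by 5 offsets.
offsets = torch.tensor((0, 2, 4, 6, 8), dtype=torch.int32)
print(logsumexp_shape(B=1, M=8, num_heads=8, compute_log_sumexp=True,
                      cu_seqlens_q=offsets))  # (4, 8, 32)
```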

File tree

2 files changed: +22 -1 lines changed


torch/_meta_registrations.py (+2 -1)

@@ -5447,9 +5447,10 @@ def meta__efficient_attention_forward(
 
     res = torch.empty(B, M, num_heads, Kv, dtype=query.dtype, device=query.device)
 
+    logsumexp_batch_dim = cu_seqlens_q.size(0) - 1 if (cu_seqlens_q is not None) else B
     logsumexp_dim = math.ceil(M / 32) * 32 if compute_log_sumexp else 0
     logsum_exp = torch.empty(
-        (B, num_heads, logsumexp_dim),
+        (logsumexp_batch_dim, num_heads, logsumexp_dim),
         dtype=torch.float,
         device=query.device,
     )

torch/testing/_internal/common_methods_invocations.py (+20)

@@ -8531,6 +8531,26 @@ def sample_inputs_efficient_attention_forward(op_info, device, dtype, requires_g
         )
     )
 
+    # jagged (with query/keys offsets)
+    samples.append(
+        SampleInput(
+            make((4, 2, 64)).view(-1, 8, 8).unsqueeze(0),
+            make((6, 64)).view(-1, 8, 8).unsqueeze(0),
+            make((6, 64)).view(-1, 8, 8).unsqueeze(0),
+            bias=None,
+            cu_seqlens_q=torch.tensor((0, 2, 4, 6, 8), dtype=torch.int32, device=device),
+            cu_seqlens_k=torch.tensor((0, 1, 3, 5, 6), dtype=torch.int32, device=device),
+            max_seqlen_q=2,
+            max_seqlen_k=2,
+            dropout_p=0.0,
+            custom_mask_type=0,  # No Mask
+            compute_log_sumexp=requires_grad,
+            scale=None,
+            causal_diagonal=None,
+            seqlen_k=None,
+        )
+    )
+
     yield from samples
 
 
 def sample_inputs_flash_attention_forward(op_info, device, dtype, requires_grad, **kwargs):
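
As a sanity check on the new jagged sample (illustrative only; `make` below is a stand-in for the OpInfo tensor-maker used in the test file), the packed shapes and offsets line up as follows:

```python
import torch

def make(shape):
    # Stand-in for the OpInfo `make` helper used in common_methods_invocations.py.
    return torch.randn(shape)

query = make((4, 2, 64)).view(-1, 8, 8).unsqueeze(0)  # 512 elements -> (1, 8, 8, 8): 8 packed query tokens
key = make((6, 64)).view(-1, 8, 8).unsqueeze(0)       # 384 elements -> (1, 6, 8, 8): 6 packed key tokens

cu_seqlens_q = torch.tensor((0, 2, 4, 6, 8), dtype=torch.int32)  # 4 sequences of length 2 -> 8 query tokens
cu_seqlens_k = torch.tensor((0, 1, 3, 5, 6), dtype=torch.int32)  # lengths 1, 2, 2, 1 -> 6 key tokens

assert query.shape == (1, 8, 8, 8)
assert key.shape == (1, 6, 8, 8)
# With offsets present, the meta registration now sizes logsumexp's batch
# dimension as cu_seqlens_q.size(0) - 1 = 4 instead of the packed B = 1.
assert cu_seqlens_q.size(0) - 1 == 4
```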
