|
7 | 7 | from dataclasses import dataclass
|
8 | 8 |
|
9 | 9 | import torch
|
| 10 | +import torch.nn.functional as F |
10 | 11 | import torch._dynamo as torchdynamo
|
11 | 12 | from functorch.experimental.control_flow import cond, map
|
12 | 13 | from torch import Tensor
|
|
35 | 36 | from torch.export import Constraint, Dim
|
36 | 37 | from torch.fx.experimental.proxy_tensor import make_fx
|
37 | 38 | from torch.testing import FileCheck
|
| 39 | +from torch.testing._internal.common_cuda import ( |
| 40 | +    PLATFORM_SUPPORTS_FLASH_ATTENTION, |
| 41 | +) |
| 42 | +from torch.testing._internal.common_device_type import ( |
| 43 | +    onlyCPU, |
| 44 | +    onlyCUDA, |
| 45 | +) |
38 | 46 | from torch.testing._internal.common_utils import run_tests
|
39 | 47 | from torch._dynamo.test_case import TestCase
|
40 | 48 | from torch.utils._pytree import (
|
@@ -2528,6 +2536,74 @@ def forward(self, x):
|
2528 | 2536 | # this doesn't work today
|
2529 | 2537 | gm_unflat_strict = unflatten(ep)
|
2530 | 2538 |
|
| 2539 | +@unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't supported") |
| 2540 | +class TestOneOffModelExportResult(TestCase): |
| 2541 | +    def test_scaled_dot_product_attention_cpu(self): |
| 2542 | +        """ |
| 2543 | +        This test makes sure we always get the same decomposition result for SDPA. |
| 2544 | +        As of now, _scaled_dot_product_flash_attention_for_cpu is expected to show up |
| 2545 | +        in the export() result. Some downstream backends then further decompose it into |
| 2546 | +        core ATen ops in torch/_decomp/decompositions.py (search for |
| 2547 | +        _scaled_dot_product_flash_attention_for_cpu). |
| 2548 | +
|
| 2549 | +        Export decomposes SDPA based on its CompositeImplicitAutograd kernel |
| 2550 | +        implementation. If this test fails, it means the kernel has been modified. In |
| 2551 | +        that case we strongly encourage you to update the decomposition rule in |
| 2552 | +        torch/_decomp/decompositions.py along with the kernel changes, so that |
| 2553 | +        downstream backends are not affected. |
| 2554 | +        """ |
| 2555 | +        class ScaledDotProductAttention(torch.nn.Module): |
| 2556 | +            def __init__(self): |
| 2557 | +                super().__init__() |
| 2558 | + |
| 2559 | +            def forward(self, q, k, v): |
| 2560 | +                attn_output = F.scaled_dot_product_attention( |
| 2561 | +                    q, k, v, None, dropout_p=0.0, is_causal=True |
| 2562 | +                ) |
| 2563 | +                return attn_output |
| 2564 | +        q = torch.randn(1, 1, 8, 8, device="cpu") |
| 2565 | +        k = torch.randn(1, 1, 8, 8, device="cpu") |
| 2566 | +        v = torch.randn(1, 1, 8, 8, device="cpu") |
| 2567 | + |
| 2568 | +        ep = torch.export.export(ScaledDotProductAttention(), (q, k, v)) |
| 2569 | +        self.assertExpectedInline(ep.graph_module.code.strip(), """\ |
| 2570 | +def forward(self, l_q_, l_k_, l_v_): |
| 2571 | +    _scaled_dot_product_flash_attention_for_cpu = torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default(l_q_, l_k_, l_v_, 0.0, True); l_q_ = l_k_ = l_v_ = None |
| 2572 | +    getitem = _scaled_dot_product_flash_attention_for_cpu[0]; _scaled_dot_product_flash_attention_for_cpu = None |
| 2573 | +    return (getitem,)""") |
| 2574 | + |
| 2575 | +    @unittest.skipIf( |
| 2576 | +        not PLATFORM_SUPPORTS_FLASH_ATTENTION, |
| 2577 | +        "Can't run fused SDPA on this platform", |
| 2578 | +    ) |
| 2579 | +    def test_scaled_dot_product_attention_cuda(self): |
| 2580 | +        """ |
| 2581 | +        This test makes sure we always get the same decomposition result for SDPA. |
| 2582 | +        As of now, _scaled_dot_product_flash_attention is expected to show up in the |
| 2583 | +        export() result when GPU tensors are given. Currently no downstream backend |
| 2584 | +        relies on this export result, so if this test fails, feel free to update it |
| 2585 | +        to the latest export() result. |
| 2586 | +        """ |
| 2587 | +        class ScaledDotProductAttention(torch.nn.Module): |
| 2588 | +            def __init__(self): |
| 2589 | +                super().__init__() |
| 2590 | + |
| 2591 | +            def forward(self, q, k, v): |
| 2592 | +                attn_output = F.scaled_dot_product_attention( |
| 2593 | +                    q, k, v, None, dropout_p=0.0, is_causal=True |
| 2594 | +                ) |
| 2595 | +                return attn_output |
| 2596 | +        q = torch.randn(1, 16, 16, 64, dtype=torch.bfloat16, device="cuda") |
| 2597 | +        k = torch.randn(1, 16, 16, 64, dtype=torch.bfloat16, device="cuda") |
| 2598 | +        v = torch.randn(1, 16, 16, 64, dtype=torch.bfloat16, device="cuda") |
| 2599 | + |
| 2600 | +        ep = torch.export.export(ScaledDotProductAttention(), (q, k, v)) |
| 2601 | +        self.assertExpectedInline(ep.graph_module.code.strip(), """\ |
| 2602 | +def forward(self, l_q_, l_k_, l_v_): |
| 2603 | +    _scaled_dot_product_flash_attention = torch.ops.aten._scaled_dot_product_flash_attention.default(l_q_, l_k_, l_v_, 0.0, True, scale = 0.125); l_q_ = l_k_ = l_v_ = None |
| 2604 | +    getitem = _scaled_dot_product_flash_attention[0]; _scaled_dot_product_flash_attention = None |
| 2605 | +    return (getitem,)""") |
| 2606 | + |
2531 | 2607 |
|
2532 | 2608 | if __name__ == '__main__':
|
2533 | 2609 | run_tests()
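For context on the decomposition the CPU test pins down: below is a minimal sketch of how a downstream backend could take the export() result and further lower _scaled_dot_product_flash_attention_for_cpu into core ATen ops via the rule in torch/_decomp/decompositions.py. It assumes ExportedProgram.run_decompositions() and torch._decomp.get_decompositions() are available in the PyTorch build being used; it is an illustration of the flow, not part of the committed test.

import torch
import torch.nn.functional as F
from torch._decomp import get_decompositions

class ScaledDotProductAttention(torch.nn.Module):
    def forward(self, q, k, v):
        return F.scaled_dot_product_attention(
            q, k, v, None, dropout_p=0.0, is_causal=True
        )

q = torch.randn(1, 1, 8, 8, device="cpu")
k = torch.randn(1, 1, 8, 8, device="cpu")
v = torch.randn(1, 1, 8, 8, device="cpu")

# export() keeps _scaled_dot_product_flash_attention_for_cpu in the graph,
# as asserted by test_scaled_dot_product_attention_cpu above.
ep = torch.export.export(ScaledDotProductAttention(), (q, k, v))

# Fetch the decomposition registered for that op (the rule in
# torch/_decomp/decompositions.py) and re-run the exported program through it,
# so the resulting graph contains only core ATen ops.
decomp_table = get_decompositions(
    [torch.ops.aten._scaled_dot_product_flash_attention_for_cpu.default]
)
core_aten_ep = ep.run_decompositions(decomp_table)
print(core_aten_ep.graph_module.code)

Keeping that decomposition rule in sync with the CompositeImplicitAutograd kernel is exactly what the docstring asks of anyone modifying the SDPA kernel.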
|