check fake/real mismatches during real tensor prop (pytorch#137747)

pianpwk · pytorchmergebot · commit a678eaf1adac · 2024-11-04T23:39:48.000Z
Summary: While testing exportability for PT2 Inference models, we found various cases of invalid op inputs during tracing, for example errors like: `a and b must have same reduction dim`, `expected scalar type Long but found Int`, etc. Looking more closely, these happened due to the same few meta kernels & eager kernels producing mismatched outputs upstream (e.g. different output tensor dtype, int output). Adding checks to catch mismatched outputs in real tensor prop upstream, so errors are raised at the mismatched op, instead of the downstream ops taking them as inputs. Relies a lot on utils from [CrossRefFakeMode](https://github.com/pytorch/pytorch/blob/929797dedbf23376123ce95230c01a7e3b71e130/torch/_subclasses/fake_utils.py#L78). Follow ups: could add more checks, and maybe have a flag to only enable these for cases like draft mode, so perf doesn't suffer? Test Plan: test_export, test_fake_tensor Differential Revision: D64210055 Pull Request resolved: pytorch#137747 Approved by: https://github.com/zou3519
diff --git a/test/export/test_export.py b/test/export/test_export.py
@@ -1078,6 +1078,120 @@ def forward(self, x):
         ep_model = export(model, (x,), strict=False).module()
         self.assertTrue(torch.allclose(model(x), ep_model(x)))
 
+    def test_real_tensor_size_mismatch(self):
+        from torch._subclasses.fake_tensor import MetadataMismatchError
+
+        class M(torch.nn.Module):
+            def forward(self, a, b):
+                return torch.ops.mylib.foo(a, b)
+
+        @torch.library.custom_op("mylib::foo", mutates_args={})
+        def foo(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+            return a + b
+
+        @foo.register_fake
+        def foo_fake_impl(a, b):
+            m, n = a.shape
+            return torch.empty(n, m)  # incorrectly permute
+
+        error_type = (
+            MetadataMismatchError
+            if is_non_strict_test(self._testMethodName)
+            else torch._dynamo.exc.TorchRuntimeError
+        )
+        with torch._functorch.config.patch(fake_tensor_propagate_real_tensors=True):
+            # won't catch anything if dims are equal
+            export(
+                M(),
+                (torch.randn(4, 4), torch.randn(4, 4)),
+            )
+            # catch concrete inequality
+            with self.assertRaisesRegex(
+                error_type,
+                "Real tensor propagation found an output size mismatch between fake shape 8 and real shape 4, "
+                "at output index 0, dimension 0 for func: mylib.foo.default",
+            ):
+                export(
+                    M(),
+                    (torch.randn(4, 8), torch.randn(4, 8)),
+                )
+            # same test with dynamic shapes
+            d0 = Dim("d0")
+            d1 = Dim("d1")
+            export(
+                M(),
+                (torch.randn(4, 4), torch.randn(4, 4)),
+                dynamic_shapes={
+                    "a": (d0, d1),
+                    "b": (d0, d1),
+                },
+            )
+            with self.assertRaisesRegex(
+                error_type,
+                "Real tensor propagation found an output size mismatch between fake shape s1 and real shape 4, "
+                "at output index 0, dimension 0 for func: mylib.foo.default",
+            ):
+                export(
+                    M(),
+                    (torch.randn(4, 8), torch.randn(4, 8)),
+                    dynamic_shapes={
+                        "a": (d0, d1),
+                        "b": (d0, d1),
+                    },
+                )
+
+    def test_real_tensor_alias_dtype_mismatch(self):
+        from torch._subclasses.fake_tensor import MetadataMismatchError
+
+        error_type = (
+            MetadataMismatchError
+            if is_non_strict_test(self._testMethodName)
+            else torch._dynamo.exc.TorchRuntimeError
+        )
+
+        # test alias case
+        class M(torch.nn.Module):
+            def forward(self, a):
+                return torch.ops.mylib.foo_alias(a)
+
+        @torch.library.custom_op("mylib::foo_alias", mutates_args={})
+        def foo_alias(a: torch.Tensor) -> torch.Tensor:
+            return a * 2
+
+        @foo_alias.register_fake
+        def foo_fake_impl(a):
+            return a
+
+        with torch._functorch.config.patch(fake_tensor_propagate_real_tensors=True):
+            with self.assertRaisesRegex(
+                error_type,
+                r"Real tensor propagation found an aliasing mismatch between fake output (.*\n)*.* "
+                r"and real output (.*\n)*.* for func: mylib.foo_alias.default",
+            ):
+                ep = export(M(), (torch.randn(4, 4),))
+
+        # test dtype case
+        class N(torch.nn.Module):
+            def forward(self, a):
+                return torch.ops.mylib.foo_dtype(a)
+
+        @torch.library.custom_op("mylib::foo_dtype", mutates_args={})
+        def foo_dtype(a: torch.Tensor) -> torch.Tensor:
+            return a * 2
+
+        @foo_dtype.register_fake
+        def foo_fake_impl(a):
+            m, n = a.shape
+            return torch.empty([m, n], dtype=torch.int32)
+
+        with torch._functorch.config.patch(fake_tensor_propagate_real_tensors=True):
+            with self.assertRaisesRegex(
+                error_type,
+                r"Real tensor propagation found a metadata mismatch between fake tensor (.*\n)*.* "
+                r"and real tensor (.*\n)*.* at output index 0, for func: mylib.foo_dtype.default",
+            ):
+                ep = export(N(), (torch.randn(4, 4),))
+
     def test_real_tensor_for_max_op(self):
         class Foo(torch.nn.Module):
             def forward(self, x, y):
diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py
@@ -28,6 +28,7 @@
     _CacheKeyState,
     DynamicOutputShapeException,
     extract_tensor_metadata,
+    MetadataMismatchError,
     FakeTensor,
     FakeTensorConverter,
     FakeTensorMode,
@@ -1377,14 +1378,20 @@ def forward(self, arg1, arg2, arg3):
             try:
                 with torch._subclasses.CrossRefFakeMode():
                     Repro()(*args)
-            except RuntimeError as e:
+            except MetadataMismatchError as e:
                 # We expect the cross ref to succed for the first output to fail
                 # for the rng state, see Note [Seed and Offset]
                 self.assertTrue("output[0]" not in str(e))
-                self.assertTrue(
-                    "found mismatched tensor metadata for output[6]: Devices cpu and cuda:0 are not equal!"
-                    in str(e)
-                )
+                if self.__class__.__name__.startswith("PropagateRealTensors"):
+                    self.assertTrue(
+                        "Real tensor propagation found a metadata mismatch"
+                        in str(e)
+                    )
+                else:
+                    self.assertTrue(
+                        "found mismatched tensor metadata for output"
+                        in str(e)
+                    )
 
     # IMPORTANT!!! Always run even if CUDA is not available
     def test_fake_gpu_no_init(self):
diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py
@@ -2131,6 +2131,12 @@ def _compute_reduction_shape(self, dims, keepdim):
 def device_hint(tensor) -> "str":
     if isinstance(tensor, torch._subclasses.FakeTensor):
         return tensor.fake_device.type
+    elif (
+        hasattr(tensor, "device")
+        and hasattr(tensor.device, "type")
+        and tensor.device.type != "meta"
+    ):
+        return tensor.device.type
     else:
         return "cuda"  # default to cuda
 
diff --git a/torch/_prims_common/__init__.py b/torch/_prims_common/__init__.py
@@ -136,6 +136,7 @@ def _maybe_get_pytype(t):
 def compare_tensor_meta(
     a: TensorLikeType,
     b: TensorLikeType,
+    check_sizes=True,
     check_strides=False,
     *,
     allow_rhs_unbacked=False,
@@ -148,16 +149,20 @@ def compare_tensor_meta(
     In the future this will validate additional metadata, like
     strides.
     """
+    from torch._subclasses.fake_tensor import MetadataMismatchError
+
     assert isinstance(a, TensorLike)
     assert isinstance(b, TensorLike)
 
-    if not same_shape(a.shape, b.shape, allow_rhs_unbacked=allow_rhs_unbacked):
+    if check_sizes and not same_shape(
+        a.shape, b.shape, allow_rhs_unbacked=allow_rhs_unbacked
+    ):
         msg = f"Shapes {a.shape} and {b.shape} are not equal!"
-        raise AssertionError(msg)
+        raise MetadataMismatchError(msg)
 
     if a.dtype != b.dtype:
         msg = f"Dtypes {a.dtype} and {b.dtype} are not equal!"
-        raise AssertionError(msg)
+        raise MetadataMismatchError(msg)
 
     if a.device != b.device:
         # Handles special cuda:0 vs cuda case
@@ -168,27 +173,27 @@ def compare_tensor_meta(
             pass
         else:
             msg = f"Devices {a.device} and {b.device} are not equal!"
-            raise AssertionError(msg)
+            raise MetadataMismatchError(msg)
 
     # Stride checking is currently disabled, see https://github.com/pytorch/pytorch/issues/78050
     if check_strides:
         same_strides, idx = check_significant_strides(a, b)
         if not same_strides:
             msg = f"Stride mismatch! Strides are {a.stride()} and {b.stride()} (mismatched at {idx})!"
-            raise RuntimeError(msg)
+            raise MetadataMismatchError(msg)
 
         if a.storage_offset() != b.storage_offset():
             msg = f"Storage offset mismatch! Storage offsets are {a.storage_offset()} and {b.storage_offset()}!"
-            raise RuntimeError(msg)
+            raise MetadataMismatchError(msg)
 
     if check_conj:
         if a.is_conj() != b.is_conj():
-            raise RuntimeError(
+            raise MetadataMismatchError(
                 f"Conj mismatch! is_conj is set to {a.is_conj()} and {b.is_conj()}"
             )
 
     if a.is_neg() != b.is_neg():
-        raise RuntimeError(
+        raise MetadataMismatchError(
             f"Neg mismatch! is_neg is set to {a.is_neg()} and {b.is_neg()}"
         )
 
diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py
@@ -140,6 +140,11 @@ class UnsupportedOperatorException(RuntimeError):
     func: OpOverload
 
 
+@dataclass
+class MetadataMismatchError(RuntimeError):
+    reason: str
+
+
 def ordered_set(*items: T) -> Dict[T, Literal[True]]:
     return dict.fromkeys(items, True)
 
@@ -2031,6 +2036,11 @@ def maybe_to_real_tensor(
         def maybe_propagate_real_tensors(fake_out: T) -> T:
             import sympy
 
+            from torch._subclasses.fake_utils import (
+                _check_alias_info,
+                _check_fake_real_tensors,
+            )
+
             log.debug("maybe_propagate_real_tensors %s", func)
 
             def go(t: object, real_t: Tensor) -> None:
@@ -2057,6 +2067,33 @@ def go(t: object, real_t: Tensor) -> None:
                         assert self.shape_env is not None
                         self.shape_env.set_unbacked_var_to_val(s, int(real_t))
 
+            def _check_fake_real_vals(fake: Any, real: Any) -> None:
+                # use real values + ShapeEnv to check mismatches between potentially symbolic values
+                if isinstance(fake, (SymInt, SymFloat)):
+                    # symbolic expression, ask ShapeEnv to substitute known backed/unbacked values
+                    assert self.shape_env is not None
+                    if (
+                        not fake.node.expr.free_symbols
+                        - self.shape_env.var_to_val.keys()
+                        - self.shape_env.unbacked_var_to_val.keys()
+                    ):
+                        if (
+                            self.shape_env._maybe_evaluate_static(
+                                sympy.Eq(fake.node.expr, real), compute_hint=True
+                            )
+                            is not sympy.S.true
+                        ):
+                            raise MetadataMismatchError(
+                                f"mismatch between fake value {fake} and real value {real} "
+                            )
+                elif isinstance(
+                    fake, (int, float, bool)
+                ):  # concrete value, check direct equality
+                    if fake != real:
+                        raise MetadataMismatchError(
+                            f"mismatch between fake value {fake} and real value {real} "
+                        )
+
             if real_out is not nil:
                 if (
                     not isinstance(fake_out, Tensor)
@@ -2073,6 +2110,65 @@ def go(t: object, real_t: Tensor) -> None:
                 else:
                     tree_map_(go, fake_out, real_out)
 
+                # check fake/real alias info
+                try:
+                    _check_alias_info(
+                        "Real tensor propagation found",
+                        real_out,
+                        (real_args, real_kwargs),
+                        fake_out,
+                        (args, kwargs),
+                    )
+                except MetadataMismatchError as exc:
+                    raise MetadataMismatchError(
+                        f"Real tensor propagation found an aliasing mismatch between "
+                        f"fake output {fake_out} and real output {real_out}, "
+                        f" for func: {func}"
+                    ) from exc
+
+                # check fake/real tensor properties, sizes & output values
+                for i, (_real_out, _fake_out) in enumerate(
+                    zip(pytree.tree_leaves(real_out), pytree.tree_leaves(fake_out))
+                ):
+                    if isinstance(_fake_out, torch.Tensor):
+                        try:
+                            _check_fake_real_tensors(
+                                _fake_out,
+                                _real_out,
+                                context="Real tensor propagation found",
+                                sizes=False,  # manual check below
+                                strides=False,  # skip strides
+                                storage_offset=True,
+                                requires_grad=False,  # issues with FakeTensorConverter preserving requires_grad
+                            )
+                        except MetadataMismatchError as exc:
+                            raise MetadataMismatchError(
+                                f"Real tensor propagation found a metadata mismatch between "
+                                f"fake tensor {_fake_out} and real tensor {_real_out}, "
+                                f" at output index {i}, for func: {func}"
+                            ) from exc
+
+                        for j, (s_fake, s_real) in enumerate(
+                            zip(_fake_out.size(), _real_out.size())
+                        ):
+                            try:
+                                _check_fake_real_vals(s_fake, s_real)
+                            except MetadataMismatchError as exc:
+                                raise MetadataMismatchError(
+                                    f"Real tensor propagation found an output size mismatch between "
+                                    f"fake shape {s_fake} and real shape {s_real}, at output "
+                                    f"index {i}, dimension {j} for func: {func}"
+                                ) from exc
+                    else:
+                        try:
+                            _check_fake_real_vals(_fake_out, _real_out)
+                        except MetadataMismatchError as exc:
+                            raise MetadataMismatchError(
+                                f"Real tensor propagation found an output value mismatch between "
+                                f"fake output value {_fake_out} and real output value {_real_out}, "
+                                f" at output index {i}, for func: {func}"
+                            ) from exc
+
                 # If a data-dependent op is used in a decomposition, we
                 # may need to get the unbacked settings "early"
                 # TODO: Is this really needed?
diff --git a/torch/_subclasses/fake_utils.py b/torch/_subclasses/fake_utils.py
diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py