
Commit 1a790f5

tugsbayasgalan authored and pytorchmergebot committed
[RELAND] Error grad mode op in export API (pytorch#117420)
Summary: Title
Test Plan: CI
Differential Revision: D52706691
Pull Request resolved: pytorch#117420
Approved by: https://github.com/angelayi
1 parent d6847c5 commit 1a790f5

File tree

4 files changed: +148 −6 lines changed

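In practice, the behavior this commit adds (per the new tests below): exporting a function that changes the global grad state while grad is enabled now raises a RuntimeError, and wrapping the export call in torch.no_grad() (as the updated ONNX test does) avoids it. A minimal sketch, assuming a PyTorch build that includes this change:

# Sketch of the user-visible behavior added by this commit; assumes a PyTorch
# build that includes this change. Mirrors the new test/export/test_safeguard.py.
import torch
from torch.export import export


def f(a):
    with torch.no_grad():  # changes the global grad state during tracing
        return a + a


a = torch.randn(10)

# With grad enabled, export now fails:
# RuntimeError: Encountered autograd state manager op ... while exporting ...
try:
    with torch.enable_grad():
        export(f, (a,))
except RuntimeError as e:
    print(e)

# Exporting under torch.no_grad() succeeds: the grad-state op is a no-op there,
# so dynamo eliminates it and no state change is captured.
with torch.no_grad():
    exported_program = export(f, (a,))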

test/export/test_safeguard.py

+87
@@ -0,0 +1,87 @@
# Owner(s): ["module: dynamo"]
import unittest

import torch
import torch._dynamo as torchdynamo
from torch.export import export
from torch.testing._internal.common_utils import run_tests, TestCase


@unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't supported")
class TestSafeguard(TestCase):
    # If the autograd state doesn't change, dynamo eliminates the autograd state
    # manager op and export can succeed. Otherwise, the op is preserved in the
    # produced graph and export fails.
    def test_global_autograd(self):
        def f1(a):
            with torch.no_grad():
                b = a + a
            return b

        def f2(a):
            with torch.enable_grad():
                b = a + a
            return b

        def f3(a):
            with torch.set_grad_enabled(False):
                b = a + a
            return b

        def f4(a):
            with torch.set_grad_enabled(True):
                b = a + a
            return b

        a = torch.randn(10)
        with torch.no_grad():
            export(f1, (a,))
            export(f2, (a,))
            export(f3, (a,))
            export(f4, (a,))

        with torch.enable_grad():
            export(f2, (a,))
            export(f4, (a,))

            with self.assertRaisesRegex(
                RuntimeError, "Encountered autograd state manager op.*"
            ):
                export(f1, (a,))

            with self.assertRaisesRegex(
                RuntimeError, "Encountered autograd state manager op.*"
            ):
                export(f3, (a,))

    def test_tensor_autograd(self):
        # dynamo errors when Tensor.requires_grad_ changes the autograd state
        def f1(a):
            a.requires_grad_(True)
            b = a + a
            return b

        # dynamo errors when Tensor.requires_grad_ changes the autograd state
        def f2(a):
            a.requires_grad_(False)
            b = a + a
            return b

        # dynamo always errors on Tensor.requires_grad
        def f3(a):
            a.requires_grad = False
            b = a + a
            return b

        export(f1, (torch.randn(10, requires_grad=True),))
        export(f2, (torch.randn(10, requires_grad=False),))

        with self.assertRaises(RuntimeError):
            export(f1, (torch.randn(10, requires_grad=False),))
        with self.assertRaises(RuntimeError):
            export(f2, (torch.randn(10, requires_grad=True),))
        with self.assertRaises(RuntimeError):
            export(f3, (torch.randn(10, requires_grad=False),))


if __name__ == "__main__":
    run_tests()

test/onnx/test_fx_to_onnx.py

+6 −5
@@ -669,11 +669,12 @@ def forward(self, x):
                 return self.normal.sample(x.shape)
 
         x = torch.randn(2, 3)
-        exported_program = torch.export.export(Model(), args=(x,))
-        _ = torch.onnx.dynamo_export(
-            exported_program,
-            x,
-        )
+        with torch.no_grad():
+            exported_program = torch.export.export(Model(), args=(x,))
+            _ = torch.onnx.dynamo_export(
+                exported_program,
+                x,
+            )
 
     def test_aten_linalg_vector_norm_with_reducel2(self):
         class Net(nn.Module):

torch/export/_safeguard.py

+42
@@ -0,0 +1,42 @@
import torch
from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode
from torch.overrides import TorchFunctionMode


class AutogradStateOpsFailSafeguard(TorchFunctionMode):
    """
    Detect grad state ops while exporting the graph and fail the process by
    raising an error, to avoid unexpected behavior. Those grad mode ops could be:
    `torch.no_grad`
    `torch.enable_grad`
    `torch.set_grad_enabled`

    Export with predispatch mode is exempted.
    """

    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        unsupported_grad_mode_ops = [
            torch._C._set_grad_enabled,
        ]
        # Only enforced while tracing, confirmed by an active PROXY torch dispatch
        # mode. This allows the autograd ops to run outside of tracing.
        current_state = torch._C.is_grad_enabled()
        if func in unsupported_grad_mode_ops:
            assert len(args) == 1
            changed_state = args[0]
            mode = torch._C._get_dispatch_mode(torch._C._TorchDispatchModeKey.PROXY)
            # Check that this is not the pre_dispatch mode. It's allowed to use
            # autograd ops in pre_dispatch mode, e.g. `torch.no_grad`.
            if (
                mode
                and isinstance(mode, ProxyTorchDispatchMode)
                and not mode.pre_dispatch
                and changed_state != current_state
            ):
                raise RuntimeError(
                    f"Encountered autograd state manager op {func} trying to change global autograd state "
                    "while exporting. This is unsafe because we don't capture this op in torch.export "
                    "today, hence we can't reflect the user intention soundly."
                )
        return func(*args, **kwargs)
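For context, the safeguard relies on the standard TorchFunctionMode mechanism: while the mode is active, overridable torch-level calls are routed through __torch_function__ before they run. A minimal standalone illustration of that mechanism (it logs torch.add calls instead of checking grad-state ops, and is not part of this commit):

# Minimal illustration of the TorchFunctionMode mechanism the safeguard builds on.
# This logs intercepted calls instead of raising; it is not the safeguard itself.
import torch
from torch.overrides import TorchFunctionMode


class LogAdds(TorchFunctionMode):
    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        if func is torch.add:
            print(f"intercepted torch.add with arg types {[type(a) for a in args]}")
        # Always forward to the original function, as the safeguard does.
        return func(*args, **kwargs)


with LogAdds():
    torch.add(torch.ones(2), torch.ones(2))  # routed through __torch_function__ first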

torch/export/_trace.py

+13 −1
@@ -4,6 +4,7 @@
 import logging
 import re
 from collections import OrderedDict
+from contextlib import nullcontext
 from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import torch
@@ -30,6 +31,8 @@
 from torch.fx.graph import _PyTreeCodeGen, _PyTreeInfo
 from torch.utils._sympy.value_ranges import ValueRangeError
 
+from ._safeguard import AutogradStateOpsFailSafeguard
+
 from .dynamic_shapes import _process_constraints, Constraint
 from .exported_program import (
     _disable_prexisiting_fake_mode,
@@ -380,10 +383,19 @@ def _export_non_strict(
     transform=lambda x: x,  # TODO(zhxchen17) Revisit if this is needed later.
     pre_dispatch=False,
 ):
+    # [NOTE] If the user is exporting under training mode, we want to detect any change
+    # to the global autograd state and error out. If the user is exporting under inference
+    # mode, we don't care.
+    is_grad_enabled = torch._C.is_grad_enabled()
+    grad_safe_guard = (
+        AutogradStateOpsFailSafeguard() if is_grad_enabled else nullcontext()
+    )
     # This _reparametrize_module makes sure inputs and module.params/buffers have the same fake_mode,
     # otherwise aot_export_module will error out because it sees a mix of fake_modes.
     # And we want aot_export_module to use the fake_tensor mode in dynamo to keep the pipeline easy to reason about.
-    with torch.nn.utils.stateless._reparametrize_module(mod, fake_params_buffers):
+    with torch.nn.utils.stateless._reparametrize_module(
+        mod, fake_params_buffers
+    ), grad_safe_guard:  # type: ignore[attr-defined]
         gm, graph_signature = transform(aot_export_module)(
             mod,
             (*fake_args, *fake_kwargs.values()),
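The nullcontext() fallback keeps the with statement unconditional: when grad is globally disabled, the safeguard is simply replaced by a no-op context manager. A small generic sketch of that pattern (not export-specific):

# Generic sketch of the nullcontext pattern used above: pick a real context
# manager or a no-op one, then use a single `with` statement either way.
from contextlib import contextmanager, nullcontext


@contextmanager
def noisy_guard():
    print("guard entered")
    yield
    print("guard exited")


def run(task, guarded: bool):
    guard = noisy_guard() if guarded else nullcontext()
    with guard:
        task()


run(lambda: print("work"), guarded=True)   # guard entered / work / guard exited
run(lambda: print("work"), guarded=False)  # work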
