fix: torch-TRT runtime cache attribute + standard-TRT fast refit regression (#4225)

tp5uiuc · web-flow · commit e85844a6c3cb · 2026-05-04T13:34:18.000-07:00
Signed-off-by: tejaswinp <tejaswinp@nvidia.com>
diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
@@ -12,7 +12,7 @@
 from torch.export import ExportedProgram
 from torch.fx.experimental.proxy_tensor import unset_fake_temporarily
 from torch_tensorrt._enums import dtype
-from torch_tensorrt._features import needs_refit
+from torch_tensorrt._features import ENABLED_FEATURES, needs_refit
 from torch_tensorrt._Input import Input
 from torch_tensorrt.dynamo import partitioning
 from torch_tensorrt.dynamo._exporter import inline_torch_modules
@@ -41,7 +41,6 @@
     TorchTensorRTModule,
 )
 from torch_tensorrt.dynamo.utils import (
-    CPU_DEVICE,
     check_module_output,
     check_output_equal,
     get_model_device,
@@ -199,18 +198,24 @@ def _refit_single_trt_engine_with_gm(
                     weight_dtype, weight.data_ptr(), torch.numel(weight)
                 )
                 refitter.set_named_weights(layer_name, trt_wt_tensor, trt_wt_location)
-            # Check completeness via two methods:
-            # 1. get_missing_weights(): reports weights in connected engines
-            #    that were not set.
-            # 2. Compare weights actually set vs all engine weights: catches
-            #    weights in independent engines that get_missing_weights() may not report.
+            # get_missing_weights(): reports weights in connected engines
+            # that were not set.
             missing_weights = refitter.get_missing_weights()
-            unset_weights = {w for w in weight_list if w not in mapping}
-            assert len(missing_weights) == 0 and len(unset_weights) == 0, (
-                f"Fast refitting failed due to incomplete mapping"
-                f" ({len(missing_weights)} missing,"
-                f" {len(unset_weights)} unset)"
+            assert len(missing_weights) == 0, (
+                f"Fast refit failed: refitter.get_missing_weights() reports "
+                f"{len(missing_weights)} of {len(weight_list)} engine weight(s) "
+                f"were never set."
             )
+            if ENABLED_FEATURES.tensorrt_rtx:
+                # Compare weights actually set vs all engine weights: catches
+                # weights in independent engines that get_missing_weights() may not report.
+                unset_weights = {w for w in weight_list if w not in mapping}
+                assert len(unset_weights) == 0, (
+                    f"Fast refit failed on TensorRT-RTX: {len(unset_weights)} of "
+                    f"{len(weight_list)} engine weight(s) had no entry in "
+                    f"weight_name_map. "
+                    f"Unset (showing up to 5): {sorted(unset_weights)[:5]}"
+                )
 
         else:
             mapping = construct_refit_mapping(new_gm, input_list, settings)
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -24,7 +24,8 @@
     multi_gpu_device_check,
 )
 
-import tensorrt as trt
+# must import after torch_tensorrt to resolve tensorrt_rtx alias
+import tensorrt as trt  # isort: skip
 
 logger = logging.getLogger(__name__)
 
@@ -529,6 +530,12 @@ def _load_from_state_dict(
         self.output_names = state_dict[prefix + "output_names"]
         self.target_platform = state_dict[prefix + "platform"]
 
+        # Same rationale as __setstate__: ensure these exist before
+        # setup_engine() so __del__ -> _save_runtime_cache() is safe even
+        # if a future caller invokes this without __init__ having run.
+        self.runtime_config = None
+        self.runtime_cache = None
+
         # Run multi-gpu device check to validate engine instantiation
         multi_gpu_device_check()
         self.setup_engine()
@@ -547,6 +554,12 @@ def __setstate__(self, state: Dict[str, Any]) -> None:
         self.__dict__.update(state)
         # reset after unpickling, apbose: is this required though?
         self._nccl_comm = None
+        # __getstate__ pops these; re-initialize before setup_engine() so
+        # __del__ -> _save_runtime_cache() can always read them, including
+        # on standard (non-RTX) TRT where setup_engine() does not call
+        # _setup_runtime_config().
+        self.runtime_config = None
+        self.runtime_cache = None
         self.setup_engine()
 
     def __deepcopy__(self, memo: Any) -> PythonTorchTensorRTModule: