Skip to content

Commit 1bcb34c

Browse files
committed
Added CPU offloading
1 parent 557bec3 commit 1bcb34c

File tree

4 files changed

+20
-18
lines changed

4 files changed

+20
-18
lines changed

py/torch_tensorrt/dynamo/_compiler.py

+6-9
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,7 @@ def compile(
421421
enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
422422
tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
423423
l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
424+
offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
424425
**kwargs: Any,
425426
) -> torch.fx.GraphModule:
426427
"""Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -550,15 +551,6 @@ def compile(
550551
"`immutable_weights` must be False when `refit_identical_engine_weights` is True."
551552
)
552553

553-
if (
554-
not immutable_weights
555-
and not refit_identical_engine_weights
556-
and enable_weight_streaming
557-
):
558-
raise ValueError(
559-
"TensorRT's `REFIT` flag is not compatible with `enable_weight_streaming=True` for now. This issue was reported on https://github.com/pytorch/TensorRT/issues/3305"
560-
)
561-
562554
if (
563555
"enable_cross_compile_for_windows" in kwargs.keys()
564556
and kwargs["enable_cross_compile_for_windows"]
@@ -674,6 +666,7 @@ def compile(
674666
"enable_weight_streaming": enable_weight_streaming,
675667
"tiling_optimization_level": tiling_optimization_level,
676668
"l2_limit_for_tiling": l2_limit_for_tiling,
669+
"offload_module_to_cpu": offload_module_to_cpu,
677670
}
678671

679672
settings = CompilationSettings(**compilation_options)
@@ -684,6 +677,9 @@ def compile(
684677
)
685678

686679
gm = exported_program.module()
680+
# TODO: Memory control prototyping. Under discussion
681+
if offload_module_to_cpu:
682+
exported_program.module().to("cpu")
687683
logger.debug("Input graph: " + str(gm.graph))
688684

689685
# Apply lowering on the graph module
@@ -820,6 +816,7 @@ def contains_metadata(gm: torch.fx.GraphModule) -> bool:
820816
trt_modules = {}
821817
# Iterate over all components that can be accelerated
822818
# Generate the corresponding TRT Module for those
819+
823820
for name, _ in partitioned_module.named_children():
824821
submodule = getattr(partitioned_module, name)
825822
# filter on the GraphModule

py/torch_tensorrt/dynamo/_defaults.py

+1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
TILING_OPTIMIZATION_LEVEL = "none"
5050
L2_LIMIT_FOR_TILING = -1
5151
USE_DISTRIBUTED_MODE_TRACE = False
52+
OFFLOAD_MODULE_TO_CPU = False
5253

5354

5455
def default_device() -> Device:

py/torch_tensorrt/dynamo/_settings.py

+2
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
MAX_AUX_STREAMS,
2626
MIN_BLOCK_SIZE,
2727
NUM_AVG_TIMING_ITERS,
28+
OFFLOAD_MODULE_TO_CPU,
2829
OPTIMIZATION_LEVEL,
2930
PASS_THROUGH_BUILD_FAILURES,
3031
REFIT_IDENTICAL_ENGINE_WEIGHTS,
@@ -140,6 +141,7 @@ class CompilationSettings:
140141
tiling_optimization_level: str = TILING_OPTIMIZATION_LEVEL
141142
l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
142143
use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
144+
offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
143145

144146

145147
_SETTINGS_TO_BE_ENGINE_INVARIANT = (

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -488,14 +488,12 @@ def _save_weight_mapping(self) -> None:
488488
# Stage 1: Name mapping
489489
torch_device = to_torch_device(self.compilation_settings.device)
490490
gm_is_on_cuda = get_model_device(self.module).type == "cuda"
491-
if not gm_is_on_cuda:
492-
# If the model original position is on CPU, move it GPU
493-
sd = {
494-
k: v.reshape(-1).to(torch_device)
495-
for k, v in self.module.state_dict().items()
496-
}
497-
else:
498-
sd = {k: v.reshape(-1) for k, v in self.module.state_dict().items()}
491+
# If the model's original location is the CPU, move it to the GPU
492+
sd = {
493+
k: v.reshape(-1).to(torch_device)
494+
for k, v in self.module.state_dict().items()
495+
}
496+
499497
weight_name_map: dict[str, Any] = {}
500498
np_map = {}
501499
constant_mapping = {}
@@ -733,7 +731,11 @@ def run(
733731
self._create_timing_cache(
734732
builder_config, self.compilation_settings.timing_cache_path
735733
)
736-
734+
# TODO: Memory control prototyping. Under discussion
735+
if self.compilation_settings.offload_module_to_cpu:
736+
del self.module
737+
gc.collect()
738+
torch.cuda.empty_cache()
737739
serialized_engine = self.builder.build_serialized_network(
738740
self.ctx.net, builder_config
739741
)

0 commit comments

Comments
 (0)