
Commit 4242743

Changed CPU offload to default
1 parent f85820c commit 4242743

4 files changed: 8 additions and 16 deletions

py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 5 additions & 5 deletions
@@ -421,7 +421,6 @@ def compile(
     enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
-    offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -666,7 +665,6 @@ def compile(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
-        "offload_module_to_cpu": offload_module_to_cpu,
     }

     settings = CompilationSettings(**compilation_options)
@@ -677,9 +675,11 @@ def compile(
     )

     gm = exported_program.module()
-    # TODO: Memory control prototyping. Under discussion
-    if offload_module_to_cpu:
-        exported_program.module().to("cpu")
+    # Move the weights in the state_dict to CPU
+    exported_program.module().to("cpu")
+    logger.info(
+        "The model is moved to CPU during compilation. If you want to keep the model on GPU, call module.to('cuda') on the model after compilation."
+    )
     logger.debug("Input graph: " + str(gm.graph))

     # Apply lowering on the graph module
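With this change, compile() always offloads the exported module's weights to CPU while the TensorRT engine is built, instead of gating the offload behind offload_module_to_cpu. A minimal caller-side sketch of the resulting workflow (the toy model, shapes, and the torch_tensorrt.dynamo.compile entry point are illustrative assumptions, not part of this commit):

import torch
import torch_tensorrt

# Toy model and example input, for illustration only.
model = torch.nn.Linear(16, 4).eval().cuda()
inputs = [torch.randn(2, 16).cuda()]

exported_program = torch.export.export(model, tuple(inputs))

# After this commit, compile() moves the exported module's weights to CPU
# unconditionally during engine building and logs a notice about it.
trt_module = torch_tensorrt.dynamo.compile(exported_program, inputs=inputs)

# Per the new log message: if the model should stay on GPU after
# compilation, move it back explicitly.
model.to("cuda")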

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 0 additions & 1 deletion
@@ -49,7 +49,6 @@
 TILING_OPTIMIZATION_LEVEL = "none"
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
-OFFLOAD_MODULE_TO_CPU = False


 def default_device() -> Device:

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 0 additions & 2 deletions
@@ -25,7 +25,6 @@
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
     NUM_AVG_TIMING_ITERS,
-    OFFLOAD_MODULE_TO_CPU,
     OPTIMIZATION_LEVEL,
     PASS_THROUGH_BUILD_FAILURES,
     REFIT_IDENTICAL_ENGINE_WEIGHTS,
@@ -141,7 +140,6 @@ class CompilationSettings:
     tiling_optimization_level: str = TILING_OPTIMIZATION_LEVEL
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
-    offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU


 _SETTINGS_TO_BE_ENGINE_INVARIANT = (

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 3 additions & 8 deletions
@@ -44,7 +44,7 @@
     get_trt_tensor,
     to_torch,
 )
-from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, get_model_device, to_torch_device
+from torch_tensorrt.dynamo.utils import DYNAMIC_DIM, delete_module, to_torch_device
 from torch_tensorrt.fx.observer import Observer
 from torch_tensorrt.logging import TRT_LOGGER

@@ -491,8 +491,6 @@ def _save_weight_mapping(self) -> None:
         _LOGGER.info("Building weight name mapping...")
         # Stage 1: Name mapping
         torch_device = to_torch_device(self.compilation_settings.device)
-        gm_is_on_cuda = get_model_device(self.module).type == "cuda"
-        # If the model original position is on CPU, move it GPU
         sd = {
             k: v.reshape(-1).to(torch_device)
             for k, v in self.module.state_dict().items()
@@ -735,11 +733,8 @@ def run(
         self._create_timing_cache(
             builder_config, self.compilation_settings.timing_cache_path
         )
-        # TODO: Memory control prototyping. Under discussion
-        if self.compilation_settings.offload_module_to_cpu:
-            del self.module
-            gc.collect()
-            torch.cuda.empty_cache()
+
+        delete_module(self.module)
         serialized_engine = self.builder.build_serialized_network(
             self.ctx.net, builder_config
         )
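The manual cleanup (del plus gc.collect() and torch.cuda.empty_cache()) is now delegated to delete_module from torch_tensorrt.dynamo.utils, whose body is not shown in this diff. A hypothetical standalone equivalent, inferred only from the inline code it replaces and not the library's actual implementation:

import gc

import torch


def delete_module(module: torch.nn.Module) -> None:
    # Hypothetical sketch mirroring the removed inline cleanup: drop the
    # reference to the module, force a GC pass, and release cached CUDA
    # blocks so engine building has more free device memory.
    # Note: `del` only drops the local name here; memory is reclaimed once
    # no other live references to the module remain in the caller.
    del module
    gc.collect()
    torch.cuda.empty_cache()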
