
Commit 98cbd76

Fixed the comments

1 parent 92ae47d

9 files changed (+32, -52 lines)

examples/apps/flux-demo.py (+2, -12)

@@ -3,8 +3,7 @@
 import gradio as gr
 import torch
 import torch_tensorrt
-from diffusers import FluxPipeline, StableDiffusionPipeline
-from torch.export._trace import _export
+from diffusers import FluxPipeline

 DEVICE = "cuda:0"
 pipe = FluxPipeline.from_pretrained(
@@ -43,13 +42,7 @@
     "debug": False,
     "use_python_runtime": True,
     "immutable_weights": False,
-    # "cache_built_engines": True,
-    # "reuse_cached_engines": True,
-    # "timing_cache_path": "/home/engine_cache/flux.bin",
-    # "engine_cache_size": 40 * 1 << 30,
-    # "enable_weight_streaming": True,
-    # "weight_streaming_budget": 8 * 1 << 30
-    # "enable_cuda_graph": True,
+    "enable_cuda_graph": True,
 }

 trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
@@ -69,7 +62,6 @@ def generate_image(prompt, inference_step, batch_size=2):

 generate_image(["Test"], 2)
 torch.cuda.empty_cache()
-# torch_tensorrt.MutableTorchTensorRTModule.save(trt_gm, "weight_streaming_Flux.pkl")


 def model_change(model):
@@ -97,8 +89,6 @@ def load_lora(path):


 generate_image(["Test"], 2)
-# load_lora("")
-# generate_image(["A golden retriever holding a sign to code"], 2)

 # Create Gradio interface
 with gr.Blocks(title="Flux Demo with Torch-TensorRT") as demo:
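
Note: the demo now enables CUDA graphs outright rather than leaving the option commented out next to the removed cache/weight-streaming experiments. A minimal sketch of the pattern, with a toy `Linear` standing in for the Flux transformer backbone the real script compiles (the stand-in module and input shape are assumptions):

    import torch
    import torch_tensorrt

    # Toy stand-in; the demo compiles the Flux transformer backbone instead.
    backbone = torch.nn.Linear(64, 64).eval().cuda()

    settings = {
        "debug": False,
        "use_python_runtime": True,
        "immutable_weights": False,  # keep weights refittable (LoRA swaps below)
        "enable_cuda_graph": True,   # run the TRT engine through a CUDA graph
    }

    # Compilation happens lazily; the first call builds the TensorRT engine.
    trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
    out = trt_gm(torch.randn(1, 64, device="cuda"))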

examples/dynamo/refit_engine_example.py (+1)

@@ -101,6 +101,7 @@
 )

 # Check the output
+model2.to("cuda")
 expected_outputs, refitted_outputs = exp_program2.module()(*inputs), new_trt_gm(*inputs)
 for expected_output, refitted_output in zip(expected_outputs, refitted_outputs):
     assert torch.allclose(
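
Note: this one-liner is needed because refit now offloads the source module to the CPU (see the _refit.py changes below), so the example must move `model2` back to the GPU before computing reference outputs. A sketch of the comparison, reusing the example's names (`model2`, `exp_program2`, `new_trt_gm`, `inputs`); the tolerances are illustrative, not taken from the example:

    model2.to("cuda")  # refit left the PyTorch module on the CPU
    expected_outputs = exp_program2.module()(*inputs)
    refitted_outputs = new_trt_gm(*inputs)
    for expected, refitted in zip(expected_outputs, refitted_outputs):
        # Hypothetical tolerances; the real example may use different ones.
        assert torch.allclose(expected, refitted, rtol=1e-2, atol=1e-2)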

py/torch_tensorrt/dynamo/_compiler.py (-2)

@@ -422,7 +422,6 @@ def compile(
     enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
-    offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -667,7 +666,6 @@ def compile(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
-        "offload_module_to_cpu": offload_module_to_cpu,
     }

     settings = CompilationSettings(**compilation_options)
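
With the keyword removed from `compile`, existing call sites keep working as long as they never passed it. A minimal sketch of the surviving API surface, assuming the standard `torch_tensorrt.dynamo.compile(exported_program, inputs)` entry point (the toy model and inputs are placeholders):

    import torch
    import torch_tensorrt

    model = torch.nn.Linear(8, 8).eval().cuda()
    inputs = [torch.randn(1, 8, device="cuda")]

    exp_program = torch.export.export(model, tuple(inputs))
    # `offload_module_to_cpu` is no longer a named parameter; if passed, it
    # presumably falls through to **kwargs instead of being consumed.
    trt_gm = torch_tensorrt.dynamo.compile(exp_program, inputs)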

py/torch_tensorrt/dynamo/_defaults.py (-1)

@@ -49,7 +49,6 @@
 TILING_OPTIMIZATION_LEVEL = "none"
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
-OFFLOAD_MODULE_TO_CPU = True


 def default_device() -> Device:

py/torch_tensorrt/dynamo/_refit.py (+8, -14)

@@ -36,7 +36,9 @@
     TorchTensorRTModule,
 )
 from torch_tensorrt.dynamo.utils import (
+    CPU_DEVICE,
     check_module_output,
+    delete_module,
     get_model_device,
     get_torch_inputs,
     set_log_level,
@@ -314,9 +316,6 @@ def refit_module_weights(
         get_decompositions(settings.enable_experimental_decompositions)
     )
     new_gm = new_weight_module.module()
-    # TODO: Memory control prototyping. Under discussion
-    if settings.offload_module_to_cpu:
-        new_weight_module.module().to("cpu")

     logger.debug("Input graph: " + str(new_gm.graph))
     # Apply lowering on the graph module
@@ -395,7 +394,7 @@ def refit_module_weights(

     # Iterate over all components that can be accelerated
     # Generate the corresponding TRT Module for those
-
+    new_weight_module.module().to(CPU_DEVICE)
     for name, new_submodule in new_partitioned_module.named_children():
         # Refit each submodule
         # Extract engine from the submodule
@@ -498,11 +497,7 @@ def refit_module_weights(
                 settings=settings,
                 weight_name_map=None,
             )
-            # TODO: Memory control prototyping. Under discussion
-            if settings.offload_module_to_cpu:
-                del new_submodule
-                gc.collect()
-                torch.cuda.empty_cache()
+            delete_module(new_submodule)

         # clear EXCLUDE_WEIGHTS flag
         serialization_config = engine.create_serialization_config()
@@ -525,20 +520,18 @@ def refit_module_weights(
     gc.collect()
     torch.cuda.empty_cache()

-    # TODO: Memory control prototyping. Under discussion
-    if settings.offload_module_to_cpu:
-        del new_partitioned_module
-        gc.collect()
-        torch.cuda.empty_cache()
+    delete_module(new_partitioned_module)

     if verify_output and arg_inputs is not None:
+        new_gm.to(torch.cuda.current_device())
        if check_module_output(
            new_module=new_gm,
            refitted_module=compiled_module,
            arg_inputs=torch_inputs,
            kwarg_inputs=torch_kwarg_inputs,
        ):
            logger.info("Refitting Succeed!")
+            new_gm.to(CPU_DEVICE)
        else:
            if weight_name_map:
                logger.warning(
@@ -554,6 +547,7 @@ def refit_module_weights(
                     in_place=in_place,
                 )
             logger.error("Refitting Failed! The outputs do not match.")
+            new_gm.to(CPU_DEVICE)
     else:
         logger.info("Refitting Completed! Output verification skipped.")
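
Note: `delete_module` (imported from `torch_tensorrt.dynamo.utils` above) consolidates the repeated del / gc.collect / empty_cache blocks. Its body is not part of this diff; judging purely from the code it replaces, a plausible reconstruction looks like this speculative sketch:

    import gc

    import torch

    CPU_DEVICE = torch.device("cpu")  # assumed to match the imported constant

    def delete_module(module: torch.fx.GraphModule) -> None:
        """Speculative sketch: drop a module and reclaim GPU memory,
        mirroring the inline blocks this commit removes."""
        module.to(CPU_DEVICE)     # move parameters off the GPU first
        del module                # drop this (local) reference
        gc.collect()              # collect cycles that pin tensors
        torch.cuda.empty_cache()  # return freed blocks to the driver

Keep in mind that `del module` only removes the helper's local binding; the caller's own reference must also go away (as it does here when the loop rebinds `new_submodule` each iteration) before the memory is actually released.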

py/torch_tensorrt/dynamo/_settings.py (-2)

@@ -25,7 +25,6 @@
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
     NUM_AVG_TIMING_ITERS,
-    OFFLOAD_MODULE_TO_CPU,
     OPTIMIZATION_LEVEL,
     PASS_THROUGH_BUILD_FAILURES,
     REFIT_IDENTICAL_ENGINE_WEIGHTS,
@@ -141,7 +140,6 @@ class CompilationSettings:
     tiling_optimization_level: str = TILING_OPTIMIZATION_LEVEL
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
-    offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU


 _SETTINGS_TO_BE_ENGINE_INVARIANT = (
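
Together with the _compiler.py and _defaults.py changes above, this removes `offload_module_to_cpu` as a user-facing setting; the refit path now offloads unconditionally. Since `CompilationSettings` is a dataclass, dropping the field also drops the constructor keyword, so stale call sites fail loudly. A quick check, assuming it remains a plain dataclass:

    from torch_tensorrt.dynamo._settings import CompilationSettings

    settings = CompilationSettings()  # fine: the remaining defaults still apply
    # CompilationSettings(offload_module_to_cpu=True)
    #   -> TypeError: __init__() got an unexpected keyword argument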

py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py (+3, -2)

@@ -209,7 +209,7 @@ def forward(a, b, c=0, d=0):

         self.refit_state.set_state(RefitFlag.NEEDS_RECOMPILE)

-    def _get_total_dynamic_shapes(self) -> dict[str, Any] | None:
+    def _get_total_dynamic_shapes(self) -> Union[dict[str, Any], None]:
         if not self.arg_dynamic_shapes and not self.kwarg_dynamic_shapes:
             return None
         total_dynamic_shape = {}
@@ -490,7 +490,8 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
     def to(self, *args: Any, **kwargs: Any) -> None:
         logger.warning(
             "Trying to move the original PyTorch model. This will cause CPU offloading failing and increase GPU memory usage."
-            + "If this is absolute necessary, please call module.pytorch_model.to(...)"
+            + "If this is absolute necessary, please call module.pytorch_model.to(...) \n"
+            + "The model is still on the original device."
         )

     @property
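
Note: the return-annotation change is a compatibility fix, not a behavior change. PEP 604's `X | Y` syntax is only evaluated successfully at runtime on Python 3.10+, while the `Union` spelling works on older interpreters too:

    from typing import Any, Union

    # On Python 3.9 this raises TypeError at definition time, because builtin
    # generics did not support `|` until 3.10 (PEP 604):
    #     def f() -> dict[str, Any] | None: ...

    # The Union spelling is equivalent and evaluates everywhere:
    def f() -> Union[dict[str, Any], None]:
        return None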

py/torch_tensorrt/runtime/_cudagraphs.py (+2, -2)

@@ -77,8 +77,8 @@ def __enter__(self) -> torch.nn.Module | torch.fx.GraphModule:
             self.old_module = self.compiled_module.gm
             self.compiled_module.gm = get_cuda_graph_module(self.compiled_module.gm)
             return self.compiled_module
-
-        return get_cuda_graph_module(self.compiled_module)
+        else:
+            return get_cuda_graph_module(self.compiled_module)

     def __exit__(self, *args: Any) -> None:
         # Set cudagraphs back to old mode
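
Note: this rewrite replaces an implicit fall-through return with an explicit else branch; behavior is unchanged. For context, a hedged sketch of how this context manager is typically used (`trt_gm` and `inputs` are placeholders for a compiled module and its inputs):

    import torch
    import torch_tensorrt

    # Inside the `with` block, TRT execution is captured into and replayed
    # from a CUDA graph; __exit__ restores the previous mode.
    with torch_tensorrt.runtime.enable_cudagraphs(trt_gm) as cudagraphs_module:
        out = cudagraphs_module(*inputs)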
