Fixed memory overhead and enabled Flux with Mutable Module

cehongwang · cehongwang · commit c2ae18ccd712 · 2025-03-26T07:58:44.000Z
diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
@@ -421,6 +421,7 @@ def compile(
     enable_weight_streaming: bool = _defaults.ENABLE_WEIGHT_STREAMING,
     tiling_optimization_level: str = _defaults.TILING_OPTIMIZATION_LEVEL,
     l2_limit_for_tiling: int = _defaults.L2_LIMIT_FOR_TILING,
+    offload_module_to_cpu: bool = _defaults.OFFLOAD_MODULE_TO_CPU,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -674,6 +675,7 @@ def compile(
         "enable_weight_streaming": enable_weight_streaming,
         "tiling_optimization_level": tiling_optimization_level,
         "l2_limit_for_tiling": l2_limit_for_tiling,
+        "offload_module_to_cpu": offload_module_to_cpu,
     }
 
     settings = CompilationSettings(**compilation_options)
@@ -685,7 +687,8 @@ def compile(
 
     gm = exported_program.module()
     # TODO: Memory control prototyping. Under discussion
-    exported_program.module().to("cpu")
+    if offload_module_to_cpu:
+        exported_program.module().to("cpu")
     logger.debug("Input graph: " + str(gm.graph))
 
     # Apply lowering on the graph module
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
@@ -49,6 +49,7 @@
 TILING_OPTIMIZATION_LEVEL = "none"
 L2_LIMIT_FOR_TILING = -1
 USE_DISTRIBUTED_MODE_TRACE = False
+OFFLOAD_MODULE_TO_CPU = True
 
 
 def default_device() -> Device:
diff --git a/py/torch_tensorrt/dynamo/_refit.py b/py/torch_tensorrt/dynamo/_refit.py
@@ -2,6 +2,7 @@
 
 import collections.abc
 import copy
+import gc
 import logging
 from typing import Any, List, Optional, Sequence, Tuple
 
@@ -307,6 +308,10 @@ def refit_module_weights(
         get_decompositions(settings.enable_experimental_decompositions)
     )
     new_gm = new_weight_module.module()
+    # TODO: Memory control prototyping (under discussion): move the exported module to CPU when offloading is enabled.
+    if settings.offload_module_to_cpu:
+        new_weight_module.module().to("cpu")
+
     logger.debug("Input graph: " + str(new_gm.graph))
     # Apply lowering on the graph module
 
@@ -462,12 +467,21 @@ def refit_module_weights(
                     settings=settings,
                     weight_name_map=None,
                 )
+        # TODO: Memory control prototyping (under discussion): drop the submodule reference and empty the CUDA cache when offloading is enabled.
+        if settings.offload_module_to_cpu:
+            del new_submodule
+            gc.collect()
+            torch.cuda.empty_cache()
 
         # clear EXCLUDE_WEIGHTS flag
         serialization_config = engine.create_serialization_config()
         serialization_config.clear_flag(trt.SerializationFlag.EXCLUDE_WEIGHTS)
         serialized_engine = engine.serialize_with_config(serialization_config)
 
+        del engine
+        gc.collect()
+        torch.cuda.empty_cache()
+
         if isinstance(
             compiled_submodule, (PythonTorchTensorRTModule, TorchTensorRTModule)
         ):
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
@@ -25,6 +25,7 @@
     MAX_AUX_STREAMS,
     MIN_BLOCK_SIZE,
     NUM_AVG_TIMING_ITERS,
+    OFFLOAD_MODULE_TO_CPU,
     OPTIMIZATION_LEVEL,
     PASS_THROUGH_BUILD_FAILURES,
     REFIT_IDENTICAL_ENGINE_WEIGHTS,
@@ -140,6 +141,7 @@ class CompilationSettings:
     tiling_optimization_level: str = TILING_OPTIMIZATION_LEVEL
     l2_limit_for_tiling: int = L2_LIMIT_FOR_TILING
     use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE
+    offload_module_to_cpu: bool = OFFLOAD_MODULE_TO_CPU
 
 
 _SETTINGS_TO_BE_ENGINE_INVARIANT = (
diff --git a/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py b/py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py
@@ -734,9 +734,10 @@ def run(
             builder_config, self.compilation_settings.timing_cache_path
         )
         # TODO: Memory control prototyping. Under discussion
-        del self.module
-        gc.collect()
-        torch.cuda.empty_cache()
+        if self.compilation_settings.offload_module_to_cpu:
+            del self.module
+            gc.collect()
+            torch.cuda.empty_cache()
         serialized_engine = self.builder.build_serialized_network(
             self.ctx.net, builder_config
         )
diff --git a/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py
@@ -2,17 +2,14 @@
 import logging
 from copy import deepcopy
 from enum import Enum, auto
-from typing import Any, Collection, Dict, Iterator, List, Optional, Set, Union
+from typing import Any, Dict, Iterator, Optional, Union
 
 import numpy as np
 import torch
-from torch.fx.node import Target
 from torch_tensorrt._Device import Device
-from torch_tensorrt._enums import EngineCapability, dtype
 from torch_tensorrt.dynamo import _defaults
 from torch_tensorrt.dynamo._compiler import compile as dynamo_compile
 from torch_tensorrt.dynamo._refit import refit_module_weights
-from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.utils import (
     check_output_equal,
     to_torch_device,
@@ -63,35 +60,8 @@ def __init__(
         pytorch_model: torch.nn.Module,
         *,
         device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE,
-        disable_tf32: bool = _defaults.DISABLE_TF32,
-        assume_dynamic_shape_support: bool = _defaults.ASSUME_DYNAMIC_SHAPE_SUPPORT,
-        sparse_weights: bool = _defaults.SPARSE_WEIGHTS,
-        enabled_precisions: Set[
-            Union[torch.dtype, dtype]
-        ] = _defaults.ENABLED_PRECISIONS,
-        engine_capability: EngineCapability = _defaults.ENGINE_CAPABILITY,
-        immutable_weights: bool = False,
-        debug: bool = _defaults.DEBUG,
-        num_avg_timing_iters: int = _defaults.NUM_AVG_TIMING_ITERS,
-        workspace_size: int = _defaults.WORKSPACE_SIZE,
-        dla_sram_size: int = _defaults.DLA_SRAM_SIZE,
-        dla_local_dram_size: int = _defaults.DLA_LOCAL_DRAM_SIZE,
-        dla_global_dram_size: int = _defaults.DLA_GLOBAL_DRAM_SIZE,
-        truncate_double: bool = _defaults.TRUNCATE_DOUBLE,
-        require_full_compilation: bool = _defaults.REQUIRE_FULL_COMPILATION,
-        min_block_size: int = _defaults.MIN_BLOCK_SIZE,
-        torch_executed_ops: Optional[Collection[Target]] = None,
-        torch_executed_modules: Optional[List[str]] = None,
-        pass_through_build_failures: bool = _defaults.PASS_THROUGH_BUILD_FAILURES,
-        max_aux_streams: Optional[int] = _defaults.MAX_AUX_STREAMS,
-        version_compatible: bool = _defaults.VERSION_COMPATIBLE,
-        optimization_level: Optional[int] = _defaults.OPTIMIZATION_LEVEL,
         use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
-        use_fast_partitioner: bool = _defaults.USE_FAST_PARTITIONER,
-        enable_experimental_decompositions: bool = _defaults.ENABLE_EXPERIMENTAL_DECOMPOSITIONS,
-        dryrun: bool = _defaults.DRYRUN,
-        hardware_compatible: bool = _defaults.HARDWARE_COMPATIBLE,
-        timing_cache_path: str = _defaults.TIMING_CACHE_PATH,
+        immutable_weights: bool = False,
         **kwargs: Any,
     ) -> None:
         """
@@ -154,50 +124,15 @@ def __init__(
         self.exp_program: Any = None
         self.arg_inputs: tuple[Any, ...] = tuple()
         self.kwarg_inputs: dict[str, Any] = {}
-        device = to_torch_tensorrt_device(device)
-        enabled_precisions = {dtype._from(p) for p in enabled_precisions}
+        self.additional_settings = kwargs
+        self.use_python_runtime = use_python_runtime
+        self.trt_device = to_torch_tensorrt_device(device)
         assert (
             not immutable_weights
-        ), "`immutable_weights` has to be False for a MutableTorchTensorRTModule."
-        compilation_options = {
-            "enabled_precisions": (
-                enabled_precisions
-                if enabled_precisions
-                else _defaults.ENABLED_PRECISIONS
-            ),
-            "debug": debug,
-            "device": device,
-            "assume_dynamic_shape_support": assume_dynamic_shape_support,
-            "workspace_size": workspace_size,
-            "min_block_size": min_block_size,
-            "torch_executed_ops": (
-                torch_executed_ops if torch_executed_ops is not None else set()
-            ),
-            "pass_through_build_failures": pass_through_build_failures,
-            "max_aux_streams": max_aux_streams,
-            "version_compatible": version_compatible,
-            "optimization_level": optimization_level,
-            "use_python_runtime": use_python_runtime,
-            "truncate_double": truncate_double,
-            "use_fast_partitioner": use_fast_partitioner,
-            "num_avg_timing_iters": num_avg_timing_iters,
-            "enable_experimental_decompositions": enable_experimental_decompositions,
-            "require_full_compilation": require_full_compilation,
-            "disable_tf32": disable_tf32,
-            "sparse_weights": sparse_weights,
-            "immutable_weights": immutable_weights,
-            "engine_capability": engine_capability,
-            "dla_sram_size": dla_sram_size,
-            "dla_local_dram_size": dla_local_dram_size,
-            "dla_global_dram_size": dla_global_dram_size,
-            "dryrun": dryrun,
-            "hardware_compatible": hardware_compatible,
-            "timing_cache_path": timing_cache_path,
-        }
+        ), "`immutable_weights has to be False for a MutableTorchTensorRTModule"
+
         self.arg_dynamic_shapes: Optional[tuple[Any]] = None
         self.kwarg_dynamic_shapes: Optional[dict[Any, Any]] = None
-
-        self.settings = CompilationSettings(**compilation_options)
         self.run_info: Optional[tuple[Any, ...]] = None
         self.state_dict_metadata: dict[str, torch.Size] = {}
         self._store_state_dict_metadata()
@@ -293,7 +228,7 @@ def update_refit_condition(self) -> None:
         # to determine whether refit/recompilation is needed. If the output is the same, no further process needed.
         if self.run_info:
             args, kwargs, result = self.run_info
-            self.original_model.to(to_torch_device(self.settings.device))
+            self.original_model.to(to_torch_device(self.trt_device))
             new_result = self.original_model(*args, **kwargs)
             self.original_model.cpu()
             torch.cuda.empty_cache()
@@ -325,7 +260,7 @@ def refit_gm(self) -> None:
         MutableTorchTensorRTModule automatically catches weight value updates and call this function to refit the module.
         If it fails to catch the changes, please call this function manually to update the TRT graph module.
         """
-        self.original_model.to(to_torch_device(self.settings.device))
+        self.original_model.to(to_torch_device(self.trt_device))
         if self.exp_program is None:
             self.exp_program = torch.export.export(
                 self.original_model, self.arg_inputs, kwargs=self.kwarg_inputs
@@ -356,25 +291,30 @@ def compile(self) -> None:
         If it fails to catch the changes, please call this function manually to recompile the TRT graph module.
         """
         # Export the module
-        self.original_model.to(to_torch_device(self.settings.device))
-        self.exp_program = torch.export.export(
+        self.original_model.to(to_torch_device(self.trt_device))
+        self.exp_program = torch.export._trace._export(
             self.original_model,
             self.arg_inputs,
             kwargs=self.kwarg_inputs,
             dynamic_shapes=self._get_total_dynamic_shapes(),
+            strict=False,
+            allow_complex_guards_as_runtime_asserts=True,
+            # **self.additional_settings
         )
         self.gm = dynamo_compile(
             self.exp_program,
             arg_inputs=self.arg_inputs,
             kwarg_inputs=self.kwarg_inputs,
-            **self.settings.__dict__,
+            immutable_weights=False,
+            use_python_runtime=self.use_python_runtime,
+            **self.additional_settings,
         )
         self.original_model.cpu()
         torch.cuda.empty_cache()
 
     def _validate_inputs(self, *args: Any, **kwargs: Any) -> None:
 
-        if not self.arg_inputs:
+        if not self.arg_inputs and not self.kwarg_inputs:
             logger.info("First time compilation initiated. This may take some time.")
             self.refit_state.set_state(RefitFlag.NEEDS_RECOMPILE)
             self._store_inputs(args, kwargs)
@@ -628,7 +568,7 @@ def _check_tensor_shapes_with_dynamic_shapes(
     def save(module: Any, path: str) -> None:
         # Cast the object back to MutableTorchTensorRTModule to save
         assert (
-            not module.settings.use_python_runtime
+            not module.use_python_runtime
         ), "Python runtime does not support serialization. Save failed."
         module.init_finished = False
         module.__class__ = MutableTorchTensorRTModule
@@ -658,7 +598,7 @@ def load(path: str) -> Any:
         module.pytorch_model = _make_refit_change_trigger(
             module.original_model, module.refit_state
         )
-        module.original_model.to(to_torch_device(module.settings.device))
+        module.original_model.to(to_torch_device(module.device))
         module.exp_program = torch.export.export(
             module.original_model, module.arg_inputs, kwargs=module.kwarg_inputs
         )

Original file line number	Diff line number	Diff line change
`@@ -734,9 +734,10 @@ def run(`
`734`	`734`	`builder_config, self.compilation_settings.timing_cache_path`
`735`	`735`	`)`
`736`	`736`	`# TODO: Memory control prototyping. Under discussion`
`737`		`- del self.module`
`738`		`- gc.collect()`
`739`		`- torch.cuda.empty_cache()`
	`737`	`+ if self.compilation_settings.offload_module_to_cpu:`
	`738`	`+     del self.module`
	`739`	`+     gc.collect()`
	`740`	`+     torch.cuda.empty_cache()`
`740`	`741`	`serialized_engine = self.builder.build_serialized_network(`
`741`	`742`	`self.ctx.net, builder_config`
`742`	`743`	`)`