
Commit 3adafaf

restructuring the code to include option use_distributed_mode_trace
1 parent 4b79bfb commit 3adafaf

File tree

6 files changed: +38 -36 lines changed


examples/distributed_inference/tensor_parallel_llama3.py (+1 -1)

@@ -51,7 +51,7 @@
         "use_python_runtime": True,
         "workspace_size": 1 << 33,
         "debug": False,
-        "use_aot_joint_export": False,
+        "use_distributed_mode_trace": True,
     },
     dynamic=False,
 )

examples/distributed_inference/tensor_parallel_simple_example.py (+1 -1)

@@ -74,7 +74,7 @@ def forward(self, x):
         "enabled_precisions": {torch.float32, torch.float16},
         "use_python_runtime": True,
         "min_block_size": 1,
-        "use_aot_joint_export": False,
+        "use_distributed_mode_trace": True,
     },
     dynamic=False,
 )
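For context, a minimal single-process sketch of how these options flow through torch.compile's "torch_tensorrt" backend. The toy nn.Sequential model is illustrative only and stands in for the tensor-parallel models the two examples actually compile; it assumes a CUDA device and that importing torch_tensorrt registers the backend.

import torch
import torch_tensorrt  # noqa: F401  (importing registers the "torch_tensorrt" backend)

# Toy stand-in model; the real examples compile a tensor-parallelized model across ranks.
model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.ReLU()).cuda()

compiled = torch.compile(
    model,
    backend="torch_tensorrt",
    options={
        "enabled_precisions": {torch.float32, torch.float16},
        "use_python_runtime": True,
        "min_block_size": 1,
        "use_distributed_mode_trace": True,  # replaces "use_aot_joint_export": False
    },
    dynamic=False,
)

print(compiled(torch.randn(4, 16, device="cuda")).shape)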

py/torch_tensorrt/dynamo/_defaults.py (+1 -1)

@@ -46,7 +46,7 @@
 IMMUTABLE_WEIGHTS = True
 ENABLE_WEIGHT_STREAMING = False
 ENABLE_CROSS_COMPILE_FOR_WINDOWS = False
-USE_AOT_JOINT_EXPORT = True
+USE_DISTRIBUTED_MODE_TRACE = False


 def default_device() -> Device:

py/torch_tensorrt/dynamo/_settings.py (+3 -3)

@@ -33,7 +33,7 @@
     STRIP_ENGINE_WEIGHTS,
     TIMING_CACHE_PATH,
     TRUNCATE_DOUBLE,
-    USE_AOT_JOINT_EXPORT,
+    USE_DISTRIBUTED_MODE_TRACE,
     USE_EXPLICIT_TYPING,
     USE_FAST_PARTITIONER,
     USE_FP32_ACC,
@@ -92,7 +92,7 @@ class CompilationSettings:
         enable_weight_streaming (bool): Enable weight streaming.
         enable_cross_compile_for_windows (bool): By default this is False means TensorRT engines can only be executed on the same platform where they were built.
             True will enable cross-platform compatibility which allows the engine to be built on Linux and run on Windows
-        use_aot_joint_export (bool): Use aot_export_joint_simple, else wrap backend with AOT_autograd, required for distributed tensors
+        use_distributed_mode_trace (bool): If True, wrap the backend with aot_autograd (required for tracing distributed tensors); otherwise use aot_export_joint_simple
     """

     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -133,7 +133,7 @@ class CompilationSettings:
     immutable_weights: bool = IMMUTABLE_WEIGHTS
     enable_weight_streaming: bool = ENABLE_WEIGHT_STREAMING
     enable_cross_compile_for_windows: bool = ENABLE_CROSS_COMPILE_FOR_WINDOWS
-    use_aot_joint_export: bool = USE_AOT_JOINT_EXPORT
+    use_distributed_mode_trace: bool = USE_DISTRIBUTED_MODE_TRACE


 _SETTINGS_TO_BE_ENGINE_INVARIANT = (
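A quick sketch of the new field's default and how it can be overridden, assuming the in-tree import path shown in this diff stays unchanged:

from torch_tensorrt.dynamo._settings import CompilationSettings

# Defaults to USE_DISTRIBUTED_MODE_TRACE = False from _defaults.py
print(CompilationSettings().use_distributed_mode_trace)  # False

# Opt in explicitly, e.g. when compiling DTensor / tensor-parallel models
print(CompilationSettings(use_distributed_mode_trace=True).use_distributed_mode_trace)  # True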

py/torch_tensorrt/dynamo/backend/backends.py (+30 -28)

@@ -52,33 +52,35 @@ def aot_torch_tensorrt_aten_backend(
     gm: torch.fx.GraphModule, sample_inputs: Sequence[Any], **kwargs: Any
 ) -> torch.nn.Module:
     settings, engine_cache = parse_dynamo_kwargs(kwargs)
-    if settings.use_aot_joint_export:
-        return _pretraced_backend(gm, sample_inputs, settings, engine_cache)
-    logger.debug("Wrapping the backend with aot_autograd\n")
-    _pretraced_backend_autograd = functools.partial(
-        _pretraced_backend, settings=settings, engine_cache=engine_cache
-    )
-    settings_aot_autograd = {}
-    settings_aot_autograd["decompositions"] = get_decompositions(
-        settings.enable_experimental_decompositions
-    )
-    # This is added since detach lowering leads to alias nodes
-    # Error - View operation returned a tensor that is the same as the input base tensor
-    # torch nop_decompositions in torch/_decomp/decompositions.py
-    # transpose key deleted since not desirable to lower it to permute
-    to_delete = {
-        key
-        for key in settings_aot_autograd["decompositions"]
-        if "transpose" in key._name or "detach" in key._name
-    }
-
-    for key in to_delete:
-        del settings_aot_autograd["decompositions"][key]
-
-    return aot_autograd(
-        fw_compiler=_pretraced_backend_autograd,
-        decompositions=settings_aot_autograd["decompositions"],
-    )(gm, sample_inputs)
+
+    if settings.use_distributed_mode_trace:
+        logger.debug(
+            "Wrapping the backend with aot_autograd for Distributed examples\n"
+        )
+        _pretraced_backend_autograd = functools.partial(
+            _pretraced_backend, settings=settings, engine_cache=engine_cache
+        )
+        settings_aot_autograd = {}
+        settings_aot_autograd["decompositions"] = get_decompositions(
+            settings.enable_experimental_decompositions
+        )
+        # This is added since detach lowering leads to alias nodes
+        # Error - View operation returned a tensor that is the same as the input base tensor
+        # torch nop_decompositions in torch/_decomp/decompositions.py
+        # transpose key deleted since not desirable to lower it to permute
+        to_delete = {
+            key
+            for key in settings_aot_autograd["decompositions"]
+            if "transpose" in key._name or "detach" in key._name
+        }
+        for key in to_delete:
+            del settings_aot_autograd["decompositions"][key]
+
+        return aot_autograd(
+            fw_compiler=_pretraced_backend_autograd,
+            decompositions=settings_aot_autograd["decompositions"],
+        )(gm, sample_inputs)
+    return _pretraced_backend(gm, sample_inputs, settings, engine_cache)


 def _pretraced_backend(
@@ -129,7 +131,7 @@ def _pretraced_backend(
     )

     # Invoke AOTAutograd to translate operators to aten
-    if settings.use_aot_joint_export:
+    if not settings.use_distributed_mode_trace:
         gm = aot_export_joint_simple(
             gm,
             sample_inputs,
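For reference, a standalone sketch of the aot_autograd-wrapping pattern that aot_torch_tensorrt_aten_backend follows when use_distributed_mode_trace=True. The names toy_fw_compiler and toy_backend are illustrative and not part of this commit; the sketch assumes the documented torch custom-backend helpers (aot_autograd, make_boxed_func).

import torch
from torch._dynamo.backends.common import aot_autograd
from functorch.compile import make_boxed_func


def toy_fw_compiler(gm: torch.fx.GraphModule, sample_inputs):
    # A real backend would lower `gm` (e.g. to TensorRT) here; the sketch
    # returns the traced forward unchanged, boxed as AOTAutograd expects.
    return make_boxed_func(gm.forward)


def toy_backend(gm: torch.fx.GraphModule, sample_inputs):
    # Same shape as the use_distributed_mode_trace=True branch above:
    # aot_autograd(fw_compiler=...) builds a backend callable that traces the
    # graph through AOTAutograd before handing it to the forward compiler.
    return aot_autograd(fw_compiler=toy_fw_compiler)(gm, sample_inputs)


compiled = torch.compile(torch.nn.Linear(8, 8), backend=toy_backend)
print(compiled(torch.randn(2, 8)).shape)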

tests/py/dynamo/distributed/test_nccl_ops.py (+2 -2)

@@ -17,7 +17,7 @@


 class TestGatherNcclOpsConverter(DispatchTestCase):
-    @parameterized.expand([(8)])
+    @parameterized.expand([8])
     def test_nccl_ops(self, linear_layer_dim):
         class DistributedGatherModel(nn.Module):
             def __init__(self, input_dim):
@@ -42,7 +42,7 @@ def forward(self, x):
             enable_passes=True,
         )

-    @parameterized.expand([(8)])
+    @parameterized.expand([8])
     def test_nccl_ops_scatter(self, linear_layer_dim):

         class DistributedReduceScatterModel(nn.Module):
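The decorator change is purely cosmetic: (8) is a parenthesized int, not a one-element tuple (that would be (8,)), so [(8)] and [8] both pass the single value 8 to the test. A toy illustration, independent of this test file:

import unittest

from parameterized import parameterized


class ToyParamCase(unittest.TestCase):
    # Equivalent to @parameterized.expand([(8)]); bare values are treated as
    # single-argument cases, so linear_layer_dim receives 8 either way.
    @parameterized.expand([8])
    def test_dim(self, linear_layer_dim):
        self.assertEqual(linear_layer_dim, 8)


if __name__ == "__main__":
    unittest.main()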
