Commit d2aaa44

adding test cases
1 parent 66861f4 commit d2aaa44

3 files changed: +99 -3 lines changed

Diff for: py/torch_tensorrt/dynamo/conversion/impl/nccl_ops.py (+2 -3)

@@ -3,12 +3,11 @@
 from typing import Optional, Tuple, Union

 import numpy as np
+import tensorrt as trt
 from torch.fx.node import Argument, Target
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
 from torch_tensorrt.fx.converters.converter_utils import SourceIR, set_layer_name

-import tensorrt as trt
-

 # class for AllReduce
 class AllReduceStrategy(IntEnum):
@@ -94,7 +93,7 @@ def nccl_reduce_scatter(
         "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32
     )

-    p_dtype = trt.float16
+    p_dtype = trt.float32
     pf_dtype = trt.PluginField(
         "type_id", np.array([int(p_dtype)], np.int32), trt.PluginFieldType.INT32
     )
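The functional change in this file is the type_id plugin field for the reduce-scatter plugin: the converter now advertises float32 instead of float16 (moving import tensorrt as trt up with the other imports is purely cosmetic). Below is a minimal sketch of how these two fields would typically be assembled into a trt.PluginFieldCollection before creating the plugin; the helper name is illustrative, not the actual nccl_ops.py code.

import numpy as np
import tensorrt as trt


def build_reduce_scatter_fields(group):
    # "group": the ranks participating in the collective, as an int32 array.
    p_group = trt.PluginField(
        "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32
    )
    # "type_id": the tensor dtype the plugin operates in; after this commit
    # the converter requests float32 rather than float16.
    p_dtype = trt.float32
    pf_dtype = trt.PluginField(
        "type_id", np.array([int(p_dtype)], np.int32), trt.PluginFieldType.INT32
    )
    return trt.PluginFieldCollection([p_group, pf_dtype])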

Diff for: tests/py/dynamo/conversion/harness.py (+13)

@@ -351,6 +351,7 @@ def generate_graph(
         enable_passes: bool,
         propagate_shapes: bool = False,
         settings: CompilationSettings = CompilationSettings(),
+        fuse_distributed_ops: bool = False,
         torch_export_dynamic_shapes: Optional[Any] = None,
     ):
         mod = mod.eval()
@@ -366,6 +367,16 @@
             tuple(torch_export_inputs),
             dynamic_shapes=torch_export_dynamic_shapes,
         )
+        if fuse_distributed_ops:
+            exported_program = exported_program.run_decompositions(
+                get_decompositions(False)
+            )
+            from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
+                fuse_distributed_ops,
+            )
+
+            gm = exported_program.graph_module
+            gm = fuse_distributed_ops(gm, settings)
         if enable_passes:
             exported_program = pre_export_lowering(exported_program, settings)
             exported_program = exported_program.run_decompositions(
@@ -404,6 +415,7 @@ def run_test(
         propagate_shapes=False,
         int32_reqd=False,
         immutable_weights=True,
+        fuse_distributed_ops=False,
     ):
         # TODO: lan to remove this and set use_dynamo_traccer to True by default
         # once all the converter test files are moved to use_dynamo_tracer
@@ -424,6 +436,7 @@
             enable_passes=enable_passes,
             propagate_shapes=propagate_shapes,
             settings=compilation_settings,
+            fuse_distributed_ops=fuse_distributed_ops,
         )

         num_inputs = len(inputs)
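This change threads a new fuse_distributed_ops flag from run_test into generate_graph, which then runs the fuse_distributed_ops lowering pass over the exported graph before conversion. The real pass lives in torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops and is not shown in this commit; the sketch below only illustrates the general shape of such a pass, pairing a functional collective with its wait_tensor and rewriting the pair into a single call. The fused target fused_all_gather is a hypothetical stand-in, not the node the real pass emits.

import torch
from torch.fx import GraphModule


def fused_all_gather(tensor, world_size, group_name):
    # Hypothetical fused target: gather + wait in one call, standing in for
    # whatever single node the real fuse_distributed_ops pass produces.
    out = torch.ops._c10d_functional.all_gather_into_tensor(
        tensor, world_size, group_name
    )
    return torch.ops._c10d_functional.wait_tensor(out)


def fuse_all_gather_wait(gm: GraphModule) -> GraphModule:
    # Rewrite wait_tensor(all_gather_into_tensor(x, ...)) -> fused_all_gather(x, ...).
    for node in list(gm.graph.nodes):
        if node.op != "call_function":
            continue
        if node.target != torch.ops._c10d_functional.wait_tensor.default:
            continue
        producer = node.args[0]
        if (
            producer.op == "call_function"
            and producer.target
            == torch.ops._c10d_functional.all_gather_into_tensor.default
        ):
            with gm.graph.inserting_after(node):
                fused = gm.graph.call_function(fused_all_gather, args=producer.args)
            node.replace_all_uses_with(fused)
            gm.graph.erase_node(node)
            if not producer.users:
                gm.graph.erase_node(producer)
    gm.graph.lint()
    gm.recompile()
    return gm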

Diff for: tests/py/dynamo/distributed/test_nccl_ops.py (+84, new file)

@@ -0,0 +1,84 @@
+import os
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from parameterized import parameterized
+from torch.testing._internal.common_utils import run_tests
+
+
+def set_environment_variables():
+    os.environ["WORLD_SIZE"] = str(1)
+    os.environ["RANK"] = str(0)
+    os.environ["MASTER_ADDR"] = "127.0.0.1"
+    os.environ["MASTER_PORT"] = str(29500)
+    os.environ["USE_TRTLLM_PLUGINS"] = "1"
+
+
+set_environment_variables()
+dist.init_process_group(backend="nccl", init_method="env://")
+group = dist.new_group(ranks=[0])
+group_name = group.group_name
+world_size = 1
+
+from conversion.harness import DispatchTestCase
+
+
+class TestGatherNcclOpsConverter(DispatchTestCase):
+    @parameterized.expand([(8)])
+    def test_nccl_ops(self, linear_layer_dim):
+        class DistributedGatherModel(nn.Module):
+            def __init__(self, input_dim):
+                super().__init__()
+                self.fc = torch.nn.Linear(input_dim, input_dim)
+
+            def forward(self, x):
+                x = self.fc(x)
+                gathered_tensor = torch.ops._c10d_functional.all_gather_into_tensor(
+                    x, world_size, group_name
+                )
+                gathered_tensor = torch.ops._c10d_functional.wait_tensor(
+                    gathered_tensor
+                )
+                return gathered_tensor
+
+        inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
+        self.run_test(
+            DistributedGatherModel(linear_layer_dim).cuda(),
+            inputs,
+            use_dynamo_tracer=True,
+            fuse_distributed_ops=True,
+        )
+
+    @parameterized.expand([(8)])
+    def test_nccl_ops_scatter(self, linear_layer_dim):
+
+        class DistributedReduceScatterModel(nn.Module):
+            def __init__(self, input_dim):
+                super().__init__()
+                self.fc = torch.nn.Linear(input_dim, input_dim)
+
+            def forward(self, x):
+                x = self.fc(x)
+                scatter_reduce_tensor = (
+                    torch.ops._c10d_functional.reduce_scatter_tensor(
+                        x, "sum", world_size, group_name
+                    )
+                )
+                scatter_reduce_tensor = torch.ops._c10d_functional.wait_tensor(
+                    scatter_reduce_tensor
+                )
+                return scatter_reduce_tensor
+
+        inputs = [torch.zeros(1, linear_layer_dim).to("cuda")]
+
+        self.run_test(
+            DistributedReduceScatterModel(linear_layer_dim).cuda(),
+            inputs,
+            use_dynamo_tracer=True,
+            fuse_distributed_ops=True,
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
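The new test file pins WORLD_SIZE to 1 and initializes a single-rank NCCL process group at import time, so the converter tests run in one process on one GPU. A quick eager sanity sketch (assuming the same environment-variable setup and process group as in the file above): with a single rank both collectives are effectively identities, which is what lets run_test compare the TensorRT output directly against the eager PyTorch output.

import torch

# Assumes set_environment_variables(), dist.init_process_group(...) and
# group_name from the test module above have already run on this process.
x = torch.randn(1, 8).cuda()

gathered = torch.ops._c10d_functional.all_gather_into_tensor(x, 1, group_name)
gathered = torch.ops._c10d_functional.wait_tensor(gathered)
assert torch.equal(gathered, x)  # gathering from a single rank returns the input

reduced = torch.ops._c10d_functional.reduce_scatter_tensor(x, "sum", 1, group_name)
reduced = torch.ops._c10d_functional.wait_tensor(reduced)
assert torch.equal(reduced, x)  # summing over a single rank returns the input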
