
Commit 946c3c1

feat: support masked_scatter by lowering path
1 parent 26ea41e commit 946c3c1

3 files changed: +122 −2 lines changed


Diff for: py/torch_tensorrt/dynamo/conversion/impl/select.py

+2 −2

@@ -2,6 +2,7 @@
 from typing import Optional, Sequence, Union
 
 import numpy as np
+import tensorrt as trt
 import torch
 from torch.fx.node import Target
 from torch_tensorrt.dynamo._SourceIR import SourceIR
@@ -23,8 +24,6 @@
 )
 from torch_tensorrt.fx.types import TRTTensor
 
-import tensorrt as trt
-
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
 
@@ -463,6 +462,7 @@ def gather(
 ) -> TRTTensor:
     input_shape = input.shape
     dim = get_positive_dim(dim, len(input_shape))
+    index = cast_trt_tensor(ctx, index, trt.int32, name + "_cast_index_tensor")
     gather_layer = ctx.net.add_gather(input, index, axis=dim)
     gather_layer.mode = trt.GatherMode.ELEMENT
     set_layer_name(gather_layer, target, name + "_gather_layer_element", source_ir)
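Note on the cast added to the gather converter: the masked_scatter decomposition introduced in the next file builds its gather indices with aten.cumsum, which produces int64 values, while the converter hands the index tensor to TensorRT's gather layer as int32, hence the explicit cast. A minimal eager-mode sketch of the dtype involved (the tensors here are illustrative, not taken from the converter):

import torch

# Indices built the same way the decomposition below builds them
mask = torch.tensor([True, False, True, True])
gather_idx = mask.to(torch.int64).cumsum(0) - 1

print(gather_idx)        # tensor([0, 0, 1, 2])
print(gather_idx.dtype)  # torch.int64; the converter casts this to trt.int32 before add_gather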

Diff for: py/torch_tensorrt/dynamo/lowering/_decompositions.py

+40
@@ -566,6 +566,46 @@ def scaled_dot_product_cudnn_attention_decomposition(
     return attn, None, None, None, 0, 0, None, None, None
 
 
+@register_torch_trt_decomposition(
+    aten.masked_scatter, registry=TORCH_TRT_DECOMPOSITIONS
+)
+def masked_scatter_decomposition(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    source: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Performs an operation equivalent to `input[mask] = source`.
+    Steps:
+    1) Broadcast `input` and `mask` to a common shape
+    2) Flatten them
+    3) Convert `mask` to int64, compute its cumsum, and subtract 1 to get gather indices
+    4) Use `gather` to select elements from `source`
+    5) Use `torch.where` to place gathered elements where `mask` is True
+    6) Reshape the result to the original shape
+    """
+
+    # 1) Broadcast `input` and `mask` to a common shape
+    input_b, mask_b = aten.broadcast_tensors([input, mask])
+
+    # 2) Flatten the broadcasted tensors and the source tensor
+    input_flat = input_b.flatten()
+    mask_flat = mask_b.flatten()
+    source_flat = source.flatten()
+
+    # 3) Compute gather indices: (cumsum of mask as int64) - 1
+    source_idx = mask_flat.to(torch.int64).cumsum(0) - 1
+
+    # 4) Gather elements from source_flat using these indices
+    gathered = source_flat.gather(0, source_idx)
+
+    # 5) Replace positions where mask is True with gathered values, otherwise keep original
+    replaced = torch.where(mask_flat, gathered, input_flat)
+
+    # 6) Reshape the result back to the broadcasted shape
+    return replaced.view(input_b.shape)
+
+
 def get_decompositions(
     enable_experimental_decompositions: bool = False,
 ) -> Dict[OpOverload, Callable[[Any], Any]]:
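The six steps in the docstring can be reproduced eagerly on a tiny input to see why the cumsum/gather/where combination is equivalent to `input[mask] = source`. The following is a sketch with plain torch calls (not part of the commit), checked against Tensor.masked_scatter as the reference:

import torch

inp = torch.zeros(2, 3)
mask = torch.tensor([[True, False, True], [False, True, False]])
source = torch.tensor([10.0, 20.0, 30.0])

# 1) broadcast input and mask, 2) flatten everything
inp_b, mask_b = torch.broadcast_tensors(inp, mask)
inp_flat, mask_flat = inp_b.flatten(), mask_b.flatten()
source_flat = source.flatten()

# 3) cumsum(mask) - 1 gives, at every True position, the index of the source
#    element that belongs there: [0, 0, 1, 1, 2, 2]
source_idx = mask_flat.to(torch.int64).cumsum(0) - 1

# 4) gather pulls those source elements; values gathered at False positions
#    are discarded by the where in the next step
gathered = source_flat.gather(0, source_idx)

# 5) keep gathered values only where mask is True, 6) reshape back
out = torch.where(mask_flat, gathered, inp_flat).view(inp_b.shape)

print(out)                                                 # [[10., 0., 20.], [0., 30., 0.]]
print(torch.equal(out, inp.masked_scatter(mask, source)))  # True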

Diff for: tests/py/dynamo/lowering/test_decompositions.py

+80
@@ -2117,6 +2117,86 @@ def forward(self, query, key, value, attn_bias=None):
             msg="Scaled_dot_product_cudnn_attention TRT outputs don't match with the original model.",
         )
 
+    @parameterized.expand(
+        [
+            ("float32_2d", torch.float32, (4, 4)),
+            ("float16_3d", torch.float16, (2, 3, 4)),
+        ]
+    )
+    def test_masked_scatter(self, _, dtype, shape):
+        """
+        Test that masked_scatter.default is correctly decomposed into
+        (cumsum, gather, where, etc.) and that final TRT results match PyTorch.
+        """
+
+        class TestModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x, mask, source):
+                return torch.ops.aten.masked_scatter.default(x, mask, source)
+
+        x = torch.randn(*shape, dtype=dtype, device="cuda")
+
+        mask = torch.rand(*shape, device="cuda") > 0.5
+        num_trues = mask.sum().item()
+        if num_trues == 0:
+            mask[0] = True
+            num_trues = 1
+        source = torch.arange(num_trues, dtype=dtype, device="cuda")
+
+        inputs = [x, mask, source]
+
+        fx_graph = torch.fx.symbolic_trace(TestModule())
+
+        expected_ops = {
+            torch.ops.aten.where.self,
+            torch.ops.aten.gather.default,
+            torch.ops.aten.cumsum.default,
+        }
+        unexpected_ops = {torch.ops.aten.masked_scatter.default}
+
+        unexpected_ops_seen, expected_ops_unseen = lower_graph_testing(
+            fx_graph,
+            inputs,
+            expected_ops=expected_ops,
+            unexpected_ops=unexpected_ops,
+            min_block_size=1,
+        )
+
+        self.assertEqual(
+            len(unexpected_ops_seen),
+            0,
+            f"The following unexpected ops were encountered: {unexpected_ops_seen}",
+        )
+
+        self.assertEqual(
+            len(expected_ops_unseen),
+            0,
+            f"The following expected ops were not encountered: {expected_ops_unseen}",
+        )
+
+        torch._dynamo.reset()
+
+        trt_model = torch_tensorrt.compile(
+            fx_graph,
+            "torch_compile",
+            inputs,
+            min_block_size=1,
+            pass_through_build_failures=True,
+        )
+        with torch.no_grad():
+            trt_results = trt_model(*inputs).detach().cpu()
+            torch_results = fx_graph(*inputs).detach().cpu()
+
+        max_diff = float(torch.max(torch.abs(trt_results - torch_results)))
+        self.assertAlmostEqual(
+            max_diff,
+            0,
+            DECIMALS_OF_AGREEMENT,
+            f"Masked_scatter TRT outputs don't match with the original model. (diff={max_diff})",
+        )
+
 
 if __name__ == "__main__":
     run_tests()
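End to end, the decomposition plus the converter cast let a model that calls masked_scatter be compiled through the torch_compile path, as the test above exercises. A hypothetical usage sketch (module, shapes, and names are illustrative; it assumes a CUDA device and a working torch_tensorrt install, and mirrors the compile call used in the test):

import torch
import torch_tensorrt

class FillMasked(torch.nn.Module):
    def forward(self, x, mask, source):
        # traces to torch.ops.aten.masked_scatter.default, now decomposed during lowering
        return x.masked_scatter(mask, source)

x = torch.zeros(4, 4, device="cuda")
mask = torch.rand(4, 4, device="cuda") > 0.5
source = torch.arange(int(mask.sum()), dtype=torch.float32, device="cuda")

trt_model = torch_tensorrt.compile(
    FillMasked().cuda().eval(),
    "torch_compile",      # same ir string the test uses
    [x, mask, source],
    min_block_size=1,
)
print(trt_model(x, mask, source))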
