[user-triton] handle inline_asm_case (pytorch#148043)

sijiac · pytorchmergebot · commit 4995e058bf6c · 2025-02-28T20:52:51.000Z
Summary: We currently failed the mutation analysis for all inline_asm ops. In this diff, we handle the case when "is_pure" is set to True since it indicates the operation doesn't mutate the input value Test Plan: ../buck-out/v2/gen/fbcode/854b9ed00d28c5c5/caffe2/test/inductor/__triton_kernels__/triton_kernels.par --r test_mutations_inline_asm_kernel ``` test_mutations_inline_asm_kernel_is_pure_true (caffe2.test.inductor.test_triton_kernels.MutationTests) ... W0226 18:10:34.261000 1906801 /data/users/sijiac/fbsource/fbcode/caffe2/torch/_higher_order_ops/triton_kernel_wrap.py:656] TTIR mutation analysis: Skipping pure tt.elementwise_inline_asm op (is_pure=True) ok ---------------------------------------------------------------------- Ran 2 tests in 0.706s OK ``` Differential Revision: D69878591 Pull Request resolved: pytorch#148043 Approved by: https://github.com/zou3519
diff --git a/test/inductor/test_triton_kernels.py b/test/inductor/test_triton_kernels.py
@@ -3127,9 +3127,13 @@ def branch_with_multiple_yield_args(
             {"ptr": t, "n_elements": 4, "BLOCK_SIZE": 4},
             ["ptr"],
         ],
-        # Cant optimize since the kernel contains a tl.inline_asm_elementwise
         [
-            inline_asm_kernel,
+            inline_asm_kernel_is_pure_true,
+            {"X": t, "Y": t, "Z": t, "n": 4, "BLOCK": 4},
+            ["Z"],
+        ],
+        [
+            inline_asm_kernel_is_pure_false,
             {"X": t, "Y": t, "Z": t, "n": 4, "BLOCK": 4},
             ["X", "Y", "Z"],
         ],
diff --git a/torch/_higher_order_ops/triton_kernel_wrap.py b/torch/_higher_order_ops/triton_kernel_wrap.py
@@ -158,6 +158,9 @@ class Op:
     ret: Intermediate = dataclasses.field(repr=False)
     # used for scf.yield: see [Note: scf.yield fix-up]
     sub_idx: Optional[int] = None
+    # used for tt.elementwise_inline_asm
+    # `is_pure = True` assumes the asm block has no side-effects
+    is_pure: bool = False
 
     def __post_init__(self) -> None:
         if self.name == "tt.call":
@@ -572,14 +575,22 @@ def mlir_to_functions(op: "TritonIROperation") -> None:
                 Intermediate(operand) for operand in operand_ids
             ]
             block_ops = op_stack[parent_block_id]
+
+            is_pure = False
+            # Handle the case for tt.elementwise_inline_asm to set `is_pure` for mutation analysis
+            if name == "tt.elementwise_inline_asm":
+                is_pure = op.get_bool_attr("pure")
+
             if result_ids:
                 for result_id in result_ids:
                     res = Intermediate(result_id)
-                    block_ops[res].append(Op(name, callee, args, res))
+                    block_ops[res].append(Op(name, callee, args, res, is_pure=is_pure))
             else:
                 next_fake_intermediate -= 1
                 fake_res = Intermediate(next_fake_intermediate)
-                block_ops[fake_res].append(Op(name, callee, args, fake_res))
+                block_ops[fake_res].append(
+                    Op(name, callee, args, fake_res, is_pure=is_pure)
+                )
 
     ttir_module.walk(mlir_to_functions)
 
@@ -640,7 +651,14 @@ def analyze_kernel_mutations(
     ops = functions[fn_name]
     for op_list in ops.values():
         for op in op_list:
+            # If we encounter an operation with effects that cannot be reliably analyzed
+            # (e.g. `tt.elementwise_inline_asm`), we assume it does not mutate any input parameters.
             if op.name in UNKNOWN_OPS:
+                if op.name == "tt.elementwise_inline_asm" and op.is_pure:
+                    log.warning(
+                        "TTIR mutation analysis: Skipping pure tt.elementwise_inline_asm op (is_pure=True)"
+                    )
+                    continue
                 raise RuntimeError(
                     f"ttir analysis hit an op we do not know how to analyze: {op.name}"
                 )
diff --git a/torch/testing/_internal/triton_utils.py b/torch/testing/_internal/triton_utils.py
@@ -346,7 +346,9 @@ def double_strided_kernel(
         tl.store(out_ptr + dst_offsets, src * 2.0)
 
     @triton.jit
-    def inline_asm_kernel(X, Y, Z, n: "tl.constexpr", BLOCK: "tl.constexpr"):
+    def inline_asm_kernel_is_pure_true(
+        X, Y, Z, n: "tl.constexpr", BLOCK: "tl.constexpr"
+    ):
         x = tl.load(X + tl.arange(0, BLOCK))
         y = tl.load(Y + tl.arange(0, BLOCK))
         s = tl.full([BLOCK], n, tl.int32)
@@ -360,6 +362,23 @@ def inline_asm_kernel(X, Y, Z, n: "tl.constexpr", BLOCK: "tl.constexpr"):
         )
         tl.store(Z + tl.arange(0, BLOCK), z)
 
+    @triton.jit
+    def inline_asm_kernel_is_pure_false(
+        X, Y, Z, n: "tl.constexpr", BLOCK: "tl.constexpr"
+    ):
+        x = tl.load(X + tl.arange(0, BLOCK))
+        y = tl.load(Y + tl.arange(0, BLOCK))
+        s = tl.full([BLOCK], n, tl.int32)
+        z = tl.inline_asm_elementwise(
+            "shf.l.wrap.b32 $0, $1, $2, $3;",
+            "=r,r, r, r",
+            [x, y, s],
+            dtype=tl.int32,
+            is_pure=False,
+            pack=1,
+        )
+        tl.store(Z + tl.arange(0, BLOCK), z)
+
     @triton.jit
     def add_kernel_with_block_ptr(
         x_ptr,