
Commit b731ced

eellison authored and pytorchmergebot committed
Prologue Fusion (pytorch#134532)
This PR extends our ability to fuse pointwise nodes onto triton templates with the ability to fuse pointwise nodes into triton templates - prologue fusion.

Similar to the store_output api:

`{{store_output(("idx_m", "idx_n"), "acc", "mask")}}`

and the modification api:

```
{{ modification(
    subgraph_number=0,
    output_name="post_mod_scores",
    score="qk",
    out="qk"
) | indent_except_first(1) }}
```

we now have:

```
{{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
```

Because we now load the input with explicit indices and a mask, I needed to rewrite the mm kernel so that it no longer advances the [pointers by BLOCK_K](https://github.com/pytorch/pytorch/blob/bb03ef7acadf7bbc3287b0ada58e9476eda6e0fe/torch/_inductor/kernel/mm.py#L110-L111) on every iteration and instead computes indices from the k_idx of each loop iteration. This did not make any perf difference.

There are a couple of main use cases for prologue fusion:

- Fusing dequants into a matmul, particularly in more bandwidth-bound scenarios.
- Fusing a gather into a matmul. This is useful particularly in MoE. See pytorch#134535 for more details.

Prologue fusion is generally much less profitable than epilogue fusion, because the fused op must be applied to an element of an input on every loop iteration of the matmul, compared to only once in the epilogue (gather into matmul is a potential exception). Accordingly, we are much less aggressive about attempting prologue fusion: we only attempt it if it does not increase the number of memory bytes read inside the triton template, multiplied by a small factor to allow gathers. This blocks reliably unprofitable fusions such as an fp32->fp16 downcast inside the kernel. In a future PR we could potentially add an API for being more aggressive when we know we are in a bandwidth-bound regime. See: https://github.com/pytorch/pytorch/pull/134532/files#diff-d2539c9c8dc6a3d7e457767a880612e96d3c85752a77ead49a9e4e00a3e4c3c7R3060-R3066

Other notes:

By default we upcast to fp32 inside every kernel. This matches eager numerics. That is fine for the epilogue because it is only done once (although it is probably unnecessary for, say, a relu), but it tanks perf for the prologue. I am currently using the `codegen_upcast_to_fp32` option to avoid it, but that will not work for libdevice calls that require fp32. We will need pytorch#136778 and dtype-aware codegen to upcast fp16 ops into libdevice calls.

With prologue fusion, we now have essentially separate kernels for each input and for the output. I had to increase the number of fields that are swapped out in `set_subgraph_body` by a large number :/ I also updated the fusion logic because the inputs will have a different group than the outputs. Maybe as part of enabling multiple outputs this could get cleaned up a bit.

Pull Request resolved: pytorch#134532
Approved by: https://github.com/jansel
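For illustration, a minimal user-level sketch of the first use case, modeled on the new `test_upcast` test added in this PR. It assumes a CUDA device capable of running the Triton gemm templates; the config keys mirror the `TestPrologueFusion` setup in `test/inductor/test_max_autotune.py` below and are an assumption about one way to opt in, not the only supported combination.

```python
import torch
import torch._inductor.config as config


def foo(x, y):
    # the fp16 -> fp32 upcast is a pointwise prologue that can now be fused
    # into the Triton mm template instead of being materialized separately
    return x.to(y.dtype) @ y


x = torch.rand([64, 128], dtype=torch.float16, device="cuda")
y = torch.rand([128, 256], dtype=torch.float32, device="cuda")

# Config values mirror the new TestPrologueFusion setup in this PR.
with config.patch(
    {
        "max_autotune": True,
        "prologue_fusion": True,
        "max_autotune_gemm_backends": "TRITON",
    }
):
    out = torch.compile(foo)(x, y)

torch.testing.assert_close(out, foo(x, y), atol=0.05, rtol=0.05)
```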
1 parent ceb664a commit b731ced

File tree

16 files changed: +883 additions, -107 deletions

test/inductor/test_max_autotune.py

Lines changed: 209 additions & 0 deletions

@@ -1,4 +1,5 @@
 # Owner(s): ["module: inductor"]
+import contextlib
 import os
 import unittest
 from typing import Callable, List, Optional
@@ -980,6 +981,214 @@ def test_tuning_pool_multiple_devices(self):
         tuning_pool.terminate()


+@instantiate_parametrized_tests
+class TestPrologueFusion(TestCase):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls._stack = contextlib.ExitStack()
+        cls._stack.enter_context(
+            config.patch(
+                {
+                    "max_autotune": True,
+                    "prologue_fusion": True,
+                    "benchmark_epilogue_fusion": False,
+                    "shape_padding": False,
+                    "max_autotune_gemm_backends": "TRITON",
+                    "test_configs.max_mm_configs": 4,  # significantly speeds up tests
+                }
+            )
+        )
+
+    def check_code(self, code_str, num_kernels, num_allocs, num_deallocs):
+        FileCheck().check("def call").check_count(
+            ".run", num_kernels, exactly=True
+        ).run(code_str)
+
+        if num_allocs is not None:
+            FileCheck().check("def call").check_count(
+                "empty_strided", num_allocs, exactly=True
+            ).run(code_str)
+
+        if num_deallocs is not None:
+            FileCheck().check("def call").check_count(
+                "del", num_deallocs, exactly=True
+            ).run(code_str)
+
+    @parametrize("sizes", ((64, 128, 256), (128, 128, 128), (63, 120, 250)))
+    def test_upcast(self, sizes):
+        M, K, N = sizes
+
+        x = torch.rand([M, K], dtype=torch.float16, device="cuda")
+        y = torch.rand([K, N], dtype=torch.float, device="cuda")
+
+        def foo(x, y):
+            return x.to(y.dtype) @ y
+
+        out, code = run_and_get_code(torch.compile(foo), x, y)
+        self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
+
+    def test_downcast(self):
+        # per heuristics, dont fuse a downcast into a mm because it would lead to more reads inside kernel
+        M, K, N = (64, 128, 256)
+        x = torch.rand([M, K], dtype=torch.float, device="cuda")
+        y = torch.rand([K, N], dtype=torch.float16, device="cuda")
+
+        def foo(x, y):
+            return x.to(y.dtype) @ y
+
+        out, code = run_and_get_code(torch.compile(foo), x, y)
+        self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=2, num_allocs=2, num_deallocs=3)
+
+    @parametrize("sizes", ((64, 128, 256), (64, 64, 64), (64, 120, 64)))
+    def test_multiple_fusions(self, sizes):
+        M, K, N = sizes
+
+        def foo(x, y):
+            return ((x - 1.1) @ (y + 1.1)) * 1.1
+
+        x = torch.rand([M, K], dtype=torch.float, device="cuda")
+        y = torch.rand([K, N], dtype=torch.float, device="cuda")
+
+        out, code = run_and_get_code(torch.compile(foo), x, y)
+        self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
+
+        # check that we do not CSE any variables between prologues, epilogues
+        FileCheck().check("def triton").check_count("= 1.1", 3, exactly=True).check(
+            "tl.store"
+        ).run(code[0])
+
+    @parametrize("sizes", ((64, 128, 256), (128, 128, 128), (63, 120, 250)))
+    def test_multiple_inputs(self, sizes):
+        M, K, N = sizes
+
+        def foo(x, y, z):
+            return (x + y).to(torch.float) @ z
+
+        x = torch.rand([M, K], dtype=torch.float16, device="cuda")
+        y = torch.rand([M, K], dtype=torch.float16, device="cuda")
+        z = torch.rand([K, N], dtype=torch.float, device="cuda")
+        out_eager = foo(x, y, z)
+        out, code = run_and_get_code(torch.compile(foo), x, y, z)
+        self.assertEqual(out, out_eager, atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=3)
+
+    def test_storage_offset_prologue(self):
+        def foo(a):
+            q = a[:64, :]
+            k = a[64:, :]
+            return torch.mm(q + 2, k - 2)
+
+        inp = torch.randn(128, 64, device="cuda")
+        out, code = run_and_get_code(torch.compile(foo), inp)
+        self.assertEqual(out, foo(inp), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=1)
+
+    @config.patch(realize_reads_threshold=1, realize_opcount_threshold=1)
+    @parametrize("sizes", ((64, 128, 256), (128, 128, 128), (63, 120, 250)))
+    def test_prologue_multiple_nodes(self, sizes):
+        M, K, N = sizes
+
+        def foo(x, y):
+            return ((((x * 2) - 1) / 2) @ (y * 4)) * 3.0
+
+        x = torch.rand([M, K], dtype=torch.float, device="cuda")
+        y = torch.rand([K, N], dtype=torch.float, device="cuda")
+
+        out, code = run_and_get_code(torch.compile(foo), x, y)
+        self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
+
+    @parametrize("K", (63, 64))
+    def test_broadcast_x(self, K):
+        def foo(x, y):
+            return (x.expand([1, y.shape[0]]) + 1) @ y
+
+        x = torch.rand([1, 1], dtype=torch.float, device="cuda")
+        y = torch.rand([K, 128], dtype=torch.float, device="cuda")
+
+        out, code = run_and_get_code(torch.compile(foo, dynamic=True), x, y)
+        self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
+
+    def test_broadcast_y(self):
+        def foo(x, y):
+            return x @ y
+
+        M = 20
+        N = K = 1
+        x = torch.rand([M, K], dtype=torch.float, device="cuda")
+        y = torch.rand([K, N], dtype=torch.float, device="cuda")
+        torch._dynamo.mark_dynamic(x, 0)
+
+        out, code = run_and_get_code(torch.compile(foo, dynamic=True), x, y)
+        self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
+
+    @config.patch(realize_reads_threshold=1, realize_opcount_threshold=1)
+    @parametrize("benchmark_fusion", (True, False))
+    def test_prologue_read_into_both_inputs(self, benchmark_fusion):
+        M = K = N = 256
+
+        # not supported today. it could be, but typically the pointwise nodes would get
+        # inlined into separate nodes.
+
+        def foo(x):
+            y = (x + 1) * 2
+            return y @ (y - 2)
+
+        with config.patch(benchmark_epilogue_fusion=benchmark_fusion):
+            x = torch.rand([M, K], dtype=torch.float, device="cuda")
+
+            out, code = run_and_get_code(torch.compile(foo), x)
+            self.assertEqual(out, foo(x), atol=0.05, rtol=0.05)
+            # not guaranteed to fuse, but still checking correctness
+            if not benchmark_fusion:
+                self.check_code(
+                    code[0], num_kernels=2, num_allocs=None, num_deallocs=None
+                )
+
+    @config.patch(realize_reads_threshold=1, realize_opcount_threshold=1)
+    @config.patch(allow_buffer_reuse=False)
+    def test_mismatched_prologue_group(self):
+        def foo(x, y, z):
+            a = (x + 2) * 2
+            b = a * y
+            return b @ z
+
+        x = torch.rand([1, 256], device="cuda")
+        y = torch.rand([256, 256], device="cuda")
+        z = torch.rand([256, 128], device="cuda")
+
+        out, code = run_and_get_code(torch.compile(foo), x, y, z)
+        self.assertEqual(out, foo(x, y, z), atol=0.05, rtol=0.05)
+        # theres one more dealloc than there should be because of a buffer reuse. TODO:
+        # not sure why disabling buffer reuse doesnt stop
+        self.check_code(code[0], num_kernels=2, num_allocs=2, num_deallocs=4)
+
+    @config.patch(shape_padding=True)
+    @config.patch(force_shape_pad=True)
+    @parametrize("sizes", ((250, 245, 128), (250, 256, 128), (256, 128, 62)))
+    def test_prologue_masked_load(self, sizes):
+        M, K, N = sizes
+
+        def foo(x, y):
+            return x @ y
+
+        # cat will turn into masked load
+        # TODO - we should not attempt fusion if it turns an aligned load
+        # into an unaligned load
+        x = torch.rand([250, 245], device="cuda")
+        y = torch.rand([245, 128], device="cuda")
+
+        out, code = run_and_get_code(torch.compile(foo), x, y)
+        self.assertEqual(out, foo(x, y), atol=0.05, rtol=0.05)
+        self.check_code(code[0], num_kernels=1, num_allocs=1, num_deallocs=2)
+
+
 if __name__ == "__main__":
     from torch._inductor.utils import is_big_gpu

torch/_inductor/choices.py

Lines changed: 11 additions & 1 deletion

@@ -306,8 +306,18 @@ def score_fusion(
             abs(node1.min_order - node2.max_order),
             abs(node2.min_order - node1.max_order),
         )
+
+        # prologue fusion always last
+        if node2.is_template():
+            template_score = 0
+        else:
+            template_score = 1 + (
+                (node1.is_template() == config.epilogue_fusion_first)
+                and memory_score > 0
+            )
+
         return (
-            node1.is_template() == config.epilogue_fusion_first and memory_score > 0,
+            template_score,
             node1.is_reduction() == node2.is_reduction() and memory_score > 0,
             memory_score,
             proximity_score,

torch/_inductor/codecache.py

Lines changed: 1 addition & 1 deletion

@@ -1112,8 +1112,8 @@ def iterate_over_candidates() -> Generator[CompiledFxGraph, None, None]:
         metrics.CachedMetricsHelper.apply_deltas(graph.metrics_deltas)
         counters["inductor"] += graph.counter_deltas

-        output_code_log.debug("Output code written to: %s", artifact_path)
         output_code_log.debug("Output code: \n%s", code)
+        output_code_log.debug("Output code written to: %s", artifact_path)
         # On cache hit, use artifact path as filename
         trace_structured(
             "inductor_output_code",

torch/_inductor/codegen/common.py

Lines changed: 10 additions & 2 deletions

@@ -1,4 +1,6 @@
 # mypy: allow-untyped-defs
+from __future__ import annotations
+
 import contextlib
 import dataclasses
 import enum
@@ -18,10 +20,16 @@
     List,
     NamedTuple,
     Optional,
+    Set,
     Tuple,
+    TYPE_CHECKING,
     Union,
 )

+
+if TYPE_CHECKING:
+    from typing import Never
+
 import sympy

 import torch
@@ -1460,7 +1468,7 @@ def __init__(
         self.invalidated_stores = OrderedSet()  # type: ignore[var-annotated]
         self.varname_map = varname_map or {}

-    def invalidate(self, keep_vars: OrderedSet[str]):
+    def invalidate(self, keep_vars: Union[OrderedSet[str], Set[Never]]):
         for name, tmp in list(self.store_cache.items()):
             if tmp not in keep_vars:
                 del self.store_cache[name]
@@ -2326,7 +2334,7 @@ def maybe_append_choice(self, choices, **kwargs):
         except NotImplementedError as e:
             return e

-    def generate(self, **kwargs) -> "torch._inductor.ir.ChoiceCaller":
+    def generate(self, **kwargs) -> torch._inductor.ir.ChoiceCaller:
         """
         Generates a ChoiceCaller instance from the given arguments.
         """

torch/_inductor/codegen/cpp.py

Lines changed: 2 additions & 0 deletions

@@ -4802,10 +4802,12 @@ def codegen_template(
         self,
         template_node: BaseSchedulerNode,
         epilogue_nodes: Sequence[BaseSchedulerNode],
+        prologue_nodes: Sequence[BaseSchedulerNode],
     ):
         """
         Codegen a CPP template, possibly with fused epilogues
         """
+        assert not prologue_nodes
         counters["inductor"]["cpp_epilogue_fusion_counter"] += len(epilogue_nodes)
         assert self.is_cpp_template(
             template_node

torch/_inductor/codegen/cuda/cuda_cpp_scheduling.py

Lines changed: 1 addition & 0 deletions

@@ -82,6 +82,7 @@ def codegen_template(
         self,
         template_node: BaseSchedulerNode,
         epilogue_nodes: Sequence[BaseSchedulerNode],
+        prologue_nodes: Sequence[BaseSchedulerNode],
     ):
         """
         Codegen a CUDA template, possibly with fused epilogues

torch/_inductor/codegen/cuda_combined_scheduling.py

Lines changed: 8 additions & 5 deletions

@@ -60,20 +60,23 @@ def codegen_template(
         self,
         template_node: BaseSchedulerNode,
         epilogue_nodes: Sequence[BaseSchedulerNode],
+        prologue_nodes: Sequence[BaseSchedulerNode],
     ):
         if self._cuda_cpp_scheduling.is_cuda_cpp_template(template_node):
-            assert epilogue_nodes is None or len(epilogue_nodes) == 0
+            assert not epilogue_nodes
+            assert not prologue_nodes
             return self._cuda_cpp_scheduling.codegen_template(
-                template_node, epilogue_nodes
+                template_node, epilogue_nodes, prologue_nodes
             )
         elif self._rocm_cpp_scheduling.is_rocm_cpp_template(template_node):
-            assert epilogue_nodes is None or len(epilogue_nodes) == 0
+            assert not epilogue_nodes
+            assert not prologue_nodes
             return self._rocm_cpp_scheduling.codegen_template(
-                template_node, epilogue_nodes
+                template_node, epilogue_nodes, prologue_nodes
             )
         else:
             return self._triton_scheduling.codegen_template(
-                template_node, epilogue_nodes
+                template_node, epilogue_nodes, prologue_nodes
             )

     def codegen_node(self, node: Union[FusedSchedulerNode, SchedulerNode]):

torch/_inductor/codegen/rocm/rocm_cpp_scheduling.py

Lines changed: 1 addition & 0 deletions

@@ -77,6 +77,7 @@ def codegen_template(
         self,
         template_node: BaseSchedulerNode,
         epilogue_nodes: Sequence[BaseSchedulerNode],
+        prologue_nodes: Sequence[BaseSchedulerNode],
     ):
         """
         Codegen a ROCm template, possibly with fused epilogues
