
Commit 6f91720

coconutruben authored and pytorchmergebot committed
[inductor][ck] manual kBatch heuristic (pytorch#148118)
Summary: # Why Leverage kBatch parameter for large splitK examples for CK for better than ATEN performance # What replace default kBatch = 1 with a manual heuristic - if K > 16 * max (M,N) - leverage k_per_block, and K and number of SMs on the chip - upper bound to 128, lower bound to 1 This is better than defaulting to 1, cheap to calculate, and shows performance beyond ATEN This is of course subject to change and improvement Test Plan: with minor modifications to to run torch.mm on the shape `M, N, K = 2048, 2048, 524288` ``` buck2 run -c fbcode.re_gpu_tests=False mode/opt-amd-gpu fbcode//deeplearning/aot_inductor/benchmark/sampling:test_gemm_autotune_benchmark_AMD_block_0 ``` ``` AUTOTUNE mm(2048x524288, 524288x2048) rocm_ck_gemm_template_49 10.4972 ms 100.0% rocm_ck_gemm_template_8 10.6132 ms 98.9% rocm_ck_gemm_template_9 10.6907 ms 98.2% [...] mm 18.9880 ms 55.3% ``` Reviewed By: ColinPeppler Differential Revision: D70224591 Pull Request resolved: pytorch#148118 Approved by: https://github.com/ColinPeppler
1 parent 48c55a6 commit 6f91720

File tree

1 file changed

+27 -6 lines changed

torch/_inductor/codegen/rocm/ck_universal_gemm_template.py

Lines changed: 27 additions & 6 deletions
```diff
@@ -15,6 +15,7 @@
 from torch._inductor.codegen.rocm.compile_command import rocm_compile_command
 from torch._inductor.codegen.rocm.rocm_kernel import ROCmTemplateKernel
 from torch._inductor.ir import Buffer, Layout
+from torch._inductor.runtime.runtime_utils import next_power_of_2

 from ...utils import IndentedBuffer, try_import_ck_lib

@@ -876,6 +877,27 @@ def _is_rcr_f16(self):
             and Y_layout == "Row"
         )

+    # helper to calculate a potentially optimal kBatch(es) for a problem
+    def _get_kBatch(self, op):
+        # we only set a higher kBatch if K > 16 * the larger of M and N
+        # this is a hand-tuned heuristic to start
+        metas = [T.get_layout() for T in [*self.input_nodes]]
+        X_meta = metas[0]
+        W_meta = metas[1]
+        M = X_meta.size[-2]
+        K = X_meta.size[-1]
+        N = W_meta.size[-1]
+        if K < 16 * max(M, N):
+            return [1]
+        # Calculate the number of blocks needed for each dimension
+        total_k_blocks = math.ceil(K / op.k_per_block)
+        # we want to calculate how many blocks we need to fit per CU
+        cus = torch.cuda.get_device_properties(X_meta.device).multi_processor_count
+        # again, manual heuristics as much larger kBatch are significantly worse in
+        # initial testing
+        kBatch = min(max(next_power_of_2(total_k_blocks // cus), 1), 128)
+        return [kBatch]
+
     def gen_ops(self) -> list[InductorROCmOp]:
         """
         Creates a list of `CKGemmOperation` instances that match the GEMM operation this template represents.
@@ -905,14 +927,13 @@ def gen_ops(self) -> list[InductorROCmOp]:

         assert generator is not None

-        # NOTE(coconutruben): for now, we only support kBatch 1
-        # TODO(coconturuben): infer a better kBatch depending on the input shape
         # TODO(coconutruben): allow users to provide a list of kBatches to sweep over
-        kBatches = [1]
         rops = generator()
-        ops = [
-            InductorROCmOp(op=op, kBatch=kBatch) for op in rops for kBatch in kBatches
-        ]
+        ops = []
+        for o in rops:
+            kBatches = self._get_kBatch(o)
+            for kBatch in kBatches:
+                ops.append(InductorROCmOp(op=o, kBatch=kBatch))

         filtered_instances = list(filter(lambda op: self.filter_op(op), ops))
```