ROCm · valarLip · Jun 26, 2025 · Jun 27, 2025 · Jul 1, 2025 · Jul 14, 2025
diff --git a/aiter/dist/custom_all_reduce_utils.py b/aiter/dist/custom_all_reduce_utils.py
@@ -1,6 +1,6 @@
 """
-* Copyright © Advanced Micro Devices, Inc. All rights reserved.
-* Copyright (c) 2024, The vLLM team.
+* Copyright (C) Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2024-2025, The vLLM team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

diff --git a/aiter/dist/utils.py b/aiter/dist/utils.py
@@ -1,6 +1,6 @@
 """
-* Copyright © Advanced Micro Devices, Inc. All rights reserved.
-* Copyright (c) 2024, The vLLM team.
+* Copyright (C) Advanced Micro Devices, Inc. All rights reserved.
+* Copyright (C) 2024-2025, The vLLM team.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.

diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json
@@ -901,5 +901,33 @@
         ],
         "verbose": "False",
         "blob_gen_cmd": "''"
+    },
+    "module_mla_metadata": {
+        "srcs": [
+            "f'{AITER_CSRC_DIR}/pybind/mla_metadata_pybind.cu'",
+            "f'{AITER_CSRC_DIR}/kernels/mla/metadata.cu'",
+            "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_comm.cuh'",
+            "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_1_device.cuh'",
+            "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_1_host.cuh'",
+            "f'{AITER_CSRC_DIR}/kernels/mla/metadata/v1_2_device.cuh'"
+        ],
+        "flags_extra_cc": [],
+        "flags_extra_hip": [],
+        "extra_ldflags": "None",
+        "extra_include": [],
+        "verbose": "False",
+        "blob_gen_cmd": "''"
+    },
+    "module_mla_reduce": {
+        "srcs": [
+            "f'{AITER_CSRC_DIR}/pybind/mla_reduce_pybind.cu'",
+            "f'{AITER_CSRC_DIR}/kernels/mla/reduce.cu'"
+        ],
+        "flags_extra_cc": [],
+        "flags_extra_hip": [],
+        "extra_ldflags": "None",
+        "extra_include": [],
+        "verbose": "False",
+        "blob_gen_cmd": "''"
     }
 }
diff --git a/aiter/mla.py b/aiter/mla.py
@@ -19,67 +19,80 @@ def _fwd_kernel_stage2_asm(
     O,
     qo_indptr,
     kv_indptr,
+    num_kv_splits_indptr,
     stride_mid_ob,
     stride_mid_oh,
     stride_mid_os,
     stride_obs,
     stride_oh,
-    bs,
-    nheads,
-    max_seqlen_q,
-    NUM_KV_SPLITS: tl.constexpr,
+    MAYBE_FINAL_OUT: tl.constexpr,
+    BATCH_NUM: tl.constexpr,
     BLOCK_DV: tl.constexpr,
     Lv: tl.constexpr,
     mgc: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
-    cur_qo_offs = tl.program_id(2)
-
     cur_qo_start = tl.load(qo_indptr + cur_batch)
     cur_qo_end = tl.load(qo_indptr + cur_batch + 1)
-    cur_qo = cur_qo_start + cur_qo_offs
-    if cur_qo > cur_qo_end:
-        return
+    cur_split_start = tl.load(num_kv_splits_indptr + cur_batch)
+    cur_split_end = tl.load(num_kv_splits_indptr + cur_batch + 1)
+    num_max_kv_splits = tl.load(num_kv_splits_indptr + BATCH_NUM)
     cur_kv_seq_len = tl.load(kv_indptr + cur_batch + 1) - tl.load(kv_indptr + cur_batch)
 
     offs_d = tl.arange(0, BLOCK_DV)
     mask_d = offs_d < Lv
 
-    e_sum = 0.0
-    e_max = -float("inf")
-    acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
-
-    offs_v = (cur_qo * stride_mid_ob + cur_head * stride_mid_oh) * Lv + offs_d
-    offs_logic = cur_qo * stride_mid_ob + cur_head * stride_mid_oh
-
-    for split_kv_id in range(0, NUM_KV_SPLITS):
-        kv_len_per_split = tl.maximum(mgc, tl.cdiv(cur_kv_seq_len, NUM_KV_SPLITS))
-        split_kv_start = kv_len_per_split * split_kv_id
-        split_kv_end = tl.minimum(split_kv_start + kv_len_per_split, cur_kv_seq_len)
-
-        if split_kv_end > split_kv_start:
-            tv = tl.load(
-                Mid_O + offs_v + split_kv_id * stride_mid_os * Lv,
+    offs_logic = cur_qo_start * stride_mid_ob + cur_head * stride_mid_oh
+    offs_v = offs_logic * Lv + offs_d
+    num_valid_kv_splits = tl.minimum(
+        cur_split_end - cur_split_start, tl.cdiv(cur_kv_seq_len, mgc)
+    )
+    FINAL_OUT = MAYBE_FINAL_OUT and num_max_kv_splits == BATCH_NUM
+
+    for cur_qo in range(cur_qo_start, cur_qo_end):
+        if FINAL_OUT:
+            input_ptr = Mid_O.to(tl.pointer_type(O.type.element_ty))
+            out = tl.load(
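+                # NOTE(review): supersedes "input_ptr + offs_v + stride_mid_ob * Lv"; cur_qo is scaled by stride_mid_os below, not stride_mid_ob -- confirm intended.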
+                input_ptr
+                + Lv * (cur_qo * stride_mid_os + cur_head * stride_mid_oh)
+                + offs_d,
                 mask=mask_d,
                 other=0.0,
             )
-            tlogic = tl.load(Mid_lse + offs_logic + split_kv_id * stride_mid_os)
-            n_e_max = tl.maximum(tlogic, e_max)
-
-            old_scale = tl.exp(e_max - n_e_max)
-            acc *= old_scale
-            exp_logic = tl.exp(tlogic - n_e_max)
-            acc += exp_logic * tv
-
-            e_sum = e_sum * old_scale + exp_logic
-            e_max = n_e_max
-
-    tl.store(
-        O + cur_qo * stride_obs + cur_head * stride_oh + offs_d,
-        acc / e_sum,
-        mask=mask_d,
-    )
+            tl.store(
+                O + cur_qo * stride_obs + cur_head * stride_oh + offs_d,
+                out,
+                mask=mask_d,
+            )
+        else:
+            e_sum = 0.0
+            e_max = -float("inf")
+            acc = tl.zeros([BLOCK_DV], dtype=tl.float32)
+            for split_kv_id in range(0, num_valid_kv_splits):
+                tv = tl.load(
+                    Mid_O + offs_v + split_kv_id * stride_mid_os * Lv,
+                    mask=mask_d,
+                    other=0.0,
+                )
+                tlogic = tl.load(Mid_lse + offs_logic + split_kv_id * stride_mid_os)
+                n_e_max = tl.maximum(tlogic, e_max)
+
+                old_scale = tl.exp(e_max - n_e_max)
+                acc *= old_scale
+                exp_logic = tl.exp(tlogic - n_e_max)
+                acc += exp_logic * tv
+
+                e_sum = e_sum * old_scale + exp_logic
+                e_max = n_e_max
+            offs_logic += stride_mid_ob
+            offs_v += stride_mid_ob * Lv
+            tl.store(
+                O + cur_qo * stride_obs + cur_head * stride_oh + offs_d,
+                acc / e_sum,
+                mask=mask_d,
+            )
 
 
 @functools.lru_cache()
@@ -100,7 +113,6 @@ def get_meta_param(num_kv_splits, bs, total_kv, nhead, max_seqlen_q):
             for i in range(1, 17)
         ]
         num_kv_splits = sorted(tmp, key=lambda x: x[0], reverse=True)[0][1]
-        # num_kv_splits = min(16, max(1, cu_num // bs))
 
     get_mgc = {16: 16, 128: 16}
 
@@ -123,6 +135,15 @@ def mla_decode_fwd(
     sm_scale=None,  # 1.0 / (qk_head_dim**0.5)
     logit_cap=0.0,
     num_kv_splits=None,  # for experts only!!!
+    num_kv_splits_indptr=None,  # for experts only!!!
+    work_meta_data=None,
+    work_indptr=None,
+    work_info_set=None,
+    reduce_indptr=None,
+    reduce_final_map=None,
+    reduce_partial_map=None,
+    q_scale=None,
+    kv_scale=None,
 ):
     device = q.device
     assert logit_cap <= 0, f"{logit_cap=} is not support yet"
@@ -134,9 +155,14 @@ def mla_decode_fwd(
     bs = qo_indptr.shape[0] - 1
     total_kv = kv_indices.shape[0]
 
-    num_kv_splits, mgc = get_meta_param(
-        num_kv_splits, bs, total_kv, nhead, max_seqlen_q
-    )
+    if num_kv_splits_indptr is None and work_meta_data is None:
+        num_kv_splits, mgc = get_meta_param(None, bs, total_kv, nhead, max_seqlen_q)
+        num_kv_splits_indptr = torch.arange(
+            0, (bs + 1) * num_kv_splits, num_kv_splits, dtype=torch.int, device=device
+        )
+
+    if num_kv_splits is None:
+        num_kv_splits = get_cu_num()
 
     if nhead == 16 and max_seqlen_q == 1:
         # special case for 16 heads and max_seqlen_q == 1
@@ -145,22 +171,72 @@ def mla_decode_fwd(
             dtype=dtypes.fp32,
             device=device,
         )
+        MAYBE_FINAL_OUT = False
     elif nhead in [16, 128]:
-        logits = (
-            o.view((total_s, num_kv_splits, nhead, v_head_dim))
-            if num_kv_splits == 1
-            else torch.empty(
-                (total_s, num_kv_splits, nhead, v_head_dim),
-                dtype=dtypes.fp32,
-                device=device,
-            )
+        MAYBE_FINAL_OUT = True
+        logits = torch.empty(
+            (total_s, num_kv_splits, nhead, v_head_dim),
+            dtype=dtypes.fp32,
+            device=device,
         )
     else:
         assert False, f"{nhead=} not supported"
 
     attn_lse = torch.empty(
         (total_s, num_kv_splits, nhead, 1), dtype=dtypes.fp32, device=device
     )
+    final_lse = torch.empty((total_s, nhead), dtype=dtypes.fp32, device=device)
+
+    if num_kv_splits_indptr is not None:
+        aiter.mla_decode_stage1_asm_fwd(
+            q,
+            kv_buffer,
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            kv_last_page_lens,
+            num_kv_splits_indptr,
+            None,
+            None,
+            None,
+            max_seqlen_q,
+            sm_scale,
+            logits,
+            attn_lse,
+            o,
+            q_scale,
+            kv_scale,
+        )
+
+        # if num_kv_splits == 1 and not (max_seqlen_q == 1 and nhead == 16):
+        #     return logits.view(total_s, nhead, v_head_dim), attn_lse
+        Lv = v_head_dim
+        BLOCK_DV = triton.next_power_of_2(Lv)
+        grid = (bs, nhead)
+        extra_kargs = {"waves_per_eu": 4}
+
+        _fwd_kernel_stage2_asm[grid](
+            logits,
+            attn_lse,
+            o,
+            qo_indptr,
+            kv_indptr,
+            num_kv_splits_indptr,
+            attn_lse.stride(0),
+            attn_lse.stride(2),
+            attn_lse.stride(1),
+            o.stride(0),
+            o.stride(1),
+            MAYBE_FINAL_OUT=MAYBE_FINAL_OUT,
+            BATCH_NUM=bs,
+            BLOCK_DV=BLOCK_DV,
+            Lv=Lv,
+            mgc=mgc,
+            num_warps=4,
+            num_stages=2,
+            **extra_kargs,
+        )
+        return logits, final_lse
 
     aiter.mla_decode_stage1_asm_fwd(
         q,
@@ -169,41 +245,30 @@ def mla_decode_fwd(
         kv_indptr,
         kv_indices,
         kv_last_page_lens,
+        num_kv_splits_indptr,
+        work_meta_data,
+        work_indptr,
+        work_info_set,
         max_seqlen_q,
         sm_scale,
         logits,
         attn_lse,
+        o,
+        q_scale,
+        kv_scale,
     )
 
-    if num_kv_splits == 1 and not (max_seqlen_q == 1 and nhead == 16):
-        return logits.view(total_s, nhead, v_head_dim), attn_lse
-    Lv = v_head_dim
-    BLOCK_DV = triton.next_power_of_2(Lv)
-    grid = (bs, nhead, max_seqlen_q)
-    extra_kargs = {"waves_per_eu": 4}
-    _fwd_kernel_stage2_asm[grid](
+    aiter.mla_reduce_v1(
         logits,
         attn_lse,
+        reduce_indptr,
+        reduce_final_map,
+        reduce_partial_map,
         o,
-        qo_indptr,
-        kv_indptr,
-        attn_lse.stride(0),
-        attn_lse.stride(2),
-        attn_lse.stride(1),
-        o.stride(0),
-        o.stride(1),
-        bs,
-        nhead,
-        max_seqlen_q,
-        NUM_KV_SPLITS=num_kv_splits,
-        BLOCK_DV=BLOCK_DV,
-        Lv=Lv,
-        mgc=mgc,
-        num_warps=4,
-        num_stages=2,
-        **extra_kargs,
+        final_lse,
     )
-    return logits, attn_lse
+
+    return logits, final_lse
 
 
 def mla_prefill_fwd(

diff --git a/aiter/ops/activation.py b/aiter/ops/activation.py
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 from torch import Tensor
 from ..jit.core import compile_ops