ROCm · ukannika · Oct 24, 2025 · Oct 25, 2025 · Oct 25, 2025 · Oct 25, 2025
diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json
@@ -901,5 +901,17 @@
         ],
         "verbose": "False",
         "blob_gen_cmd": "''"
+    },
+    "module_topk_per_row": {
+        "srcs": [
+            "f'{AITER_CSRC_DIR}/kernels/topk_per_row_kernels.cu'",
+            "f'{AITER_CSRC_DIR}/pybind/topk_per_row_pybind.cu'"
+        ],
+        "flags_extra_cc": [],
+        "flags_extra_hip": [],
+        "extra_ldflags": "None",
+        "extra_include": [],
+        "verbose": "False",
+        "blob_gen_cmd": "''"
     }
-}
+}
diff --git a/aiter/ops/topk.py b/aiter/ops/topk.py
@@ -194,3 +194,27 @@ def grouped_topk_torch(
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
 
     return topk_weights.to(dtypes.fp32), topk_ids.to(dtypes.i32)
+
+
+@compile_ops("module_topk_per_row")  # module name matches the entry added to optCompilerConfig.json
+def topk_per_row(  # signature-only stub; parameter names mirror the py::arg list in TOPK_PER_ROW_PYBIND
+    logits: torch.Tensor,
+    rowStarts: torch.Tensor,  # NOTE(review): presumably per-row start offsets -- confirm in kernel
+    rowEnds: torch.Tensor,  # NOTE(review): presumably per-row end offsets -- confirm in kernel
+    indices: torch.Tensor,  # non-const in the C++ declaration; presumably the output buffer
+    numRows: int,
+    stride0: int,  # NOTE(review): presumably strides of `logits` -- confirm in kernel
+    stride1: int,
+) -> None: ...
+
+
+@compile_ops("module_topk_per_row_decode")
+def topk_per_row_decode(
+    logits: torch.Tensor,
+    next_n: int,
+    seqLens: torch.Tensor,
+    indices: torch.Tensor,
+    numRows: int,
+    stride0: int,
+    stride1: int,
+) -> None: ...
diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp
@@ -1178,12 +1178,12 @@
           py::arg("mat1"),                                                         \
           py::arg("mat2"),                                                         \
           py::arg("solution_index"),                                               \
-          py::arg("bias")      = std::nullopt,                                     \
-          py::arg("out_dtype") = std::nullopt,                                     \
-          py::arg("scaleA")    = std::nullopt,                                     \
-          py::arg("scaleB")    = std::nullopt,                                     \
-          py::arg("scaleOut")  = std::nullopt,                                      \
-          py::arg("bpreshuffle")  = std::nullopt);                                     \
+          py::arg("bias")        = std::nullopt,                                   \
+          py::arg("out_dtype")   = std::nullopt,                                   \
+          py::arg("scaleA")      = std::nullopt,                                   \
+          py::arg("scaleB")      = std::nullopt,                                   \
+          py::arg("scaleOut")    = std::nullopt,                                   \
+          py::arg("bpreshuffle") = std::nullopt);                                  \
     m.def("hipb_findallsols",                                                      \
           &hipb_findallsols,                                                       \
           "hipb_findallsols",                                                      \
@@ -1220,3 +1220,23 @@
     pybind11::implicitly_convertible<int, ActivationType>();
 #define GEMM_COMMON_PYBIND \
     m.def("get_padded_m", &getPaddedM, py::arg("M"), py::arg("N"), py::arg("K"), py::arg("gl"));
+// Binds topk_per_row / topk_per_row_decode on module `m`; arg names match the aiter/ops/topk.py stubs.
+#define TOPK_PER_ROW_PYBIND      \
+    m.def("topk_per_row",        \
+          &topk_per_row,         \
+          py::arg("logits"),     \
+          py::arg("rowStarts"),  \
+          py::arg("rowEnds"),    \
+          py::arg("indices"),    \
+          py::arg("numRows"),    \
+          py::arg("stride0"),    \
+          py::arg("stride1"));   \
+    m.def("topk_per_row_decode", \
+          &topk_per_row_decode,  \
+          py::arg("logits"),     \
+          py::arg("next_n"),     \
+          py::arg("seqLens"),    \
+          py::arg("indices"),    \
+          py::arg("numRows"),    \
+          py::arg("stride0"),    \
+          py::arg("stride1"));
diff --git a/csrc/include/topk_per_row.h b/csrc/include/topk_per_row.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: MIT
+// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+
+#include <torch/extension.h>
+
+// Entry point exposed to Python as `topk_per_row` (see TOPK_PER_ROW_PYBIND).
+// `indices` is the only tensor taken by non-const reference, so it is
+// presumably the output buffer; the other tensors are read-only inputs.
+// NOTE(review): shapes, dtypes and the meaning of rowStarts/rowEnds and
+// stride0/stride1 are defined by the kernel source -- confirm there.
+void topk_per_row(const torch::Tensor& logits,
+                  const torch::Tensor& rowStarts,
+                  const torch::Tensor& rowEnds,
+                  torch::Tensor& indices,
+                  int64_t numRows,
+                  int64_t stride0,
+                  int64_t stride1);
+
+// Entry point exposed to Python as `topk_per_row_decode`; takes `next_n`
+// and `seqLens` where topk_per_row takes rowStarts/rowEnds.
+// NOTE(review): semantics of next_n/seqLens are not visible from this
+// declaration -- confirm against the kernel source.
+void topk_per_row_decode(const torch::Tensor& logits,
+                         int64_t next_n,
+                         const torch::Tensor& seqLens,
+                         torch::Tensor& indices,
+                         int64_t numRows,
+                         int64_t stride0,
+                         int64_t stride1);