Add argpartition functionality to shortfin.array API (#1063)

stbaione · web-flow · commit 97d29ef9d704 · 2025-03-11T11:17:20.000-05:00
This implements `argpartition` on the `shortfin.array` API, using
`xtensor`.

Argpartition is a partial-sorting (selection) algorithm that returns indices:
the indices to the left of position k are guaranteed to refer to the k smallest
elements along an axis, and the indices to the right of k refer to larger values.

Use a positive k-value to make the first k indices along an axis refer to
the k smallest elements,
or a negative k-value to make the last |k| indices along an axis refer to
the |k| largest elements.

Note that those top-k indices are not guaranteed to be in sorted order.
diff --git a/shortfin/python/array_host_ops.cc b/shortfin/python/array_host_ops.cc
@@ -148,6 +148,33 @@ Implemented for dtypes: float16, float32.
   A device_array of dtype=int64, allocated on the host and not visible to the device.
 )";
 
+// Python-facing docstring for the `argpartition` host op bound in
+// BindArrayHostOps.
+static const char DOCSTRING_ARGPARTITION[] =
+    R"(Returns the indices that partition `input` along `axis` around position `k`.
+
+Similar to `numpy.argpartition`:
+
+- If `k` is non-negative, the first `k` positions along `axis` hold the indices
+  of the `k` smallest values, while all larger values occupy positions to the
+  right of `k`.
+- If `k` is negative, it counts from the end. For example, `k = -3` means the
+  last 3 positions along `axis` hold the indices of the 3 largest values, while
+  all smaller values occupy positions to the left of that boundary.
+
+The indices on either side of the boundary are not guaranteed to be sorted.
+
+Implemented for dtypes: float16, bfloat16, float32.
+
+Args:
+  input: An input array.
+  k: Position along `axis` to partition around. Must be in the range [-N, N),
+    where N is the size of `input` along `axis`.
+  axis: Axis along which to partition. Defaults to the last axis. Unlike numpy,
+    partitioning the flattened array (`axis=None`) is not supported.
+  out: Array to write into. If specified, it must have an expected shape and
+    int64 dtype.
+  device_visible: Whether to make the result array visible to devices. Only
+    used when `out` is not specified. Defaults to False.
+
+Returns:
+  A device_array of dtype=int64 holding the partition indices. If `out` is
+  specified it is written to and returned; otherwise a new array is allocated
+  on the host, visible to devices only if `device_visible` is True.
+)";
+
 static const char DOCSTRING_CONVERT[] =
     R"(Does an elementwise conversion from one dtype to another.
 
@@ -795,6 +822,53 @@ void BindArrayHostOps(py::module_ &m) {
       py::kw_only(), py::arg("keepdims") = false,
       py::arg("device_visible") = false, DOCSTRING_ARGMAX);
 
+  // Host implementation of `argpartition`; the Python-facing contract is
+  // described by DOCSTRING_ARGPARTITION. Raises std::invalid_argument for an
+  // out-of-range `axis` or `k`, a non-int64 `out`, or an unsupported input
+  // dtype.
+  m.def(
+      "argpartition",
+      [](device_array &input, int k, int axis, std::optional<device_array> out,
+         bool device_visible) {
+        SHORTFIN_TRACE_SCOPE_NAMED("PyHostOp::argpartition");
+        // Normalize `axis` in a signed 64-bit domain so that negative values
+        // never mix with the unsigned rank. The caller's original value is
+        // kept untouched for the error message.
+        const auto rank = static_cast<int64_t>(input.shape().size());
+        const int64_t norm_axis = axis < 0 ? axis + rank : axis;
+        if (norm_axis < 0 || norm_axis >= rank) {
+          throw std::invalid_argument(
+              fmt::format("Axis out of range: Must be [-{}, {}) but got {}",
+                          rank, rank, axis));
+        }
+        // Simulate numpy's negative `k` behavior for max argpartition
+        const auto axis_extent = static_cast<int64_t>(
+            input.shape()[static_cast<std::size_t>(norm_axis)]);
+        const int64_t kth = k < 0 ? k + axis_extent : k;
+        if (kth < 0 || kth >= axis_extent) {
+          throw std::invalid_argument(
+              fmt::format("K out of range: Must be [-{}, {}) but got {}",
+                          axis_extent, axis_extent, k));
+        }
+        // NOTE(review): only the dtype of `out` is validated here; shape
+        // agreement is left to the xtensor assignment below - confirm that is
+        // intended.
+        if (out && (out->dtype() != DType::int64())) {
+          throw std::invalid_argument("out array must have dtype=int64");
+        }
+        // `compute` is referenced by name from SF_UNARY_FUNCTION_CASE (defined
+        // outside this view; assumed to invoke compute<EltTy>() for the
+        // matching dtype), so the name must not change.
+        auto compute = [&]<typename EltTy>() {
+          auto input_t = input.map_xtensor<EltTy>();
+          auto result = xt::argpartition(*input_t, static_cast<std::size_t>(kth),
+                                         /*axis=*/norm_axis);
+          // Allocate the destination lazily, only when the caller gave none.
+          if (!out) {
+            out.emplace(device_array::for_host(input.device(), result.shape(),
+                                               DType::int64(), device_visible));
+          }
+          auto out_t = out->map_xtensor_w<int64_t>();
+          *out_t = result;
+          return *out;
+        };
+
+        switch (input.dtype()) {
+          SF_UNARY_FUNCTION_CASE(float16, half_float::half);
+          SF_UNARY_FUNCTION_CASE(bfloat16, bfloat16_t);
+          SF_UNARY_FUNCTION_CASE(float32, float);
+          default:
+            throw std::invalid_argument(
+                fmt::format("Unsupported dtype({}) for operator argpartition",
+                            input.dtype().name()));
+        }
+      },
+      py::arg("input"), py::arg("k"), py::arg("axis") = -1,
+      py::arg("out") = py::none(), py::arg("device_visible") = false,
+      DOCSTRING_ARGPARTITION);
+
   // Random number generation.
   py::class_<PyRandomGenerator>(m, "RandomGenerator")
       .def(py::init<std::optional<PyRandomGenerator::SeedType>>(),
diff --git a/shortfin/python/shortfin/array/__init__.py b/shortfin/python/shortfin/array/__init__.py
@@ -47,6 +47,7 @@
 
 # Ops.
 argmax = _sfl.array.argmax
+argpartition = _sfl.array.argpartition
 add = _sfl.array.add
 ceil = _sfl.array.ceil
 convert = _sfl.array.convert
@@ -99,6 +100,7 @@
     # Ops.
     "add",
     "argmax",
+    "argpartition",
     "ceil",
     "convert",
     "divide",
diff --git a/shortfin/src/shortfin/array/dims.h b/shortfin/src/shortfin/array/dims.h
@@ -108,6 +108,7 @@ class SHORTFIN_API InlinedDims {
       return p != other.p;
     }
     constexpr reference operator*() { return *p; }
+    // Subscript access: yields the element `d` positions away from the one the
+    // iterator currently points at, without moving the iterator.
+    constexpr reference operator[](difference_type d) const { return *(p + d); }
     constexpr const_iterator operator+(difference_type d) const {
       return const_iterator(p + d);
     }
diff --git a/shortfin/tests/api/array_ops_test.py b/shortfin/tests/api/array_ops_test.py
@@ -4,9 +4,10 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-import array
 import math
+from typing import List
 import pytest
+import random
 
 import shortfin as sf
 import shortfin.array as sfnp
@@ -112,6 +113,189 @@ def test_argmax_dtypes(device, dtype):
     sfnp.argmax(src)
 
 
+@pytest.mark.parametrize(
+    "k,axis",
+    [
+        # Min sort, default axis
+        [3, None],
+        # Min sort, axis=-1
+        [20, -1],
+        # Max sort, default axis
+        [-3, None],
+        # Max sort, axis=-1
+        [-20, -1],
+    ],
+)
+def test_argpartition(device, k, axis):
+    """Check that `argpartition` picks the k smallest (k > 0) or |k| largest (k < 0) values.
+
+    The source holds a shuffled copy of 0.0..127.0. The indices returned in the
+    first `k` (or last `|k|`) positions are mapped back to values and compared,
+    order-insensitively, with the matching end of the unshuffled data.
+    """
+    src = sfnp.device_array(device, [1, 1, 128], dtype=sfnp.float32)
+    # `data` is ascending, so its first/last k entries are the k smallest/largest.
+    data = [float(i) for i in range(math.prod([1, 1, 128]))]
+    randomized_data = data[:]
+    random.shuffle(randomized_data)
+    src.items = randomized_data
+
+    # `axis is None` exercises the binding's default axis argument.
+    result = (
+        sfnp.argpartition(src, k) if axis is None else sfnp.argpartition(src, k, axis)
+    )
+
+    assert result.shape == src.shape
+
+    expected_values = data[:k] if k >= 0 else data[k:]
+
+    # Same leading/trailing window, applied to the result's last dimension.
+    k_slice = slice(0, k) if k >= 0 else slice(k, None)
+
+    indices = result.view(0, 0, k_slice).items.tolist()
+    values = [randomized_data[index] for index in indices]
+    # Compare as sorted lists: order within the partition is not checked.
+    assert sorted(values) == sorted(expected_values)
+
+
+def test_argpartition_out_variant(device):
+    """Check that passing `out=` yields the same indices as letting the op allocate."""
+    k, axis = -3, -1
+    src = sfnp.device_array(device, [1, 1, 128], dtype=sfnp.float32)
+    data = [float(i) for i in range(math.prod(src.shape))]
+
+    randomized_data = data[:]
+    random.shuffle(randomized_data)
+    src.items = randomized_data
+
+    # Pre-allocated int64 destination with the same shape as the source.
+    output_array = sfnp.device_array(device, src.shape, dtype=sfnp.int64)
+    result_out = sfnp.argpartition(src, k, axis, out=output_array)
+    result_no_out = sfnp.argpartition(src, k, axis)
+
+    assert result_out.shape == src.shape
+    out_items = result_out.items.tolist()
+    no_out_items = result_no_out.items.tolist()
+    # Both calls see the same input, so the index layouts are compared exactly.
+    assert out_items == no_out_items
+
+
+def test_argpartition_axis0(device):
+    """Check `argpartition` along axis 0 (per column) of a 3x4 matrix with k=2."""
+
+    def _get_top_values_by_col_indices(
+        indices: List[int], data: List[List[int]], k: int
+    ) -> List[List[int]]:
+        """Obtain the top-k values from our matrix, using column indices.
+
+        For this test, we partition by column (axis == 0). This is just some
+        helper logic to obtain the values from the original matrix, given
+        the column indices.
+
+        Args:
+            indices (List[int]): Flattened indices from `sfnp.argpartition`
+            data (List[List[int]]): Matrix containing original values.
+            k (int): Specify top-k values to select.
+
+        Returns:
+            List[List[int]]: Top-k values for each column.
+        """
+        num_cols = len(data[0])
+
+        top_values_by_col = []
+
+        for c in range(num_cols):
+            # Collect the row indices for the first k entries in column c.
+            # The indexing treats `indices` as a row-major flattening of the result.
+            col_row_idxs = [indices[r * num_cols + c] for r in range(k)]
+
+            # Map those row indices into actual values in `data`.
+            col_values = [data[row_idx][c] for row_idx in col_row_idxs]
+
+            top_values_by_col.append(col_values)
+
+        return top_values_by_col
+
+    def _get_top_values_by_sorting(
+        data: List[List[float]], k: int
+    ) -> List[List[float]]:
+        """Get the k smallest values for each column in the matrix, using sorting.
+
+        This is just to obtain a comparison for our `argpartition` testing.
+
+        Args:
+            data (List[List[float]]): Matrix of data.
+            k (int): Specify top-k values to select.
+
+        Returns:
+            List[List[float]]: Top-k values for each column.
+        """
+        num_rows = len(data)
+        num_cols = len(data[0])
+
+        top_values_by_col = []
+
+        for c in range(num_cols):
+            # Extract the entire column 'c' into a list
+            col = [data[r][c] for r in range(num_rows)]
+            # Sort the column in ascending order
+            col_sorted = sorted(col)
+            # The first k elements are the k smallest
+            col_k_smallest = col_sorted[:k]
+            top_values_by_col.append(col_k_smallest)
+
+        return top_values_by_col
+
+    k, axis = 2, 0
+    src = sfnp.device_array(device, [3, 4], dtype=sfnp.float32)
+    # Each row is an independently shuffled permutation of range(num_cols).
+    data = [[i for i in range(src.shape[-1])] for _ in range(src.shape[0])]
+    for i in range(len(data)):
+        random.shuffle(data[i])
+
+    # Fill the device array one row at a time.
+    for i in range(src.shape[0]):
+        src.view(i).items = data[i]
+
+    result = sfnp.argpartition(src, k, axis)
+    assert result.shape == src.shape
+
+    expected_values = _get_top_values_by_sorting(data, k)
+    top_values = _get_top_values_by_col_indices(result.items.tolist(), data, k)
+    # Order within each column's partition is not checked, so compare sorted.
+    for result, expected in zip(top_values, expected_values):
+        assert sorted(result) == sorted(expected)
+
+
+def test_argpartition_error_cases(device):
+    # Invalid `input` dtype
+    with pytest.raises(
+        ValueError,
+    ):
+        src = sfnp.device_array(device, [1, 1, 16], dtype=sfnp.int64)
+        sfnp.argpartition(src, 0)
+
+    src = sfnp.device_array(device, [1, 1, 16], dtype=sfnp.float32)
+    data = [float(i) for i in range(math.prod(src.shape))]
+    src.items = data
+
+    # Invalid `axis`
+    with pytest.raises(
+        ValueError,
+    ):
+        sfnp.argpartition(src, 1, 3)
+        sfnp.argpartition(src, 1, -4)
+
+    # Invalid `k`
+    with pytest.raises(
+        ValueError,
+    ):
+        sfnp.argpartition(src, 17)
+        sfnp.argpartition(src, -17)
+
+    # Invalid `out` dtype
+    with pytest.raises(
+        ValueError,
+    ):
+        out = sfnp.device_array(device, src.shape, dtype=sfnp.float32)
+        sfnp.argpartition(src, 2, -1, out)
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        sfnp.bfloat16,
+        sfnp.float16,
+        sfnp.float32,
+    ],
+)
+def test_argpartition_dtypes(device, dtype):
+    """Smoke test: `argpartition` runs for each listed dtype without raising.
+
+    The array contents are never set and the result is not inspected; only the
+    call itself is exercised.
+    """
+    src = sfnp.device_array(device, [4, 16, 128], dtype=dtype)
+    sfnp.argpartition(src, 0)
+
+
 @pytest.mark.parametrize(
     "dtype",
     [

Original file line number	Diff line number	Diff line change
`@@ -108,6 +108,7 @@ class SHORTFIN_API InlinedDims {`
`108`	`108`	`return p != other.p;`
`109`	`109`	`}`
`110`	`110`	`constexpr reference operator*() { return *p; }`
	`111`	`+ constexpr reference operator[](difference_type d) const { return *(p + d); }`
`111`	`112`	`constexpr const_iterator operator+(difference_type d) const {`
`112`	`113`	`return const_iterator(p + d);`
`113`	`114`	`}`