Commit 7e6fb8b

candle-core/candle-kernels: lift the limitation of 1024 for sorting on cuda
1 parent 99f6c4c commit 7e6fb8b

2 files changed: +27 -26 lines

candle-core/src/sort.rs

Lines changed: 3 additions & 1 deletion
@@ -85,9 +85,11 @@ mod cuda {
         let ncols = self.last_dim;
         let nrows = elem_count / ncols;
         let ncols_pad = next_power_of_2(ncols);
+        // Limit block dim to 1024 threads, which is the maximum on modern CUDA gpus.
+        let block_dim = ncols_pad.min(1024);
         let cfg = LaunchConfig {
             grid_dim: (nrows as u32, 1, 1),
-            block_dim: (ncols_pad as u32, 1, 1),
+            block_dim: (block_dim as u32, 1, 1),
             shared_mem_bytes: (ncols_pad * std::mem::size_of::<u32>()) as u32,
         };
         let stream = dev.cuda_stream();
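
For reference, the resulting launch configuration still sizes the shared-memory buffer for the full padded row while capping the thread count at 1024. A minimal host-side sketch of the same arithmetic, written in plain CUDA C++ rather than the crate's cudarc-based Rust (the kernel symbol asort_asc_f32 and the helper launch_argsort are hypothetical names used only for illustration):

// Sketch only: the kernel declaration stands in for the generated argsort
// entry point; the arithmetic mirrors the Rust LaunchConfig above.
#include <cstddef>
#include <cstdint>

extern "C" __global__ void asort_asc_f32(const float *x, uint32_t *dst,
                                         const int ncols, int ncols_pad);

static int next_power_of_2(int n) {
    int p = 1;
    while (p < n) p *= 2;  // smallest power of two >= n
    return p;
}

void launch_argsort(const float *x, uint32_t *dst, int nrows, int ncols) {
    const int ncols_pad = next_power_of_2(ncols);
    // At most 1024 threads per block; the kernel strides over any extra columns.
    const int block_dim = ncols_pad < 1024 ? ncols_pad : 1024;
    // Shared memory still holds one index per padded column.
    const size_t shared_mem_bytes = ncols_pad * sizeof(uint32_t);
    asort_asc_f32<<<nrows, block_dim, shared_mem_bytes>>>(x, dst, ncols, ncols_pad);
}

With the thread cap in place, the practical ceiling on row width presumably moves from the 1024-thread block limit to the per-block shared-memory budget, since the buffer still holds one index per padded column.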

candle-kernels/src/sort.cu

Lines changed: 24 additions & 25 deletions
@@ -14,40 +14,39 @@ static inline __device__ void ggml_cuda_swap(T & a, T & b) {
 template<int order, typename T>
 static __device__ void k_argsort(const T * x, uint32_t * dst, const int ncols, int ncols_pad) {
     // bitonic sort
-    int col = threadIdx.x;
     int row = blockIdx.x;
 
-    if (col >= ncols_pad) {
-        return;
-    }
-
     const T * x_row = x + row * ncols;
     extern __shared__ int dst_row[];
 
-    // initialize indices
-    dst_row[col] = col;
+    // initialize indices - each thread handles multiple elements if ncols_pad > blockDim.x
+    for (int col = threadIdx.x; col < ncols_pad; col += blockDim.x) {
+        dst_row[col] = col;
+    }
 
     __syncthreads();
 
     for (int k = 2; k <= ncols_pad; k *= 2) {
         for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (dst_row[col] >= ncols ||
-                        (dst_row[ixj] < ncols && (order == SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (dst_row[ixj] >= ncols ||
-                        (dst_row[col] < ncols && (order == SORT_ORDER_ASC ?
-                            x_row[dst_row[col]] < x_row[dst_row[ixj]] :
-                            x_row[dst_row[col]] > x_row[dst_row[ixj]]))
-                    ) {
-                        ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+            for (int col = threadIdx.x; col < ncols_pad; col += blockDim.x) {
+                int ixj = col ^ j;
+                if (ixj > col) {
+                    if ((col & k) == 0) {
+                        if (dst_row[col] >= ncols ||
+                            (dst_row[ixj] < ncols && (order == SORT_ORDER_ASC ?
+                                x_row[dst_row[col]] > x_row[dst_row[ixj]] :
+                                x_row[dst_row[col]] < x_row[dst_row[ixj]]))
+                        ) {
+                            ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                        }
+                    } else {
+                        if (dst_row[ixj] >= ncols ||
+                            (dst_row[col] < ncols && (order == SORT_ORDER_ASC ?
+                                x_row[dst_row[col]] < x_row[dst_row[ixj]] :
+                                x_row[dst_row[col]] > x_row[dst_row[ixj]]))
+                        ) {
+                            ggml_cuda_swap(dst_row[col], dst_row[ixj]);
+                        }
                     }
                 }
             }

@@ -56,7 +55,7 @@ static __device__ void k_argsort(const T * x, uint32_t * dst, const int ncols, i
     }
 
     // copy the result to dst without the padding
-    if (col < ncols) {
+    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
         dst[row * ncols + col] = dst_row[col];
     }
 }
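
The key change is the block-stride loop: instead of one thread per padded column guarded by an early return, each thread walks the columns threadIdx.x, threadIdx.x + blockDim.x, ..., so a block capped at 1024 threads still covers an arbitrarily wide padded row. A minimal, self-contained sketch of just that pattern (the helper name init_indices is made up for illustration):

// Illustration of the block-stride loop used in k_argsort above; the function
// name is hypothetical. Every thread visits the columns threadIdx.x,
// threadIdx.x + blockDim.x, ... so the block as a whole covers ncols_pad
// entries even when ncols_pad > blockDim.x.
static __device__ void init_indices(int *dst_row, int ncols_pad) {
    for (int col = threadIdx.x; col < ncols_pad; col += blockDim.x) {
        dst_row[col] = col;   // identity permutation before sorting
    }
    __syncthreads();          // all indices visible before the sort begins
}

Within a single (k, j) pass the pairs (col, col ^ j) are disjoint and each is touched by exactly one loop iteration (the one where ixj > col), so letting a thread process several pairs sequentially does not appear to change the synchronization requirements of the original one-thread-per-column version.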
