Commit 46a7381

Extract thread ID computation from GPU kernels

This PR extracts most thread ID calculations into separate helper functions for improved overflow and type safety. Related PR: #464

2 parents be79c28 + c171a00 commit 46a7381
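
Background on the overflow issue: threadIdx.x, blockDim.x, and blockIdx.x are all unsigned 32-bit built-ins, so the common pattern threadIdx.x + blockDim.x * blockIdx.x is evaluated in 32-bit arithmetic and silently wraps once a grid addresses more than 2^32 elements. The new helpers widen one operand to the target index type before multiplying. A minimal sketch of the difference (copy_kernel and its parameters are hypothetical; only the index arithmetic mirrors the change):

#include <cstdint>

__global__ void copy_kernel(std::int64_t n, const int *__restrict__ in,
                            int *__restrict__ out)
{
    // Old pattern: every operand is a 32-bit unsigned built-in, so
    // blockDim.x * blockIdx.x wraps around at 2^32.
    // const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;

    // New pattern (what thread::get_thread_id_flat() encapsulates):
    // widening one operand promotes the whole expression to 64 bits.
    const auto tidx =
        threadIdx.x + static_cast<std::int64_t>(blockDim.x) * blockIdx.x;
    if (tidx < n) {
        out[tidx] = in[tidx];
    }
}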

55 files changed: +241, -165 lines

common/components/prefix_sum.hpp.inc

Lines changed: 2 additions & 2 deletions

@@ -50,7 +50,7 @@ __global__ __launch_bounds__(block_size) void start_prefix_sum(
     size_type num_elements, ValueType *__restrict__ elements,
     ValueType *__restrict__ block_sum)
 {
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto tidx = thread::get_thread_id_flat();
     const auto element_id = threadIdx.x;
     __shared__ size_type prefix_helper[block_size];
     prefix_helper[element_id] =
@@ -113,7 +113,7 @@ __global__ __launch_bounds__(block_size) void finalize_prefix_sum(
     size_type num_elements, ValueType *__restrict__ elements,
     const ValueType *__restrict__ block_sum)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto tidx = thread::get_thread_id_flat();

     if (tidx < num_elements) {
         ValueType prefix_block_sum = zero<ValueType>();

common/components/reduction.hpp.inc

Lines changed: 1 addition & 1 deletion

@@ -142,7 +142,7 @@ __device__ void reduce_array(size_type size,
                              ValueType *__restrict__ result,
                              Operator reduce_op = Operator{})
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto tidx = thread::get_thread_id_flat();
     auto thread_result = zero<ValueType>();
     for (auto i = tidx; i < size; i += blockDim.x * gridDim.x) {
         thread_result = reduce_op(thread_result, source[i]);

common/components/thread_ids.hpp.inc

Lines changed: 77 additions & 0 deletions

@@ -192,4 +192,81 @@ __device__ __forceinline__ size_type get_thread_id()
 {
     return get_subwarp_id<subwarp_size, warps_per_block>() * subwarp_size +
            threadIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the thread in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @return the global ID of the thread in the given index type.
+ *
+ * @tparam IndexType  the index type
+ */
+template <typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_thread_id_flat()
+{
+    return threadIdx.x + static_cast<IndexType>(blockDim.x) * blockIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of threads in the given index type.
+ * This function assumes one-dimensional thread and block indexing.
+ *
+ * @return the total number of threads in the given index type.
+ *
+ * @tparam IndexType  the index type
+ */
+template <typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_thread_num_flat()
+{
+    return blockDim.x * static_cast<IndexType>(gridDim.x);
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the global ID of the subwarp in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @return the global ID of the subwarp in the given index type.
+ *
+ * @tparam subwarp_size  the size of the subwarp. Must be a power of two!
+ * @tparam IndexType  the index type
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_subwarp_id_flat()
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return threadIdx.x / subwarp_size +
+           static_cast<IndexType>(blockDim.x / subwarp_size) * blockIdx.x;
+}
+
+
+/**
+ * @internal
+ *
+ * Returns the total number of subwarps in the given index type.
+ * This function assumes one-dimensional thread and block indexing
+ * with a power of two block size of at least subwarp_size.
+ *
+ * @return the total number of subwarps in the given index type.
+ *
+ * @tparam subwarp_size  the size of the subwarp. Must be a power of two!
+ * @tparam IndexType  the index type
+ */
+template <int subwarp_size, typename IndexType = size_type>
+__device__ __forceinline__ IndexType get_subwarp_num_flat()
+{
+    static_assert(!(subwarp_size & (subwarp_size - 1)),
+                  "subwarp_size must be a power of two");
+    return blockDim.x / subwarp_size * static_cast<IndexType>(gridDim.x);
 }
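
The two flat helpers pair naturally in a grid-stride loop: get_thread_id_flat() gives the starting index and get_thread_num_flat() the stride, both already in the requested index type. A sketch of a caller (scale_array is a hypothetical kernel; the thread:: helpers are the ones added above):

template <typename IndexType, typename ValueType>
__global__ void scale_array(IndexType n, ValueType factor,
                            ValueType *__restrict__ data)
{
    // Start at this thread's flat ID and advance by the total number of
    // threads, so any grid size covers all n elements.
    const auto begin = thread::get_thread_id_flat<IndexType>();
    const auto stride = thread::get_thread_num_flat<IndexType>();
    for (auto i = begin; i < n; i += stride) {
        data[i] *= factor;
    }
}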

common/components/zero_array.hpp.inc

Lines changed: 1 addition & 2 deletions

@@ -38,8 +38,7 @@ template <typename ValueType>
 __global__ __launch_bounds__(default_block_size) void zero_array(
     size_type n, ValueType *__restrict__ array)
 {
-    const auto tidx =
-        static_cast<size_type>(blockDim.x) * blockIdx.x + threadIdx.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < n) {
         array[tidx] = zero<ValueType>();
     }

common/factorization/par_ilu_kernels.hpp.inc

Lines changed: 16 additions & 20 deletions

@@ -93,18 +93,17 @@ __global__
     IndexType *__restrict__ elements_to_add_per_row,
     bool *__restrict__ changes_required)
 {
-    const auto total_thread_count =
-        static_cast<size_type>(blockDim.x) * gridDim.x / SubwarpSize;
-    const auto tidx =
-        threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
-    const auto begin_row = static_cast<IndexType>(tidx / SubwarpSize);
+    const auto total_subwarp_count =
+        thread::get_subwarp_num_flat<SubwarpSize, IndexType>();
+    const auto begin_row =
+        thread::get_subwarp_id_flat<SubwarpSize, IndexType>();

     auto thread_block = group::this_thread_block();
     auto subwarp_grp = group::tiled_partition<SubwarpSize>(thread_block);
     const auto subwarp_idx = subwarp_grp.thread_rank();

     bool local_change{false};
-    for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
+    for (auto row = begin_row; row < num_rows; row += total_subwarp_count) {
         if (row >= num_cols) {
             if (subwarp_idx == 0) {
                 elements_to_add_per_row[row] = 0;
@@ -145,17 +144,16 @@ __global__
     const IndexType *__restrict__ row_ptrs_addition)
 {
     // Precaution in case not enough threads were created
-    const auto total_thread_count =
-        static_cast<size_type>(blockDim.x) * gridDim.x / SubwarpSize;
-    const auto tidx =
-        threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
-    const auto begin_row = static_cast<IndexType>(tidx / SubwarpSize);
+    const auto total_subwarp_count =
+        thread::get_subwarp_num_flat<SubwarpSize, IndexType>();
+    const auto begin_row =
+        thread::get_subwarp_id_flat<SubwarpSize, IndexType>();

     auto thread_block = group::this_thread_block();
     auto subwarp_grp = group::tiled_partition<SubwarpSize>(thread_block);
     const auto subwarp_idx = subwarp_grp.thread_rank();

-    for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
+    for (auto row = begin_row; row < num_rows; row += total_subwarp_count) {
         const IndexType old_row_start{old_row_ptrs[row]};
         const IndexType old_row_end{old_row_ptrs[row + 1]};
         const IndexType new_row_start{old_row_start + row_ptrs_addition[row]};
@@ -223,12 +221,10 @@ __global__ __launch_bounds__(default_block_size) void update_row_ptrs(
     IndexType num_rows, IndexType *__restrict__ row_ptrs,
     IndexType *__restrict__ row_ptr_addition)
 {
-    const auto total_thread_count =
-        static_cast<size_type>(blockDim.x) * gridDim.x;
-    const auto begin_row =
-        threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
+    const auto total_thread_count = thread::get_thread_num_flat<IndexType>();
+    const auto begin_row = thread::get_thread_id_flat<IndexType>();

-    for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
+    for (auto row = begin_row; row < num_rows; row += total_thread_count) {
         row_ptrs[row] += row_ptr_addition[row];
     }
 }
@@ -241,7 +237,7 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
     const ValueType *__restrict__ values, IndexType *__restrict__ l_nnz_row,
     IndexType *__restrict__ u_nnz_row)
 {
-    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
+    const auto row = thread::get_thread_id_flat<IndexType>();
     if (row < num_rows) {
         IndexType l_row_nnz{};
         IndexType u_row_nnz{};
@@ -266,7 +262,7 @@ __global__ __launch_bounds__(default_block_size) void initialize_l_u(
     const IndexType *__restrict__ u_row_ptrs,
     IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values)
 {
-    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
+    const auto row = thread::get_thread_id_flat<IndexType>();
     if (row < num_rows) {
         auto l_idx = l_row_ptrs[row];
         auto u_idx = u_row_ptrs[row];
@@ -298,7 +294,7 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
     const IndexType *__restrict__ u_row_ptrs,
     const IndexType *__restrict__ u_col_idxs, ValueType *__restrict__ u_values)
 {
-    const auto elem_id = blockDim.x * blockIdx.x + threadIdx.x;
+    const auto elem_id = thread::get_thread_id_flat<IndexType>();
     if (elem_id < num_elements) {
         const auto row = row_idxs[elem_id];
         const auto col = col_idxs[elem_id];
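
The kernels above show the subwarp variant of the same pattern: each group of SubwarpSize threads owns one row, and the loop strides by the total subwarp count. A condensed sketch of that idiom (scale_rows is illustrative only and assumes the thread:: helpers from thread_ids.hpp.inc; lanes touch disjoint entries, so no reduction is needed):

template <int subwarp_size, typename IndexType, typename ValueType>
__global__ void scale_rows(IndexType num_rows,
                           const IndexType *__restrict__ row_ptrs,
                           const ValueType *__restrict__ row_scales,
                           ValueType *__restrict__ values)
{
    // One subwarp per row; rows are visited with a subwarp-stride loop.
    const auto begin_row =
        thread::get_subwarp_id_flat<subwarp_size, IndexType>();
    const auto subwarp_count =
        thread::get_subwarp_num_flat<subwarp_size, IndexType>();
    const auto lane = static_cast<IndexType>(threadIdx.x % subwarp_size);
    for (auto row = begin_row; row < num_rows; row += subwarp_count) {
        // Lanes of the subwarp cover the row's entries in strides.
        for (auto i = row_ptrs[row] + lane; i < row_ptrs[row + 1];
             i += subwarp_size) {
            values[i] *= row_scales[row];
        }
    }
}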

common/matrix/coo_kernels.hpp.inc

Lines changed: 2 additions & 2 deletions

@@ -228,7 +228,7 @@ __global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
     const IndexType *__restrict__ idxs, size_type num_nonzeros,
     IndexType *__restrict__ ptrs, size_type length)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto tidx = thread::get_thread_id_flat();

     if (tidx == 0) {
         ptrs[0] = 0;
@@ -265,7 +265,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense(
     const ValueType *__restrict__ values, size_type stride,
     ValueType *__restrict__ result)
 {
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < nnz) {
         result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx];
     }

common/matrix/csr_kernels.hpp.inc

Lines changed: 24 additions & 30 deletions

@@ -222,7 +222,7 @@ template <typename ValueType>
 __global__ __launch_bounds__(default_block_size) void set_zero(
     const size_type nnz, ValueType *__restrict__ val)
 {
-    const auto ind = size_type(blockDim.x) * blockIdx.x + threadIdx.x;
+    const auto ind = thread::get_thread_id_flat();
     if (ind < nnz) {
         val[ind] = zero<ValueType>();
     }
@@ -438,19 +438,19 @@ __device__ void device_classical_spmv(const size_type num_rows,
                                       ValueType *__restrict__ c,
                                       const size_type c_stride, Closure scale)
 {
-    const auto tid = size_type(blockDim.x) * blockIdx.x + threadIdx.x;
-    const auto subrow = size_type(gridDim.x) * blockDim.x / subwarp_size;
-    const auto subid = tid % subwarp_size;
+    auto subwarp_tile =
+        group::tiled_partition<subwarp_size>(group::this_thread_block());
+    const auto subrow = thread::get_subwarp_num_flat<subwarp_size>();
+    const auto subid = subwarp_tile.thread_rank();
     const auto column_id = blockIdx.y;
-    for (auto row = tid / subwarp_size; row < num_rows; row += subrow) {
+    auto row = thread::get_subwarp_id_flat<subwarp_size>();
+    for (; row < num_rows; row += subrow) {
         const auto ind_end = row_ptrs[row + 1];
         ValueType temp_val = zero<ValueType>();
         for (auto ind = row_ptrs[row] + subid; ind < ind_end;
              ind += subwarp_size) {
             temp_val += val[ind] * b[col_idxs[ind] * b_stride + column_id];
         }
-        auto subwarp_tile =
-            group::tiled_partition<subwarp_size>(group::this_thread_block());
         auto subwarp_result = reduce(
             subwarp_tile, temp_val,
             [](const ValueType &a, const ValueType &b) { return a + b; });
@@ -500,8 +500,7 @@ __global__ __launch_bounds__(default_block_size) void spgeam_nnz(
     const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
     IndexType num_rows, IndexType *nnz)
 {
-    auto row = (threadIdx.x + blockDim.x * static_cast<size_type>(blockIdx.x)) /
-               subwarp_size;
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
     auto subwarp =
         group::tiled_partition<subwarp_size>(group::this_thread_block());
     if (row >= num_rows) {
@@ -533,8 +532,7 @@ __global__ __launch_bounds__(default_block_size) void spgeam(
     const IndexType *b_col_idxs, const ValueType *b_vals, IndexType num_rows,
     const IndexType *c_row_ptrs, IndexType *c_col_idxs, ValueType *c_vals)
 {
-    auto row = (threadIdx.x + blockDim.x * static_cast<size_type>(blockIdx.x)) /
-               subwarp_size;
+    auto row = thread::get_subwarp_id_flat<subwarp_size, IndexType>();
     auto subwarp =
         group::tiled_partition<subwarp_size>(group::this_thread_block());
     if (row >= num_rows) {
@@ -591,7 +589,7 @@ __global__ __launch_bounds__(default_block_size) void convert_row_ptrs_to_idxs(
     size_type num_rows, const IndexType *__restrict__ ptrs,
     IndexType *__restrict__ idxs)
 {
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_rows) {
         for (auto i = ptrs[tidx]; i < ptrs[tidx + 1]; i++) {
             idxs[i] = tidx;
@@ -620,7 +618,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_dense(
     const ValueType *__restrict__ values, size_type stride,
     ValueType *__restrict__ result)
 {
-    const auto tidx = threadIdx.x + blockDim.x * blockIdx.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_rows) {
         for (auto i = row_ptrs[tidx]; i < row_ptrs[tidx + 1]; i++) {
             result[stride * tidx + col_idxs[i]] = values[i];
@@ -634,7 +632,7 @@ __global__ __launch_bounds__(default_block_size) void calculate_nnz_per_row(
     size_type num_rows, const IndexType *__restrict__ row_ptrs,
     size_type *__restrict__ nnz_per_row)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_rows) {
         nnz_per_row[tidx] = row_ptrs[tidx + 1] - row_ptrs[tidx];
     }
@@ -685,7 +683,7 @@ __global__ __launch_bounds__(default_block_size) void fill_in_sellp(
     IndexType *__restrict__ result_col_idxs,
     ValueType *__restrict__ result_values)
 {
-    const auto global_row = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto global_row = thread::get_thread_id_flat();
     const auto row = global_row % slice_size;
     const auto sliceid = global_row / slice_size;

@@ -714,7 +712,7 @@ __global__ __launch_bounds__(default_block_size) void initialize_zero_ell(
     size_type max_nnz_per_row, size_type stride, ValueType *__restrict__ values,
     IndexType *__restrict__ col_idxs)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto tidx = thread::get_thread_id_flat();

     if (tidx < stride * max_nnz_per_row) {
         values[tidx] = zero<ValueType>();
@@ -732,10 +730,9 @@ __global__ __launch_bounds__(default_block_size) void fill_in_ell(
     ValueType *__restrict__ result_values,
     IndexType *__restrict__ result_col_idxs)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
     constexpr auto warp_size = config::warp_size;
-    const auto row = tidx / warp_size;
-    const auto local_tidx = tidx % warp_size;
+    const auto row = thread::get_subwarp_id_flat<warp_size>();
+    const auto local_tidx = threadIdx.x % warp_size;

     if (row < num_rows) {
         for (size_type i = local_tidx;
@@ -754,10 +751,11 @@ __global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice(
     size_type num_rows, size_type slice_size, size_type stride_factor,
     const size_type *__restrict__ nnz_per_row, size_type *__restrict__ result)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
     constexpr auto warp_size = config::warp_size;
-    const auto warpid = tidx / warp_size;
-    const auto tid_in_warp = tidx % warp_size;
+    auto warp_tile =
+        group::tiled_partition<warp_size>(group::this_thread_block());
+    const auto warpid = thread::get_subwarp_id_flat<warp_size>();
+    const auto tid_in_warp = warp_tile.thread_rank();
     const auto slice_num = ceildiv(num_rows, slice_size);

     size_type thread_result = 0;
@@ -767,9 +765,6 @@ __global__ __launch_bounds__(default_block_size) void reduce_max_nnz_per_slice(
                 max(thread_result, nnz_per_row[warpid * slice_size + i]);
         }
     }
-
-    auto warp_tile =
-        group::tiled_partition<warp_size>(group::this_thread_block());
     auto warp_result = reduce(
         warp_tile, thread_result,
         [](const size_type &a, const size_type &b) { return max(a, b); });
@@ -818,7 +813,7 @@ __global__
     IndexType *__restrict__ csr_row_idxs,
     size_type *__restrict__ coo_row_nnz)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
+    const auto tidx = thread::get_thread_id_flat();
     if (tidx < num_rows) {
         const size_type csr_nnz = csr_row_idxs[tidx + 1] - csr_row_idxs[tidx];
         coo_row_nnz[tidx] =
@@ -840,10 +835,9 @@ __global__ __launch_bounds__(default_block_size) void fill_in_hybrid(
     IndexType *__restrict__ result_coo_col,
     IndexType *__restrict__ result_coo_row)
 {
-    const auto tidx = threadIdx.x + blockIdx.x * blockDim.x;
     constexpr auto warp_size = config::warp_size;
-    const auto row = tidx / warp_size;
-    const auto local_tidx = tidx % warp_size;
+    const auto row = thread::get_subwarp_id_flat<warp_size>();
+    const auto local_tidx = threadIdx.x % warp_size;

     if (row < num_rows) {
         for (size_type i = local_tidx;
@@ -876,7 +870,7 @@ template <typename ValueType>
 __global__ __launch_bounds__(default_block_size) void conjugate_kernel(
     size_type num_nonzeros, ValueType *__restrict__ val)
 {
-    const auto tidx = size_type(blockIdx.x) * default_block_size + threadIdx.x;
+    const auto tidx = thread::get_thread_id_flat();

     if (tidx < num_nonzeros) {
         val[tidx] = conj(val[tidx]);
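
Besides swapping in the helpers, device_classical_spmv and reduce_max_nnz_per_slice above also hoist the group::tiled_partition call out of their row loops, so the cooperative group is constructed once and its thread_rank() replaces the tidx % subwarp_size arithmetic. A condensed sketch of the hoisted shape, written against CUDA's own cooperative_groups API rather than the library's group:: wrapper (dense_row_sums and all its names are illustrative):

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

template <int tile_size>
__global__ void dense_row_sums(int num_rows, int num_cols,
                               const float *__restrict__ mat,
                               float *__restrict__ sums)
{
    // Hoisted out of the row loop: the tile is built once per thread.
    auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
    const int tiles_per_block = blockDim.x / tile_size;
    const long long num_tiles =
        static_cast<long long>(tiles_per_block) * gridDim.x;
    long long row = static_cast<long long>(tiles_per_block) * blockIdx.x +
                    threadIdx.x / tile_size;
    for (; row < num_rows; row += num_tiles) {
        float partial = 0.0f;
        // Lanes cover the row's columns in strides of tile_size.
        for (int col = tile.thread_rank(); col < num_cols;
             col += tile_size) {
            partial += mat[row * num_cols + col];
        }
        // Tile-wide sum, reusing the hoisted tile on every iteration.
        for (int offset = tile_size / 2; offset > 0; offset /= 2) {
            partial += tile.shfl_down(partial, offset);
        }
        if (tile.thread_rank() == 0) {
            sums[row] = partial;
        }
    }
}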
