@@ -313,7 +313,7 @@ class warp_sort_filtered : public warp_sort<Capacity, Ascending, T, IdxT> {
313
313
314
314
__device__ __forceinline__ void merge_buf_ () {
315
315
topk::bitonic<kMaxBufLen >(!Ascending, kWarpWidth ).sort (val_buf_, idx_buf_);
316
- this ->merge_in <kMaxBufLen >(val_buf_, idx_buf_);
316
+ this ->template merge_in <kMaxBufLen >(val_buf_, idx_buf_);
317
317
buf_len_ = 0 ;
318
318
set_k_th_ (); // contains warp sync
319
319
#pragma unroll
@@ -385,7 +385,7 @@ class warp_sort_immediate : public warp_sort<Capacity, Ascending, T, IdxT> {
385
385
if (buf_len_ == kMaxArrLen ) {
386
386
topk::bitonic<kMaxArrLen >(!Ascending, kWarpWidth )
387
387
.sort (val_buf_, idx_buf_);
388
- this ->merge_in <kMaxArrLen >(val_buf_, idx_buf_);
388
+ this ->template merge_in <kMaxArrLen >(val_buf_, idx_buf_);
389
389
#pragma unroll
390
390
for (int i = 0 ; i < kMaxArrLen ; i++) {
391
391
val_buf_[i] = kDummy ;
@@ -398,7 +398,7 @@ class warp_sort_immediate : public warp_sort<Capacity, Ascending, T, IdxT> {
398
398
if (buf_len_ != 0 ) {
399
399
topk::bitonic<kMaxArrLen >(!Ascending, kWarpWidth )
400
400
.sort (val_buf_, idx_buf_);
401
- this ->merge_in <kMaxArrLen >(val_buf_, idx_buf_);
401
+ this ->template merge_in <kMaxArrLen >(val_buf_, idx_buf_);
402
402
}
403
403
}
404
404
@@ -421,7 +421,7 @@ constexpr inline __host__ __device__ IntType ceildiv(IntType a, IntType b) {
421
421
return (a + b - 1 ) / b;
422
422
}
423
423
template <typename IntType>
424
- constexpr inline __device__ IntType roundUp256 (IntType num) {
424
+ constexpr inline __host__ __device__ IntType roundUp256 (IntType num) {
425
425
// return (num + 255) / 256 * 256;
426
426
constexpr int MASK = 255 ;
427
427
return (num + MASK) & (~MASK);
0 commit comments