Skip to content

Commit 02f9651

Browse files
authored
Fix missing cub checks and streams. (#11642)
1 parent 816b46b commit 02f9651

File tree

7 files changed

+61
-60
lines changed

7 files changed

+61
-60
lines changed

src/common/ranking_utils.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,13 @@ void CalcQueriesDCG(Context const* ctx, linalg::VectorView<float const> d_labels
6161

6262
CHECK(out_dcg.Contiguous());
6363
std::size_t bytes;
64-
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
65-
d_group_ptr.size() - 1, d_group_ptr.data(),
66-
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
64+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, bytes, value_it, out_dcg.Values().data(),
65+
d_group_ptr.size() - 1, d_group_ptr.data(),
66+
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream()));
6767
dh::TemporaryArray<char> temp(bytes);
68-
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, value_it, out_dcg.Values().data(),
69-
d_group_ptr.size() - 1, d_group_ptr.data(),
70-
d_group_ptr.data() + 1, ctx->CUDACtx()->Stream());
68+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(
69+
temp.data().get(), bytes, value_it, out_dcg.Values().data(), d_group_ptr.size() - 1,
70+
d_group_ptr.data(), d_group_ptr.data() + 1, ctx->CUDACtx()->Stream()));
7171
}
7272

7373
void CalcQueriesInvIDCG(Context const* ctx, linalg::VectorView<float const> d_labels,

src/common/stats.cu

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,10 @@ void Mean(Context const* ctx, linalg::VectorView<float const> v, linalg::VectorV
5555
std::size_t bytes;
5656
CHECK_EQ(out.Size(), 1);
5757
auto s = ctx->CUDACtx()->Stream();
58-
cub::DeviceReduce::Sum(nullptr, bytes, it, out.Values().data(), v.Size(), s);
58+
dh::safe_cuda(cub::DeviceReduce::Sum(nullptr, bytes, it, out.Values().data(), v.Size(), s));
5959
dh::TemporaryArray<char> temp{bytes};
60-
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s);
60+
dh::safe_cuda(
61+
cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s));
6162
}
6263

6364
void SampleMean(Context const* ctx, bool is_column_split, linalg::MatrixView<float const> d_v,

src/metric/rank_metric.cu

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,12 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info,
6262
thrust::fill_n(cuctx->CTP(), pre.data(), pre.size(), 0.0);
6363

6464
std::size_t bytes;
65-
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, it, pre.data(), p_cache->Groups(), d_gptr.data(),
66-
d_gptr.data() + 1, cuctx->Stream());
65+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, bytes, it, pre.data(), p_cache->Groups(),
66+
d_gptr.data(), d_gptr.data() + 1, cuctx->Stream()));
6767
dh::TemporaryArray<char> temp(bytes);
68-
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, it, pre.data(), p_cache->Groups(),
69-
d_gptr.data(), d_gptr.data() + 1, cuctx->Stream());
68+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, it, pre.data(),
69+
p_cache->Groups(), d_gptr.data(), d_gptr.data() + 1,
70+
cuctx->Stream()));
7071

7172
auto w_it =
7273
dh::MakeTransformIterator<double>(thrust::make_counting_iterator(0ul),
@@ -166,11 +167,13 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info,
166167
});
167168

168169
std::size_t bytes;
169-
cub::DeviceSegmentedReduce::Sum(nullptr, bytes, val_it, map.data(), p_cache->Groups(),
170-
d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
170+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, bytes, val_it, map.data(),
171+
p_cache->Groups(), d_group_ptr.data(),
172+
d_group_ptr.data() + 1, cuctx->Stream()));
171173
dh::TemporaryArray<char> temp(bytes);
172-
cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, val_it, map.data(), p_cache->Groups(),
173-
d_group_ptr.data(), d_group_ptr.data() + 1, cuctx->Stream());
174+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(temp.data().get(), bytes, val_it, map.data(),
175+
p_cache->Groups(), d_group_ptr.data(),
176+
d_group_ptr.data() + 1, cuctx->Stream()));
174177
}
175178

176179
PackedReduceResult result{0.0, 0.0};

src/objective/lambdarank_obj.cu

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,11 @@ void MinBias(Context const* ctx, std::shared_ptr<ltr::RankingCache> p_cache,
6363
return std::abs(t_plus(i));
6464
});
6565
std::size_t bytes;
66-
cub::DeviceSegmentedReduce::Min(nullptr, bytes, val_it, d_min.data(), 2, key_it, key_it + 1,
67-
cuctx->Stream());
66+
dh::safe_cuda(cub::DeviceSegmentedReduce::Min(nullptr, bytes, val_it, d_min.data(), 2, key_it,
67+
key_it + 1, cuctx->Stream()));
6868
dh::TemporaryArray<char> temp(bytes);
69-
cub::DeviceSegmentedReduce::Min(temp.data().get(), bytes, val_it, d_min.data(), 2, key_it,
70-
key_it + 1, cuctx->Stream());
69+
dh::safe_cuda(cub::DeviceSegmentedReduce::Min(temp.data().get(), bytes, val_it, d_min.data(), 2,
70+
key_it, key_it + 1, cuctx->Stream()));
7171
}
7272

7373
/**
@@ -225,13 +225,13 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptr<ltr::Ran
225225
CHECK_EQ(n_groups * sizeof(GradCostNorm), d_max_lambdas.size_bytes());
226226
// Reduce by group.
227227
std::size_t bytes;
228-
cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, d_max_lambdas.data(), n_groups,
229-
d_threads_group_ptr.data(), d_threads_group_ptr.data() + 1,
230-
reduction_op, init, ctx->CUDACtx()->Stream());
228+
dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(
229+
nullptr, bytes, val_it, d_max_lambdas.data(), n_groups, d_threads_group_ptr.data(),
230+
d_threads_group_ptr.data() + 1, reduction_op, init, ctx->CUDACtx()->Stream()));
231231
dh::TemporaryArray<char> temp(bytes);
232-
cub::DeviceSegmentedReduce::Reduce(
232+
dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(
233233
temp.data().get(), bytes, val_it, d_max_lambdas.data(), n_groups, d_threads_group_ptr.data(),
234-
d_threads_group_ptr.data() + 1, reduction_op, init, ctx->CUDACtx()->Stream());
234+
d_threads_group_ptr.data() + 1, reduction_op, init, ctx->CUDACtx()->Stream()));
235235

236236
dh::TemporaryArray<double> min_bias(2);
237237
auto d_min_bias = dh::ToSpan(min_bias);
@@ -590,11 +590,13 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView<double
590590

591591
auto init = thrust::make_tuple(0.0, 0.0);
592592
std::size_t bytes;
593-
cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, out_it, k, key_it, key_it + 1,
594-
ReduceOp{}, init, ctx->CUDACtx()->Stream());
593+
dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(nullptr, bytes, val_it, out_it, k, key_it,
594+
key_it + 1, ReduceOp{}, init,
595+
ctx->CUDACtx()->Stream()));
595596
dh::TemporaryArray<char> temp(bytes);
596-
cub::DeviceSegmentedReduce::Reduce(temp.data().get(), bytes, val_it, out_it, k, key_it,
597-
key_it + 1, ReduceOp{}, init, ctx->CUDACtx()->Stream());
597+
dh::safe_cuda(cub::DeviceSegmentedReduce::Reduce(temp.data().get(), bytes, val_it, out_it, k,
598+
key_it, key_it + 1, ReduceOp{}, init,
599+
ctx->CUDACtx()->Stream()));
598600

599601
thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), li.Size(),
600602
[=] XGBOOST_DEVICE(std::size_t i) mutable {

src/tree/gpu_hist/evaluate_splits.cu

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -352,9 +352,8 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
352352
}
353353

354354
void GPUHistEvaluator::LaunchEvaluateSplits(
355-
bst_feature_t max_active_features,
356-
common::Span<const EvaluateSplitInputs> d_inputs,
357-
EvaluateSplitSharedInputs shared_inputs,
355+
Context const *ctx, bst_feature_t max_active_features,
356+
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
358357
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
359358
common::Span<DeviceSplitCandidate> out_splits) {
360359
if (need_sort_histogram_) {
@@ -367,28 +366,25 @@ void GPUHistEvaluator::LaunchEvaluateSplits(
367366

368367
// One block for each feature
369368
uint32_t constexpr kBlockThreads = 32;
370-
dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads,
371-
0}(
372-
EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs,
373-
shared_inputs,
374-
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
375-
evaluator, dh::ToSpan(feature_best_splits));
369+
dh::LaunchKernel{static_cast<uint32_t>(combined_num_features), kBlockThreads, 0, // NOLINT
370+
ctx->CUDACtx()->Stream()}(
371+
EvaluateSplitsKernel<kBlockThreads>, max_active_features, d_inputs, shared_inputs,
372+
this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()), evaluator,
373+
dh::ToSpan(feature_best_splits));
376374

377375
// Reduce to get best candidate for left and right child over all features
378-
auto reduce_offset =
379-
dh::MakeTransformIterator<size_t>(thrust::make_counting_iterator(0llu),
380-
[=] __device__(size_t idx) -> size_t {
381-
return idx * max_active_features;
382-
});
376+
auto reduce_offset = dh::MakeTransformIterator<size_t>(
377+
thrust::make_counting_iterator(0llu),
378+
[=] __device__(size_t idx) -> size_t { return idx * max_active_features; });
383379
size_t temp_storage_bytes = 0;
384380
auto num_segments = out_splits.size();
385-
cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(),
386-
out_splits.data(), num_segments, reduce_offset,
387-
reduce_offset + 1);
381+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(
382+
nullptr, temp_storage_bytes, feature_best_splits.data(), out_splits.data(), num_segments,
383+
reduce_offset, reduce_offset + 1, ctx->CUDACtx()->Stream()));
388384
dh::TemporaryArray<int8_t> temp(temp_storage_bytes);
389-
cub::DeviceSegmentedReduce::Sum(temp.data().get(), temp_storage_bytes, feature_best_splits.data(),
390-
out_splits.data(), num_segments, reduce_offset,
391-
reduce_offset + 1);
385+
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(
386+
temp.data().get(), temp_storage_bytes, feature_best_splits.data(), out_splits.data(),
387+
num_segments, reduce_offset, reduce_offset + 1, ctx->CUDACtx()->Stream()));
392388
}
393389

394390
void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
@@ -414,8 +410,8 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_
414410

415411
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
416412
auto out_splits = dh::ToSpan(splits_out_storage);
417-
this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs,
418-
evaluator, out_splits);
413+
this->LaunchEvaluateSplits(ctx, max_active_features, d_inputs, shared_inputs, evaluator,
414+
out_splits);
419415

420416
if (is_column_split_) {
421417
// With column-wise data split, we gather the split candidates from all the workers and find the
@@ -427,7 +423,7 @@ void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_
427423
all_candidates.subspan(collective::GetRank() * out_splits.size(), out_splits.size());
428424
dh::safe_cuda(cudaMemcpyAsync(current_rank.data(), out_splits.data(),
429425
out_splits.size() * sizeof(DeviceSplitCandidate),
430-
cudaMemcpyDeviceToDevice));
426+
cudaMemcpyDeviceToDevice, ctx->CUDACtx()->Stream()));
431427
auto rc = collective::Allgather(
432428
ctx, linalg::MakeVec(all_candidates.data(), all_candidates.size(), ctx->Device()));
433429
collective::SafeColl(rc);

src/tree/gpu_hist/evaluate_splits.cuh

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -183,12 +183,11 @@ class GPUHistEvaluator {
183183
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);
184184

185185
// impl of evaluate splits, contains CUDA kernels so it's public
186-
void LaunchEvaluateSplits(
187-
bst_feature_t max_active_features,
188-
common::Span<const EvaluateSplitInputs> d_inputs,
189-
EvaluateSplitSharedInputs shared_inputs,
190-
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
191-
common::Span<DeviceSplitCandidate> out_splits);
186+
void LaunchEvaluateSplits(Context const *ctx, bst_feature_t max_active_features,
187+
common::Span<const EvaluateSplitInputs> d_inputs,
188+
EvaluateSplitSharedInputs shared_inputs,
189+
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
190+
common::Span<DeviceSplitCandidate> out_splits);
192191
/**
193192
* \brief Evaluate splits for left and right nodes.
194193
*/

tests/cpp/tree/gpu_hist/test_evaluate_splits.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -496,8 +496,8 @@ TEST(GpuHist, EvaluateSplits) {
496496
FstCU()};
497497
dh::device_vector<EvaluateSplitInputs> inputs =
498498
std::vector<EvaluateSplitInputs>{input_left, input_right};
499-
evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
500-
evaluator.GetEvaluator(), dh::ToSpan(out_splits));
499+
evaluator.LaunchEvaluateSplits(&ctx, input_left.feature_set.size(), dh::ToSpan(inputs),
500+
shared_inputs, evaluator.GetEvaluator(), dh::ToSpan(out_splits));
501501

502502
DeviceSplitCandidate result_left = out_splits[0];
503503
EXPECT_EQ(result_left.findex, 1);

0 commit comments

Comments
 (0)