Skip to content

Commit a22acdc

Browse files
authored
Small cleanup for GPU split evaluation. (#11778)
1 parent 194c1b9 commit a22acdc

File tree

11 files changed

+70
-70
lines changed

11 files changed

+70
-70
lines changed

cmake/Utils.cmake

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,9 @@ function(xgboost_set_cuda_flags target)
118118

119119
if(USE_NVTX)
120120
target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
121-
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
121+
if(NOT USE_DEVICE_DEBUG)
122+
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
123+
endif()
122124
endif()
123125

124126
# Use CCCL we find before CUDA Toolkit to make sure we get newer headers as intended

src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ set_source_files_properties(
1818
PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
1919

2020
if(USE_CUDA)
21-
file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh)
21+
file(GLOB_RECURSE CUDA_SOURCES *.cu)
2222
target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
2323
endif()
2424

src/tree/gpu_hist/evaluate_splits.cu

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
3030
quantiser.ToFloatingPoint(parent_sum - scan));
3131

3232
missing_left_out = missing_left_gain > missing_right_gain;
33-
return missing_left_out?missing_left_gain:missing_right_gain;
33+
return missing_left_out ? missing_left_gain : missing_right_gain;
3434
}
3535

3636
// This kernel uses block_size == warp_size. This is an unusually small block size for a cuda kernel
@@ -92,8 +92,7 @@ class EvaluateSplitAgent {
9292
}
9393
__device__ GradientPairInt64 ReduceFeature() {
9494
GradientPairInt64 local_sum;
95-
for (int idx = gidx_begin + threadIdx.x; idx < gidx_end;
96-
idx += kBlockSize) {
95+
for (int idx = gidx_begin + threadIdx.x; idx < gidx_end; idx += kBlockSize) {
9796
local_sum += LoadGpair(node_histogram + idx);
9897
}
9998
local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum); // NOLINT
@@ -103,16 +102,16 @@ class EvaluateSplitAgent {
103102
}
104103

105104
// Load using efficient 128 vector load instruction
106-
__device__ __forceinline__ GradientPairInt64 LoadGpair(const GradientPairInt64 *ptr) {
105+
__device__ __forceinline__ static GradientPairInt64 LoadGpair(const GradientPairInt64 *ptr) {
107106
float4 tmp = *reinterpret_cast<const float4 *>(ptr);
108107
auto gpair = *reinterpret_cast<const GradientPairInt64 *>(&tmp);
109108
static_assert(sizeof(decltype(gpair)) == sizeof(float4),
110109
"Vector type size does not match gradient pair size.");
111110
return gpair;
112111
}
113112

114-
__device__ __forceinline__ void Numerical(DeviceSplitCandidate * best_split) {
115-
for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
113+
__device__ __forceinline__ void Numerical(DeviceSplitCandidate *best_split) {
114+
for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
116115
bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
117116
GradientPairInt64 bin = thread_active ? LoadGpair(node_histogram + scan_begin + threadIdx.x)
118117
: GradientPairInt64();
@@ -255,20 +254,18 @@ class EvaluateSplitAgent {
255254
}
256255
};
257256

258-
template <int kBlockSize>
259-
__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
260-
bst_feature_t max_active_features,
261-
common::Span<const EvaluateSplitInputs> d_inputs,
262-
const EvaluateSplitSharedInputs shared_inputs,
263-
common::Span<bst_feature_t> sorted_idx,
257+
template <int kBlockThreads>
258+
__global__ __launch_bounds__(kBlockThreads) void EvaluateSplitsKernel(
259+
bst_feature_t max_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
260+
const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
264261
const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
265262
common::Span<DeviceSplitCandidate> out_candidates) {
266263
// Aligned && shared storage for best_split
267264
__shared__ cub::Uninitialized<DeviceSplitCandidate> uninitialized_split;
268265
DeviceSplitCandidate &best_split = uninitialized_split.Alias();
269266

270267
if (threadIdx.x == 0) {
271-
best_split = DeviceSplitCandidate();
268+
best_split = DeviceSplitCandidate{};
272269
}
273270

274271
__syncthreads();
@@ -284,7 +281,7 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
284281
}
285282
int fidx = inputs.feature_set[feature_offset];
286283

287-
using AgentT = EvaluateSplitAgent<kBlockSize>;
284+
using AgentT = EvaluateSplitAgent<kBlockThreads>;
288285
__shared__ typename AgentT::TempStorage temp_storage;
289286
AgentT agent(&temp_storage, fidx, inputs, shared_inputs, evaluator);
290287

src/tree/gpu_hist/evaluate_splits.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ namespace tree {
2121

2222
// Inputs specific to each node
2323
struct EvaluateSplitInputs {
24-
int nidx;
25-
int depth;
24+
bst_node_t nidx;
25+
bst_node_t depth;
2626
GradientPairInt64 parent_sum;
2727
common::Span<const bst_feature_t> feature_set;
2828
common::Span<const GradientPairInt64> gradient_histogram;
@@ -168,10 +168,10 @@ class GPUHistEvaluator {
168168
void ApplyTreeSplit(GPUExpandEntry const &candidate, RegTree *p_tree) {
169169
auto &tree = *p_tree;
170170
// Set up child constraints
171-
auto left_child = tree[candidate.nid].LeftChild();
172-
auto right_child = tree[candidate.nid].RightChild();
173-
tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
174-
tree[candidate.nid].SplitIndex(), candidate.left_weight,
171+
auto left_child = tree[candidate.nidx].LeftChild();
172+
auto right_child = tree[candidate.nidx].RightChild();
173+
tree_evaluator_.AddSplit(candidate.nidx, left_child, right_child,
174+
tree[candidate.nidx].SplitIndex(), candidate.left_weight,
175175
candidate.right_weight);
176176
}
177177

src/tree/gpu_hist/expand_entry.cuh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
namespace xgboost::tree {
1515
struct GPUExpandEntry {
16-
bst_node_t nid;
16+
bst_node_t nidx;
1717
bst_node_t depth;
1818
DeviceSplitCandidate split;
1919

@@ -24,7 +24,7 @@ struct GPUExpandEntry {
2424
GPUExpandEntry() = default;
2525
XGBOOST_DEVICE GPUExpandEntry(bst_node_t nid, bst_node_t depth, DeviceSplitCandidate split,
2626
float base, float left, float right)
27-
: nid(nid),
27+
: nidx(nid),
2828
depth(depth),
2929
split(std::move(split)),
3030
base_weight{base},
@@ -49,13 +49,13 @@ struct GPUExpandEntry {
4949

5050
[[nodiscard]] float GetLossChange() const { return split.loss_chg; }
5151

52-
[[nodiscard]] bst_node_t GetNodeId() const { return nid; }
52+
[[nodiscard]] bst_node_t GetNodeId() const { return nidx; }
5353

5454
[[nodiscard]] bst_node_t GetDepth() const { return depth; }
5555

5656
friend std::ostream& operator<<(std::ostream& os, const GPUExpandEntry& e) {
5757
os << "GPUExpandEntry: \n";
58-
os << "nidx: " << e.nid << "\n";
58+
os << "nidx: " << e.nidx << "\n";
5959
os << "depth: " << e.depth << "\n";
6060
os << "loss: " << e.split.loss_chg << "\n";
6161
os << "left_sum: " << e.split.left_sum << "\n";
@@ -66,7 +66,7 @@ struct GPUExpandEntry {
6666
void Save(Json* p_out) const {
6767
auto& out = *p_out;
6868

69-
out["nid"] = Integer{this->nid};
69+
out["nid"] = Integer{this->nidx};
7070
out["depth"] = Integer{this->depth};
7171
// GPU specific
7272
out["base_weight"] = this->base_weight;
@@ -99,7 +99,7 @@ struct GPUExpandEntry {
9999
}
100100

101101
void Load(Json const& in) {
102-
this->nid = get<Integer const>(in["nid"]);
102+
this->nidx = get<Integer const>(in["nid"]);
103103
this->depth = get<Integer const>(in["depth"]);
104104
// GPU specific
105105
this->base_weight = get<Number const>(in["base_weight"]);

src/tree/gpu_hist/histogram.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ class DeviceHistogramBuilder {
196196
for (std::size_t i = 0; i < subtraction_nidx.size(); i++) {
197197
auto build_hist_nidx = build_nidx.at(i);
198198
auto subtraction_trick_nidx = subtraction_nidx.at(i);
199-
auto parent_nidx = candidates.at(i).nid;
199+
auto parent_nidx = candidates.at(i).nidx;
200200

201201
if (!this->SubtractionTrick(ctx, parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
202202
need_build.push_back(subtraction_trick_nidx);

src/tree/param.h

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2014-2023 by XGBoost Contributors
2+
* Copyright 2014-2025, XGBoost Contributors
33
* \file param.h
44
* \brief training parameters, statistics used to support tree construction.
55
* \author Tianqi Chen
@@ -242,8 +242,8 @@ XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad,
242242

243243
// calculate weight given the statistics
244244
template <typename TrainingParams, typename T>
245-
XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
246-
T sum_hess) {
245+
XGBOOST_DEVICE std::enable_if_t<std::is_floating_point_v<T>, T> CalcWeight(TrainingParams const &p,
246+
T sum_grad, T sum_hess) {
247247
if (sum_hess < p.min_child_weight || sum_hess <= 0.0) {
248248
return 0.0;
249249
}
@@ -291,17 +291,17 @@ XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad)
291291
}
292292

293293
/**
294-
* \brief multi-target weight, calculated with learning rate.
294+
* @brief multi-target weight, calculated with learning rate.
295295
*/
296296
inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
297297
float eta, linalg::VectorView<float> out_w) {
298-
for (bst_target_t i = 0; i < out_w.Size(); ++i) {
299-
out_w(i) = CalcWeight(p, grad_sum(i).GetGrad(), grad_sum(i).GetHess()) * eta;
298+
for (bst_target_t t = 0, n_targets = out_w.Size(); t < n_targets; ++t) {
299+
out_w(t) = CalcWeight(p, grad_sum(t).GetGrad(), grad_sum(t).GetHess()) * eta;
300300
}
301301
}
302302

303303
/**
304-
* \brief multi-target weight
304+
* @brief multi-target weight
305305
*/
306306
inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
307307
linalg::VectorView<float> out_w) {
@@ -312,8 +312,8 @@ inline double CalcGainGivenWeight(TrainParam const &p,
312312
linalg::VectorView<GradientPairPrecise const> sum_grad,
313313
linalg::VectorView<float const> weight) {
314314
double gain{0};
315-
for (bst_target_t i = 0; i < weight.Size(); ++i) {
316-
gain += -weight(i) * ThresholdL1(sum_grad(i).GetGrad(), p.reg_alpha);
315+
for (bst_target_t t = 0, n_targets = weight.Size(); t < n_targets; ++t) {
316+
gain += -weight(t) * ThresholdL1(sum_grad(t).GetGrad(), p.reg_alpha);
317317
}
318318
return gain;
319319
}

src/tree/updater_gpu_hist.cu

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,11 @@ void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
8585
auto right_sum = quantizer->ToFloatingPoint(e.split.right_sum);
8686
bool fewer_right = right_sum.GetHess() < left_sum.GetHess();
8787
if (fewer_right) {
88-
p_build_nidx[nidx_in_set] = tree.RightChild(e.nid);
89-
p_sub_nidx[nidx_in_set] = tree.LeftChild(e.nid);
88+
p_build_nidx[nidx_in_set] = tree.RightChild(e.nidx);
89+
p_sub_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
9090
} else {
91-
p_build_nidx[nidx_in_set] = tree.LeftChild(e.nid);
92-
p_sub_nidx[nidx_in_set] = tree.RightChild(e.nid);
91+
p_build_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
92+
p_sub_nidx[nidx_in_set] = tree.RightChild(e.nidx);
9393
}
9494
++nidx_in_set;
9595
}
@@ -132,13 +132,13 @@ struct GPUHistMakerDevice {
132132
auto tree = p_tree->HostScView();
133133
for (std::size_t i = 0, n = candidates.size(); i < n; i++) {
134134
auto const& e = candidates[i];
135-
RegTree::Node split_node = tree.nodes[e.nid];
136-
auto split_type = tree.SplitType(e.nid);
137-
nodes.nidx.at(i) = e.nid;
138-
nodes.left_nidx[i] = tree.LeftChild(e.nid);
139-
nodes.right_nidx[i] = tree.RightChild(e.nid);
135+
RegTree::Node split_node = tree.nodes[e.nidx];
136+
auto split_type = tree.SplitType(e.nidx);
137+
nodes.nidx.at(i) = e.nidx;
138+
nodes.left_nidx[i] = tree.LeftChild(e.nidx);
139+
nodes.right_nidx[i] = tree.RightChild(e.nidx);
140140
nodes.split_data[i] =
141-
NodeSplitData{split_node, split_type, this->evaluator_.GetDeviceNodeCats(e.nid)};
141+
NodeSplitData{split_node, split_type, this->evaluator_.GetDeviceNodeCats(e.nidx)};
142142

143143
CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
144144
}
@@ -299,8 +299,8 @@ struct GPUHistMakerDevice {
299299
auto sc_tree = tree.HostScView();
300300
for (std::size_t i = 0; i < candidates.size(); i++) {
301301
auto candidate = candidates.at(i);
302-
bst_node_t left_nidx = sc_tree.LeftChild(candidate.nid);
303-
bst_node_t right_nidx = sc_tree.RightChild(candidate.nid);
302+
bst_node_t left_nidx = sc_tree.LeftChild(candidate.nidx);
303+
bst_node_t right_nidx = sc_tree.RightChild(candidate.nidx);
304304
nidx[i * 2] = left_nidx;
305305
nidx[i * 2 + 1] = right_nidx;
306306
auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx));
@@ -482,7 +482,7 @@ struct GPUHistMakerDevice {
482482
bst_idx_t n_samples = 0;
483483
for (auto const& c : candidates) {
484484
for (auto const& part : this->partitioners_) {
485-
n_samples += part->GetRows(c.nid).size();
485+
n_samples += part->GetRows(c.nidx).size();
486486
}
487487
}
488488
// avoid copy if the kernel is small.
@@ -688,7 +688,7 @@ struct GPUHistMakerDevice {
688688

689689
// Sanity check - have we created a leaf with no training instances?
690690
if (!collective::IsDistributed() && partitioners_.size() == 1) {
691-
CHECK(partitioners_.front()->GetRows(candidate.nid).size() > 0)
691+
CHECK(partitioners_.front()->GetRows(candidate.nidx).size() > 0)
692692
<< "No training instances in this leaf!";
693693
}
694694

@@ -708,27 +708,27 @@ struct GPUHistMakerDevice {
708708
CHECK(common::CheckNAN(candidate.split.fvalue));
709709
std::vector<common::CatBitField::value_type> split_cats;
710710

711-
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
711+
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nidx);
712712
auto n_bins_feature = cuts_->FeatureBins(candidate.split.findex);
713713
split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
714714
CHECK_LE(split_cats.size(), h_cats.size());
715715
std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());
716716

717717
tree.ExpandCategorical(
718-
candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir,
718+
candidate.nidx, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir,
719719
base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_hess,
720720
left_hess, right_hess);
721721
} else {
722722
CHECK(!common::CheckNAN(candidate.split.fvalue));
723-
tree.ExpandNode(candidate.nid, candidate.split.findex, candidate.split.fvalue,
723+
tree.ExpandNode(candidate.nidx, candidate.split.findex, candidate.split.fvalue,
724724
candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight,
725725
candidate.split.loss_chg, parent_hess,
726726
left_hess, right_hess);
727727
}
728728
evaluator_.ApplyTreeSplit(candidate, p_tree);
729729

730-
const auto& parent = tree[candidate.nid];
731-
interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
730+
const auto& parent = tree[candidate.nidx];
731+
interaction_constraints.Split(candidate.nidx, parent.SplitIndex(), parent.LeftChild(),
732732
parent.RightChild());
733733
}
734734

@@ -742,7 +742,7 @@ struct GPUHistMakerDevice {
742742
[=] __device__(auto const& gpair) { return quantiser.ToFixedPoint(gpair); });
743743
GradientPairInt64 root_sum_quantised =
744744
dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + this->gpair.size(),
745-
GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
745+
GradientPairInt64{}, cuda::std::plus<GradientPairInt64>{});
746746
using ReduceT = typename decltype(root_sum_quantised)::ValueT;
747747
auto rc = collective::GlobalSum(
748748
ctx_, p_fmat->Info(), linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
@@ -838,8 +838,6 @@ std::pair<std::shared_ptr<common::HistogramCuts const>, bool> InitBatchCuts(
838838
}
839839

840840
class GPUHistMaker : public TreeUpdater {
841-
using GradientSumT = GradientPairPrecise;
842-
843841
public:
844842
explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) : TreeUpdater(ctx), task_{task} {};
845843
void Configure(const Args& args) override {

tests/cpp/tree/gpu_hist/test_driver.cu

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
/**
2+
* Copyright 2020-2025, XGBoost contributors
3+
*/
14
#include <gtest/gtest.h>
25
#include "../../../../src/tree/driver.h"
36
#include "../../../../src/tree/gpu_hist/expand_entry.cuh"
@@ -17,7 +20,7 @@ TEST(GpuHist, DriverDepthWise) {
1720
split.right_sum = {0, 1};
1821
GPUExpandEntry root(0, 0, split, 2.0f, 1.0f, 1.0f);
1922
driver.Push({root});
20-
EXPECT_EQ(driver.Pop().front().nid, 0);
23+
EXPECT_EQ(driver.Pop().front().nidx, 0);
2124
driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}});
2225
driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}});
2326
driver.Push({GPUExpandEntry{3, 1, split, 2.0f, 1.0f, 1.0f}});
@@ -55,24 +58,24 @@ TEST(GpuHist, DriverLossGuided) {
5558
EXPECT_TRUE(driver.Pop().empty());
5659
GPUExpandEntry root(0, 0, high_gain, 2.0f, 1.0f, 1.0f );
5760
driver.Push({root});
58-
EXPECT_EQ(driver.Pop().front().nid, 0);
61+
EXPECT_EQ(driver.Pop().front().nidx, 0);
5962
// Select high gain first
6063
driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
6164
driver.Push({GPUExpandEntry{2, 2, high_gain, 2.0f, 1.0f, 1.0f}});
6265
auto res = driver.Pop();
6366
EXPECT_EQ(res.size(), 1);
64-
EXPECT_EQ(res[0].nid, 2);
67+
EXPECT_EQ(res[0].nidx, 2);
6568
res = driver.Pop();
6669
EXPECT_EQ(res.size(), 1);
67-
EXPECT_EQ(res[0].nid, 1);
70+
EXPECT_EQ(res[0].nidx, 1);
6871

6972
// If equal gain, use nid
7073
driver.Push({GPUExpandEntry{2, 1, low_gain, 2.0f, 1.0f, 1.0f}});
7174
driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
7275
res = driver.Pop();
73-
EXPECT_EQ(res[0].nid, 1);
76+
EXPECT_EQ(res[0].nidx, 1);
7477
res = driver.Pop();
75-
EXPECT_EQ(res[0].nid, 2);
78+
EXPECT_EQ(res[0].nidx, 2);
7679
}
7780
} // namespace tree
7881
} // namespace xgboost

0 commit comments

Comments
 (0)