Small cleanup for GPU split evaluation. (#11778)

trivialfis · web-flow · commit a22acdc482ef · 2025-10-28T08:50:47.000+08:00
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
@@ -118,7 +118,9 @@ function(xgboost_set_cuda_flags target)
 
   if(USE_NVTX)
     target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1)
-    target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
+    if(NOT USE_DEVICE_DEBUG)
+      target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>)
+    endif()
   endif()
 
   # Use CCCL we find before CUDA Toolkit to make sure we get newer headers as intended
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -18,7 +18,7 @@ set_source_files_properties(
   PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON)
 
 if(USE_CUDA)
-  file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh)
+  file(GLOB_RECURSE CUDA_SOURCES *.cu)
   target_sources(objxgboost PRIVATE ${CUDA_SOURCES})
 endif()
 
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
@@ -30,7 +30,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan,
       quantiser.ToFloatingPoint(parent_sum - scan));
 
   missing_left_out = missing_left_gain > missing_right_gain;
-  return missing_left_out?missing_left_gain:missing_right_gain;
+  return missing_left_out ? missing_left_gain : missing_right_gain;
 }
 
 // This kernel uses block_size == warp_size. This is an unusually small block size for a cuda kernel
@@ -92,8 +92,7 @@ class EvaluateSplitAgent {
   }
   __device__ GradientPairInt64 ReduceFeature() {
     GradientPairInt64 local_sum;
-    for (int idx = gidx_begin + threadIdx.x; idx < gidx_end;
-         idx += kBlockSize) {
+    for (int idx = gidx_begin + threadIdx.x; idx < gidx_end; idx += kBlockSize) {
       local_sum += LoadGpair(node_histogram + idx);
     }
     local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);  // NOLINT
@@ -103,16 +102,16 @@ class EvaluateSplitAgent {
   }
 
   // Load using efficient 128 vector load instruction
-  __device__ __forceinline__ GradientPairInt64 LoadGpair(const GradientPairInt64 *ptr) {
+  __device__ __forceinline__ static GradientPairInt64 LoadGpair(const GradientPairInt64 *ptr) {
     float4 tmp = *reinterpret_cast<const float4 *>(ptr);
     auto gpair = *reinterpret_cast<const GradientPairInt64 *>(&tmp);
     static_assert(sizeof(decltype(gpair)) == sizeof(float4),
                   "Vector type size does not match gradient pair size.");
     return gpair;
   }
 
-  __device__ __forceinline__ void Numerical(DeviceSplitCandidate * best_split) {
-    for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
+  __device__ __forceinline__ void Numerical(DeviceSplitCandidate *best_split) {
+    for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
       bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
       GradientPairInt64 bin = thread_active ? LoadGpair(node_histogram + scan_begin + threadIdx.x)
                                             : GradientPairInt64();
@@ -255,20 +254,18 @@ class EvaluateSplitAgent {
   }
 };
 
-template <int kBlockSize>
-__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
-    bst_feature_t max_active_features,
-    common::Span<const EvaluateSplitInputs> d_inputs,
-    const EvaluateSplitSharedInputs shared_inputs,
-    common::Span<bst_feature_t> sorted_idx,
+template <int kBlockThreads>
+__global__ __launch_bounds__(kBlockThreads) void EvaluateSplitsKernel(
+    bst_feature_t max_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
+    const EvaluateSplitSharedInputs shared_inputs, common::Span<bst_feature_t> sorted_idx,
     const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
     common::Span<DeviceSplitCandidate> out_candidates) {
   // Aligned && shared storage for best_split
   __shared__ cub::Uninitialized<DeviceSplitCandidate> uninitialized_split;
   DeviceSplitCandidate &best_split = uninitialized_split.Alias();
 
   if (threadIdx.x == 0) {
-    best_split = DeviceSplitCandidate();
+    best_split = DeviceSplitCandidate{};
   }
 
   __syncthreads();
@@ -284,7 +281,7 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
   }
   int fidx = inputs.feature_set[feature_offset];
 
-  using AgentT = EvaluateSplitAgent<kBlockSize>;
+  using AgentT = EvaluateSplitAgent<kBlockThreads>;
   __shared__ typename AgentT::TempStorage temp_storage;
   AgentT agent(&temp_storage, fidx, inputs, shared_inputs, evaluator);
 
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -21,8 +21,8 @@ namespace tree {
 
 // Inputs specific to each node
 struct EvaluateSplitInputs {
-  int nidx;
-  int depth;
+  bst_node_t nidx;
+  bst_node_t depth;
   GradientPairInt64 parent_sum;
   common::Span<const bst_feature_t> feature_set;
   common::Span<const GradientPairInt64> gradient_histogram;
@@ -168,10 +168,10 @@ class GPUHistEvaluator {
   void ApplyTreeSplit(GPUExpandEntry const &candidate, RegTree *p_tree) {
     auto &tree = *p_tree;
     // Set up child constraints
-    auto left_child = tree[candidate.nid].LeftChild();
-    auto right_child = tree[candidate.nid].RightChild();
-    tree_evaluator_.AddSplit(candidate.nid, left_child, right_child,
-                             tree[candidate.nid].SplitIndex(), candidate.left_weight,
+    auto left_child = tree[candidate.nidx].LeftChild();
+    auto right_child = tree[candidate.nidx].RightChild();
+    tree_evaluator_.AddSplit(candidate.nidx, left_child, right_child,
+                             tree[candidate.nidx].SplitIndex(), candidate.left_weight,
                              candidate.right_weight);
   }
 
diff --git a/src/tree/gpu_hist/expand_entry.cuh b/src/tree/gpu_hist/expand_entry.cuh
@@ -13,7 +13,7 @@
 
 namespace xgboost::tree {
 struct GPUExpandEntry {
-  bst_node_t nid;
+  bst_node_t nidx;
   bst_node_t depth;
   DeviceSplitCandidate split;
 
@@ -24,7 +24,7 @@ struct GPUExpandEntry {
   GPUExpandEntry() = default;
   XGBOOST_DEVICE GPUExpandEntry(bst_node_t nid, bst_node_t depth, DeviceSplitCandidate split,
                                 float base, float left, float right)
-      : nid(nid),
+      : nidx(nid),
         depth(depth),
         split(std::move(split)),
         base_weight{base},
@@ -49,13 +49,13 @@ struct GPUExpandEntry {
 
   [[nodiscard]] float GetLossChange() const { return split.loss_chg; }
 
-  [[nodiscard]] bst_node_t GetNodeId() const { return nid; }
+  [[nodiscard]] bst_node_t GetNodeId() const { return nidx; }
 
   [[nodiscard]] bst_node_t GetDepth() const { return depth; }
 
   friend std::ostream& operator<<(std::ostream& os, const GPUExpandEntry& e) {
     os << "GPUExpandEntry: \n";
-    os << "nidx: " << e.nid << "\n";
+    os << "nidx: " << e.nidx << "\n";
     os << "depth: " << e.depth << "\n";
     os << "loss: " << e.split.loss_chg << "\n";
     os << "left_sum: " << e.split.left_sum << "\n";
@@ -66,7 +66,7 @@ struct GPUExpandEntry {
   void Save(Json* p_out) const {
     auto& out = *p_out;
 
-    out["nid"] = Integer{this->nid};
+    out["nid"] = Integer{this->nidx};
     out["depth"] = Integer{this->depth};
     // GPU specific
     out["base_weight"] = this->base_weight;
@@ -99,7 +99,7 @@ struct GPUExpandEntry {
   }
 
   void Load(Json const& in) {
-    this->nid = get<Integer const>(in["nid"]);
+    this->nidx = get<Integer const>(in["nid"]);
     this->depth = get<Integer const>(in["depth"]);
     // GPU specific
     this->base_weight = get<Number const>(in["base_weight"]);
diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh
@@ -196,7 +196,7 @@ class DeviceHistogramBuilder {
     for (std::size_t i = 0; i < subtraction_nidx.size(); i++) {
       auto build_hist_nidx = build_nidx.at(i);
       auto subtraction_trick_nidx = subtraction_nidx.at(i);
-      auto parent_nidx = candidates.at(i).nid;
+      auto parent_nidx = candidates.at(i).nidx;
 
       if (!this->SubtractionTrick(ctx, parent_nidx, build_hist_nidx, subtraction_trick_nidx)) {
         need_build.push_back(subtraction_trick_nidx);
diff --git a/src/tree/param.h b/src/tree/param.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2014-2023 by XGBoost Contributors
+ * Copyright 2014-2025, XGBoost Contributors
  * \file param.h
  * \brief training parameters, statistics used to support tree construction.
  * \author Tianqi Chen
@@ -242,8 +242,8 @@ XGBOOST_DEVICE inline T CalcGainGivenWeight(const TrainingParams &p, T sum_grad,
 
 // calculate weight given the statistics
 template <typename TrainingParams, typename T>
-XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
-                                   T sum_hess) {
+XGBOOST_DEVICE std::enable_if_t<std::is_floating_point_v<T>, T> CalcWeight(TrainingParams const &p,
+                                                                           T sum_grad, T sum_hess) {
   if (sum_hess < p.min_child_weight || sum_hess <= 0.0) {
     return 0.0;
   }
@@ -291,17 +291,17 @@ XGBOOST_DEVICE inline float CalcWeight(const TrainingParams &p, GpairT sum_grad)
 }
 
 /**
- * \brief multi-target weight, calculated with learning rate.
+ * @brief Compute per-target weights from the gradient sums, scaled by the learning rate eta.
  */
 inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
                        float eta, linalg::VectorView<float> out_w) {
-  for (bst_target_t i = 0; i < out_w.Size(); ++i) {
-    out_w(i) = CalcWeight(p, grad_sum(i).GetGrad(), grad_sum(i).GetHess()) * eta;
+  for (bst_target_t t = 0, n_targets = out_w.Size(); t < n_targets; ++t) {
+    out_w(t) = CalcWeight(p, grad_sum(t).GetGrad(), grad_sum(t).GetHess()) * eta;
   }
 }
 
 /**
- * \brief multi-target weight
+ * @brief multi-target weight
  */
 inline void CalcWeight(TrainParam const &p, linalg::VectorView<GradientPairPrecise const> grad_sum,
                        linalg::VectorView<float> out_w) {
@@ -312,8 +312,8 @@ inline double CalcGainGivenWeight(TrainParam const &p,
                                   linalg::VectorView<GradientPairPrecise const> sum_grad,
                                   linalg::VectorView<float const> weight) {
   double gain{0};
-  for (bst_target_t i = 0; i < weight.Size(); ++i) {
-    gain += -weight(i) * ThresholdL1(sum_grad(i).GetGrad(), p.reg_alpha);
+  for (bst_target_t t = 0, n_targets = weight.Size(); t < n_targets; ++t) {
+    gain += -weight(t) * ThresholdL1(sum_grad(t).GetGrad(), p.reg_alpha);
   }
   return gain;
 }
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
@@ -85,11 +85,11 @@ void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
     auto right_sum = quantizer->ToFloatingPoint(e.split.right_sum);
     bool fewer_right = right_sum.GetHess() < left_sum.GetHess();
     if (fewer_right) {
-      p_build_nidx[nidx_in_set] = tree.RightChild(e.nid);
-      p_sub_nidx[nidx_in_set] = tree.LeftChild(e.nid);
+      p_build_nidx[nidx_in_set] = tree.RightChild(e.nidx);
+      p_sub_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
     } else {
-      p_build_nidx[nidx_in_set] = tree.LeftChild(e.nid);
-      p_sub_nidx[nidx_in_set] = tree.RightChild(e.nid);
+      p_build_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
+      p_sub_nidx[nidx_in_set] = tree.RightChild(e.nidx);
     }
     ++nidx_in_set;
   }
@@ -132,13 +132,13 @@ struct GPUHistMakerDevice {
     auto tree = p_tree->HostScView();
     for (std::size_t i = 0, n = candidates.size(); i < n; i++) {
       auto const& e = candidates[i];
-      RegTree::Node split_node = tree.nodes[e.nid];
-      auto split_type = tree.SplitType(e.nid);
-      nodes.nidx.at(i) = e.nid;
-      nodes.left_nidx[i] = tree.LeftChild(e.nid);
-      nodes.right_nidx[i] = tree.RightChild(e.nid);
+      RegTree::Node split_node = tree.nodes[e.nidx];
+      auto split_type = tree.SplitType(e.nidx);
+      nodes.nidx.at(i) = e.nidx;
+      nodes.left_nidx[i] = tree.LeftChild(e.nidx);
+      nodes.right_nidx[i] = tree.RightChild(e.nidx);
       nodes.split_data[i] =
-          NodeSplitData{split_node, split_type, this->evaluator_.GetDeviceNodeCats(e.nid)};
+          NodeSplitData{split_node, split_type, this->evaluator_.GetDeviceNodeCats(e.nidx)};
 
       CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat);
     }
@@ -299,8 +299,8 @@ struct GPUHistMakerDevice {
     auto sc_tree = tree.HostScView();
     for (std::size_t i = 0; i < candidates.size(); i++) {
       auto candidate = candidates.at(i);
-      bst_node_t left_nidx = sc_tree.LeftChild(candidate.nid);
-      bst_node_t right_nidx = sc_tree.RightChild(candidate.nid);
+      bst_node_t left_nidx = sc_tree.LeftChild(candidate.nidx);
+      bst_node_t right_nidx = sc_tree.RightChild(candidate.nidx);
       nidx[i * 2] = left_nidx;
       nidx[i * 2 + 1] = right_nidx;
       auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx));
@@ -482,7 +482,7 @@ struct GPUHistMakerDevice {
     bst_idx_t n_samples = 0;
     for (auto const& c : candidates) {
       for (auto const& part : this->partitioners_) {
-        n_samples += part->GetRows(c.nid).size();
+        n_samples += part->GetRows(c.nidx).size();
       }
     }
     // avoid copy if the kernel is small.
@@ -688,7 +688,7 @@ struct GPUHistMakerDevice {
 
     // Sanity check - have we created a leaf with no training instances?
     if (!collective::IsDistributed() && partitioners_.size() == 1) {
-      CHECK(partitioners_.front()->GetRows(candidate.nid).size() > 0)
+      CHECK(partitioners_.front()->GetRows(candidate.nidx).size() > 0)
           << "No training instances in this leaf!";
     }
 
@@ -708,27 +708,27 @@ struct GPUHistMakerDevice {
       CHECK(common::CheckNAN(candidate.split.fvalue));
       std::vector<common::CatBitField::value_type> split_cats;
 
-      auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
+      auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nidx);
       auto n_bins_feature = cuts_->FeatureBins(candidate.split.findex);
       split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0);
       CHECK_LE(split_cats.size(), h_cats.size());
       std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());
 
       tree.ExpandCategorical(
-          candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir,
+          candidate.nidx, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir,
           base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_hess,
           left_hess, right_hess);
     } else {
       CHECK(!common::CheckNAN(candidate.split.fvalue));
-      tree.ExpandNode(candidate.nid, candidate.split.findex, candidate.split.fvalue,
+      tree.ExpandNode(candidate.nidx, candidate.split.findex, candidate.split.fvalue,
                       candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight,
                       candidate.split.loss_chg, parent_hess,
           left_hess, right_hess);
     }
     evaluator_.ApplyTreeSplit(candidate, p_tree);
 
-    const auto& parent = tree[candidate.nid];
-    interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(),
+    const auto& parent = tree[candidate.nidx];
+    interaction_constraints.Split(candidate.nidx, parent.SplitIndex(), parent.LeftChild(),
                                   parent.RightChild());
   }
 
@@ -742,7 +742,7 @@ struct GPUHistMakerDevice {
         [=] __device__(auto const& gpair) { return quantiser.ToFixedPoint(gpair); });
     GradientPairInt64 root_sum_quantised =
         dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + this->gpair.size(),
-                   GradientPairInt64{}, thrust::plus<GradientPairInt64>{});
+                   GradientPairInt64{}, cuda::std::plus<GradientPairInt64>{});
     using ReduceT = typename decltype(root_sum_quantised)::ValueT;
     auto rc = collective::GlobalSum(
         ctx_, p_fmat->Info(), linalg::MakeVec(reinterpret_cast<ReduceT*>(&root_sum_quantised), 2));
@@ -838,8 +838,6 @@ std::pair<std::shared_ptr<common::HistogramCuts const>, bool> InitBatchCuts(
 }
 
 class GPUHistMaker : public TreeUpdater {
-  using GradientSumT = GradientPairPrecise;
-
  public:
   explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) : TreeUpdater(ctx), task_{task} {};
   void Configure(const Args& args) override {
diff --git a/tests/cpp/tree/gpu_hist/test_driver.cu b/tests/cpp/tree/gpu_hist/test_driver.cu
@@ -1,3 +1,6 @@
+/**
+ * Copyright 2020-2025, XGBoost Contributors
+ */
 #include <gtest/gtest.h>
 #include "../../../../src/tree/driver.h"
 #include "../../../../src/tree/gpu_hist/expand_entry.cuh"
@@ -17,7 +20,7 @@ TEST(GpuHist, DriverDepthWise) {
   split.right_sum = {0, 1};
   GPUExpandEntry root(0, 0, split, 2.0f, 1.0f, 1.0f);
   driver.Push({root});
-  EXPECT_EQ(driver.Pop().front().nid, 0);
+  EXPECT_EQ(driver.Pop().front().nidx, 0);
   driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}});
   driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}});
   driver.Push({GPUExpandEntry{3, 1, split, 2.0f, 1.0f, 1.0f}});
@@ -55,24 +58,24 @@ TEST(GpuHist, DriverLossGuided) {
   EXPECT_TRUE(driver.Pop().empty());
   GPUExpandEntry root(0, 0, high_gain, 2.0f, 1.0f, 1.0f );
   driver.Push({root});
-  EXPECT_EQ(driver.Pop().front().nid, 0);
+  EXPECT_EQ(driver.Pop().front().nidx, 0);
   // Select high gain first
   driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
   driver.Push({GPUExpandEntry{2, 2, high_gain, 2.0f, 1.0f, 1.0f}});
   auto res = driver.Pop();
   EXPECT_EQ(res.size(), 1);
-  EXPECT_EQ(res[0].nid, 2);
+  EXPECT_EQ(res[0].nidx, 2);
   res = driver.Pop();
   EXPECT_EQ(res.size(), 1);
-  EXPECT_EQ(res[0].nid, 1);
+  EXPECT_EQ(res[0].nidx, 1);
 
-  // If equal gain, use nid
+  // If equal gain, the entry with the smaller nidx is popped first
   driver.Push({GPUExpandEntry{2, 1, low_gain, 2.0f, 1.0f, 1.0f}});
   driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}});
   res = driver.Pop();
-  EXPECT_EQ(res[0].nid, 1);
+  EXPECT_EQ(res[0].nidx, 1);
   res = driver.Pop();
-  EXPECT_EQ(res[0].nid, 2);
+  EXPECT_EQ(res[0].nidx, 2);
 }
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu