Commit 4bda887

[mt] Implement histogram subtraction. (#11825)
1 parent e460617

File tree: 8 files changed (+149, -95 lines)

8 files changed

+149
-95
lines changed

src/data/batch_utils.cc

Lines changed: 9 additions & 0 deletions

@@ -25,6 +25,15 @@ void CheckParam(BatchParam const& init, BatchParam const& param) {
       << "Only the `hist` tree method can use the `QuantileDMatrix`.";
 }
 
+/**
+ * @brief Check whether we should configure `min_cache_page_bytes`.
+ *
+ * Defined by @ref AutoCachePageBytes .
+ */
+[[nodiscard]] bool CachePageBytesIsAuto(std::int64_t min_cache_page_bytes) {
+  return min_cache_page_bytes == cuda_impl::AutoCachePageBytes();
+}
+
 [[nodiscard]] std::pair<double, std::int64_t> DftPageSizeHostRatio(
     std::size_t n_cache_bytes, bool is_validation, double cache_host_ratio,
     std::int64_t min_cache_page_bytes) {
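The predicate now compares against cuda_impl::AutoCachePageBytes() instead of the hardcoded -1 that the removed header version used (see the next file), so the sentinel is defined in exactly one place. A minimal self-contained sketch of the pattern, taking the -1 value from the old header code; ResolveCachePageBytes is a hypothetical caller added for illustration:

#include <cstdint>

namespace cuda_impl {
// Sentinel meaning "pick the cache page size automatically". The -1 mirrors
// the literal the removed header version compared against.
constexpr std::int64_t AutoCachePageBytes() { return -1; }
}  // namespace cuda_impl

[[nodiscard]] bool CachePageBytesIsAuto(std::int64_t min_cache_page_bytes) {
  return min_cache_page_bytes == cuda_impl::AutoCachePageBytes();
}

// Hypothetical caller: substitute a derived default only when the user left
// the knob on "auto".
std::int64_t ResolveCachePageBytes(std::int64_t requested, std::int64_t derived_dft) {
  return CachePageBytesIsAuto(requested) ? derived_dft : requested;
}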

src/data/batch_utils.h

Lines changed: 0 additions & 8 deletions

@@ -56,14 +56,6 @@ void CheckParam(BatchParam const& init, BatchParam const& param);
 [[nodiscard]] inline bool HostRatioIsAuto(float cache_host_ratio) {
   return std::isnan(cache_host_ratio);
 }
-/**
- * @brief Check whether we should configure `min_cache_page_bytes`.
- *
- * Defined by @ref AutoCachePageBytes .
- */
-[[nodiscard]] inline bool CachePageBytesIsAuto(std::int64_t min_cache_page_bytes) {
-  return min_cache_page_bytes == -1;
-}
 }  // namespace xgboost::data::detail
 
 namespace xgboost::cuda_impl {

src/tree/gpu_hist/expand_entry.cuh

Lines changed: 9 additions & 0 deletions

@@ -136,6 +136,10 @@ struct MultiExpandEntry {
   common::Span<float const> left_weight;
   common::Span<float const> right_weight;
 
+  // Sum Hessian of the first target. Used as a surrogate for node size.
+  double left_fst_hess{0};
+  double right_fst_hess{0};
+
   MultiExpandEntry() = default;
 
   [[nodiscard]] float GetLossChange() const { return split.loss_chg; }
@@ -165,6 +169,11 @@
     return true;
   }
 
+  __device__ void UpdateFirstHessian(GradientPairPrecise const& lg, GradientPairPrecise const& rg) {
+    this->left_fst_hess = lg.GetHess();
+    this->right_fst_hess = rg.GetHess();
+  }
+
   friend std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& entry);
 };
 }  // namespace cuda_impl
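For many objectives the Hessian is close to one per instance, so the first target's Hessian sum is a cheap stand-in for the number of rows in a child; that is what makes it usable as a node-size surrogate when deciding which child's histogram to build. A small illustrative sketch of a consumer of the new fields (BuildPlan and PickBuildChild are hypothetical; only the left_fst_hess/right_fst_hess members come from the diff):

// Hypothetical consumer: build the histogram of the smaller child directly
// and derive the sibling's histogram by subtraction from the parent.
struct BuildPlan {
  bool build_left;  // true: build the left child, subtract to get the right
};

template <typename Entry>  // e.g. MultiExpandEntry
BuildPlan PickBuildChild(Entry const& e) {
  // Smaller first-target Hessian ~ fewer instances ~ cheaper direct build.
  return BuildPlan{e.left_fst_hess <= e.right_fst_hess};
}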

src/tree/gpu_hist/leaf_sum.cu

Lines changed: 2 additions & 1 deletion

@@ -60,7 +60,8 @@ void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves,
     auto g = grad(sorted_ridx[j], t);
     return roundings[t].ToFixedPoint(g);
   });
-  // Use an output iterator to implement running sum.
+  // Use an output iterator to implement the running sum. Old Thrust versions either
+  // don't have this iterator, or it's unusable with a segmented sum.
 #if THRUST_MAJOR_VERSION >= 3
   auto out_it = thrust::make_tabulate_output_iterator(
       [=] XGBOOST_DEVICE(std::int32_t idx, GradientPairInt64 v) mutable { out_t(idx) += v; });
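Writing through a tabulate output iterator invokes the supplied callable with (position, assigned value), which is what turns the reduction's overwrite into an in-place accumulation here. A runnable sketch of the same trick with plain ints, assuming a Thrust recent enough to ship the iterator (hence the THRUST_MAJOR_VERSION guard above) and nvcc with extended lambdas:

#include <thrust/device_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/tabulate_output_iterator.h>
#include <thrust/reduce.h>
#include <vector>

int main() {
  std::vector<int> h_keys{0, 0, 1, 1, 1};  // two segments
  std::vector<int> h_vals{1, 2, 3, 4, 5};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
  thrust::device_vector<int> vals(h_vals.begin(), h_vals.end());
  thrust::device_vector<int> out(2, 100);  // pre-existing running sums

  int* d_out = thrust::raw_pointer_cast(out.data());
  // Assigning through out_it calls the lambda with (segment index, reduced
  // value), so each segment sum is added onto what `out` already holds.
  auto out_it = thrust::make_tabulate_output_iterator(
      [=] __device__(int idx, int v) { d_out[idx] += v; });

  thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                        thrust::make_discard_iterator(), out_it);
  // out is now {103, 112}.
  return 0;
}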

src/tree/gpu_hist/multi_evaluate_splits.cu

Lines changed: 16 additions & 11 deletions

@@ -202,10 +202,6 @@ __global__ __launch_bounds__(kBlockThreads) void EvaluateSplitsKernel(
   AgentT agent{&temp_storage, fidx};
 
   auto n_targets = shared.Targets();
-  // The number of bins in a feature
-  auto f_hist_size =
-      (shared.feature_segments[fidx + 1] - shared.feature_segments[fidx]) * n_targets;
-
   auto candidate_idx = nidx * shared.Features() + fidx;
 
   if (shared.one_pass != MultiEvaluateSplitSharedInputs::kBackward) {
@@ -256,11 +252,12 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
                GradientPairInt64{});
 
   // Create spans for each node's scan results
-  dh::device_vector<common::Span<GradientPairInt64>> scans(n_nodes);
+  std::vector<common::Span<GradientPairInt64>> h_scans(n_nodes);
   for (std::size_t nidx_in_set = 0; nidx_in_set < n_nodes; ++nidx_in_set) {
-    scans[nidx_in_set] = dh::ToSpan(this->scan_buffer_)
-                             .subspan(nidx_in_set * node_hist_size * 2, node_hist_size * 2);
+    h_scans[nidx_in_set] = dh::ToSpan(this->scan_buffer_)
+                               .subspan(nidx_in_set * node_hist_size * 2, node_hist_size * 2);
   }
+  dh::device_vector<common::Span<GradientPairInt64>> scans(h_scans);
 
   // Launch histogram scan kernel
   dim3 grid{n_nodes, n_features, n_targets};
@@ -328,32 +325,40 @@
     s_parent_gains[nidx_in_set] = parent_gain;
 
     bool l = true, r = true;
+    GradientPairPrecise lg_fst, rg_fst;
     for (bst_target_t t = 0; t < n_targets; ++t) {
      auto quantizer = d_roundings[t];
      auto sibling_sum = input.parent_sum[t] - node_sum[t];
 
      l = l && (node_sum[t].GetQuantisedHess() == 0);
      r = r && (sibling_sum.GetQuantisedHess() == 0);
 
+      GradientPairPrecise lg, rg;
      if (best_split.dir == kRightDir) {
        // forward pass, node_sum is the left sum
-        auto lg = quantizer.ToFloatingPoint(node_sum[t]);
+        lg = quantizer.ToFloatingPoint(node_sum[t]);
        left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess());
-        auto rg = quantizer.ToFloatingPoint(sibling_sum);
+        rg = quantizer.ToFloatingPoint(sibling_sum);
        right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess());
      } else {
        // backward pass, node_sum is the right sum
-        auto rg = quantizer.ToFloatingPoint(node_sum[t]);
+        rg = quantizer.ToFloatingPoint(node_sum[t]);
        right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess());
-        auto lg = quantizer.ToFloatingPoint(sibling_sum);
+        lg = quantizer.ToFloatingPoint(sibling_sum);
        left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess());
      }
+
+      if (t == 0) {
+        lg_fst = lg;
+        rg_fst = rg;
+      }
    }
 
    // Set up the output entry
    out_splits[nidx_in_set] = {input.nidx, input.depth, best_split,
                               base_weight, left_weight, right_weight};
    out_splits[nidx_in_set].split.loss_chg -= parent_gain;
+    out_splits[nidx_in_set].UpdateFirstHessian(lg_fst, rg_fst);
 
    if (l || r) {
      out_splits[nidx_in_set] = {};
src/tree/gpu_hist/multi_evaluate_splits.cuh

Lines changed: 3 additions & 1 deletion

@@ -12,9 +12,11 @@
 namespace xgboost::tree::cuda_impl {
 /** @brief Evaluator for vector leaf. */
 class MultiHistEvaluator {
+  // Buffer for node weights
   dh::device_vector<float> weights_;
-
+  // Buffer for histogram scans.
   dh::device_vector<GradientPairInt64> scan_buffer_;
+  // Buffer for node gradient sums.
   dh::device_vector<GradientPairInt64> node_sums_;
 
  public:

src/tree/updater_gpu_hist.cu

Lines changed: 8 additions & 28 deletions

@@ -76,33 +76,6 @@ struct NodeSplitData {
 };
 static_assert(std::is_trivially_copyable_v<NodeSplitData>);
 
-// Some nodes we will manually compute histograms for, others we will do by subtraction.
-void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
-                 std::vector<GPUExpandEntry> const& candidates,
-                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
-  auto const& tree = p_tree->HostScView();
-  std::size_t nidx_in_set{0};
-  auto p_build_nidx = nodes_to_build.data();
-  auto p_sub_nidx = nodes_to_sub.data();
-  for (auto& e : candidates) {
-    // Decide whether to build the left histogram or the right histogram. Use the sum of
-    // Hessians as a heuristic to select the node with the fewest training instances. This
-    // optimization is for distributed training, to avoid an allreduce call for
-    // synchronizing the number of instances for each node.
-    auto left_sum = quantizer->ToFloatingPoint(e.split.left_sum);
-    auto right_sum = quantizer->ToFloatingPoint(e.split.right_sum);
-    bool fewer_right = right_sum.GetHess() < left_sum.GetHess();
-    if (fewer_right) {
-      p_build_nidx[nidx_in_set] = tree.RightChild(e.nidx);
-      p_sub_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
-    } else {
-      p_build_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
-      p_sub_nidx[nidx_in_set] = tree.RightChild(e.nidx);
-    }
-    ++nidx_in_set;
-  }
-}
-
 // GPU tree updater implementation.
 struct GPUHistMakerDevice {
  private:
@@ -501,9 +474,16 @@
     auto nodes = this->CreatePartitionNodes(p_tree, is_single_block ? candidates : expand_set);
 
     // Prepare for build hist
+    auto const& tree = p_tree->HostScView();
     std::vector<bst_node_t> build_nidx(candidates.size());
     std::vector<bst_node_t> subtraction_nidx(candidates.size());
-    AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx);
+    cuda_impl::AssignNodes(tree, candidates, build_nidx, subtraction_nidx,
+                           [&](GPUExpandEntry const& e) {
+                             auto left_sum = this->quantiser->ToFloatingPoint(e.split.left_sum);
+                             auto right_sum = this->quantiser->ToFloatingPoint(e.split.right_sum);
+                             bool fewer_right = right_sum.GetHess() < left_sum.GetHess();
+                             return fewer_right;
+                           });
     auto prefetch_copy = !build_nidx.empty() && this->NeedCopy(p_fmat, candidates);
 
     this->histogram_.AllocateHistograms(ctx_, build_nidx, subtraction_nidx);
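AssignNodes is now shared from cuda_impl and parameterized by a predicate, so the single- and multi-target updaters can reuse the build-versus-subtract bookkeeping while supplying their own notion of which child is smaller (the multi-target path can use the first-target Hessians recorded above). Only the call site is visible in this diff, so the following is a hypothetical sketch of such a helper, not the committed definition:

#include <cstddef>
#include <vector>

// Sketch of a predicate-driven AssignNodes. `TreeView` and `Entry` stand in
// for the tree accessor and expand-entry types; `fewer_right` returns true
// when the right child has fewer (weighted) instances.
template <typename TreeView, typename Entry, typename FewerRight>
void AssignNodes(TreeView const& tree, std::vector<Entry> const& candidates,
                 std::vector<int>& nodes_to_build, std::vector<int>& nodes_to_sub,
                 FewerRight&& fewer_right) {
  for (std::size_t i = 0; i < candidates.size(); ++i) {
    auto const& e = candidates[i];
    // Build the smaller child's histogram directly; its sibling follows from
    // the parent by bin-wise histogram subtraction:
    //   hist[sibling][bin] = hist[parent][bin] - hist[built][bin]
    if (fewer_right(e)) {
      nodes_to_build[i] = tree.RightChild(e.nidx);
      nodes_to_sub[i] = tree.LeftChild(e.nidx);
    } else {
      nodes_to_build[i] = tree.LeftChild(e.nidx);
      nodes_to_sub[i] = tree.RightChild(e.nidx);
    }
  }
}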
