dmlc · trivialfis · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/python-package/xgboost/testing/multi_target.py b/python-package/xgboost/testing/multi_target.py
@@ -14,7 +14,7 @@
 
 from .._typing import ArrayLike
 from ..compat import import_cupy
-from ..core import Booster, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
+from ..core import Booster, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix, build_info
 from ..objective import Objective, TreeObjective
 from ..sklearn import XGBClassifier
 from ..training import train
@@ -172,6 +172,15 @@ def run_with_iter(device: Device) -> None:  # pylint: disable=too-many-locals
     n_rounds = 8
     n_targets = 3
     intercept = [0.5] * n_targets
+
+    params = {
+        "device": device,
+        "multi_strategy": "multi_output_tree",
+        "learning_rate": 1.0,
+        "base_score": intercept,
+        "debug_synchronize": True,
+    }
+
     Xs = []
     ys = []
     for i in range(n_batches):
@@ -185,12 +194,7 @@ def run_with_iter(device: Device) -> None:  # pylint: disable=too-many-locals
 
     evals_result_0: Dict[str, Dict] = {}
     booster_0 = train(
-        {
-            "device": device,
-            "multi_strategy": "multi_output_tree",
-            "learning_rate": 1.0,
-            "base_score": intercept,
-        },
+        params,
         Xy,
         num_boost_round=n_rounds,
         evals=[(Xy, "Train")],
@@ -201,12 +205,7 @@ def run_with_iter(device: Device) -> None:  # pylint: disable=too-many-locals
     Xy = QuantileDMatrix(it)
     evals_result_1: Dict[str, Dict] = {}
     booster_1 = train(
-        {
-            "device": device,
-            "multi_strategy": "multi_output_tree",
-            "learning_rate": 1.0,
-            "base_score": intercept,
-        },
+        params,
         Xy,
         num_boost_round=n_rounds,
         evals=[(Xy, "Train")],
@@ -219,18 +218,23 @@ def run_with_iter(device: Device) -> None:  # pylint: disable=too-many-locals
     X, _, _ = it.as_arrays()
     assert_allclose(device, booster_0.inplace_predict(X), booster_1.inplace_predict(X))
 
-    it = IteratorForTest(Xs, ys, None, cache="cache", on_host=True)
+    v = build_info()["THRUST_VERSION"]
+    if v[0] < 3:
+        pytest.xfail("CCCL version too old.")
+
+    it = IteratorForTest(
+        Xs,
+        ys,
+        None,
+        cache="cache",
+        on_host=True,
+        min_cache_page_bytes=X.shape[0] // n_batches * X.shape[1],
+    )
     Xy = ExtMemQuantileDMatrix(it, cache_host_ratio=1.0)
 
     evals_result_2: Dict[str, Dict] = {}
     booster_2 = train(
-        {
-            "device": device,
-            "multi_strategy": "multi_output_tree",
-            "learning_rate": 1.0,
-            "base_score": intercept,
-            "debug_synchronize": True,
-        },
+        params,
         Xy,
         evals=[(Xy, "Train")],
         obj=LsObj0(),

diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh
@@ -411,7 +411,7 @@ template <typename T>
 using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>;  // NOLINT
 
 /**
- * @brief Container class that doesn't initialize the data when RMM is used.
+ * @brief Container class that doesn't initialize the data.
  */
 template <typename T, bool is_caching>
 class DeviceUVectorImpl {

diff --git a/src/tree/gpu_hist/expand_entry.cu b/src/tree/gpu_hist/expand_entry.cu
@@ -16,8 +16,8 @@ std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) {
      << "depth: " << e.depth << "\n"
      << "loss: " << e.split.loss_chg << "\n";
 
-  std::vector<GradientPairInt64> h_node_sum(e.split.node_sum.size());
-  dh::CopyDeviceSpanToVector(&h_node_sum, e.split.node_sum);
+  std::vector<GradientPairInt64> h_node_sum(e.split.child_sum.size());
+  dh::CopyDeviceSpanToVector(&h_node_sum, e.split.child_sum);
 
   auto print_span = [&](auto const& span) {
     using T = typename common::GetValueT<decltype(span)>::value_type;
@@ -38,7 +38,7 @@ std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) {
   } else {
     os << "right_sum: ";
   }
-  print_span(e.split.node_sum);
+  print_span(e.split.child_sum);
 
   os << "base_weight: ";
   print_span(e.base_weight);

diff --git a/src/tree/gpu_hist/leaf_sum.cu b/src/tree/gpu_hist/leaf_sum.cu
@@ -1,11 +1,15 @@
 /**
  * Copyright 2025, XGBoost contributors
  */
-#include <cstddef>  // for size_t
-#include <vector>   // for vector
+#include <thrust/scan.h>     // for inclusive_scan
+#include <thrust/version.h>  // for THRUST_MAJOR_VERSION
 
-#include "../../common/linalg_op.cuh"  // for tbegin
-#include "../updater_gpu_common.cuh"   // for GPUTrainingParam
+#include <cstddef>                                 // for size_t
+#include <cstdint>                                 // for int32_t
+#include <cub/device/device_segmented_reduce.cuh>  // for DeviceSegmentedReduce
+#include <vector>                                  // for vector
+
+#include "../updater_gpu_common.cuh"  // for GPUTrainingParam
 #include "leaf_sum.cuh"
 #include "quantiser.cuh"        // for GradientQuantiser
 #include "row_partitioner.cuh"  // for RowIndexT, LeafInfo
@@ -14,6 +18,12 @@
 #include "xgboost/linalg.h"     // for MatrixView
 #include "xgboost/span.h"       // for Span
 
+#if THRUST_MAJOR_VERSION >= 3
+#include <thrust/iterator/tabulate_output_iterator.h>  // for make_tabulate_output_iterator
+#else
+#include "../../common/linalg_op.cuh"  // for tbegin
+#endif
+
 namespace xgboost::tree::cuda_impl {
 void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves,
                  common::Span<GradientQuantiser const> roundings,
@@ -50,14 +60,22 @@ void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves,
       auto g = grad(sorted_ridx[j], t);
       return roundings[t].ToFixedPoint(g);
     });
+    // Use an output iterator to implement running sum.
+#if THRUST_MAJOR_VERSION >= 3
+    auto out_it = thrust::make_tabulate_output_iterator(
+        [=] XGBOOST_DEVICE(std::int32_t idx, GradientPairInt64 v) mutable { out_t(idx) += v; });
+#else
+    auto out_it = linalg::tbegin(out_t);
+#endif
+
     std::size_t n_bytes = 0;
-    dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, it, linalg::tbegin(out_t),
-                                                  h_leaves.size(), indptr.data(), indptr.data() + 1,
+    dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, it, out_it, h_leaves.size(),
+                                                  indptr.data(), indptr.data() + 1,
                                                   ctx->CUDACtx()->Stream()));
     dh::TemporaryArray<char> alloc(n_bytes);
-    dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(
-        alloc.data().get(), n_bytes, it, linalg::tbegin(out_t), h_leaves.size(), indptr.data(),
-        indptr.data() + 1, ctx->CUDACtx()->Stream()));
+    dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(alloc.data().get(), n_bytes, it, out_it,
+                                                  h_leaves.size(), indptr.data(), indptr.data() + 1,
+                                                  ctx->CUDACtx()->Stream()));
   }
 }
 
@@ -66,7 +84,6 @@ void LeafWeight(Context const* ctx, GPUTrainingParam const& param,
                 linalg::MatrixView<GradientPairInt64 const> grad_sum,
                 linalg::MatrixView<float> out_weights) {
   CHECK(grad_sum.Contiguous());
-  auto s_grad_sum = grad_sum.Values();
   dh::LaunchN(grad_sum.Size(), ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t i) mutable {
     auto [nidx_in_set, t] = linalg::UnravelIndex(i, grad_sum.Shape());
     auto g = roundings[t].ToFloatingPoint(grad_sum(nidx_in_set, t));

diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu
@@ -280,10 +280,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
   auto d_weights = dh::ToSpan(this->weights_);
 
   dh::CachingDeviceUVector<float> d_parent_gains(n_nodes);
-  dh::CachingDeviceUVector<std::int32_t> sum_zeros(n_nodes * 2);
-
   auto s_parent_gains = dh::ToSpan(d_parent_gains);
-  auto s_sum_zeros = dh::ToSpan(sum_zeros);
   auto s_d_splits = dh::ToSpan(d_splits);
 
   // Process results for each node
@@ -304,7 +301,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
   dh::LaunchN(n_nodes, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t nidx_in_set) {
     auto input = d_inputs[nidx_in_set];
     MultiSplitCandidate best_split = d_best_splits[nidx_in_set];
-    if (best_split.node_sum.empty()) {
+    if (best_split.child_sum.empty()) {
       // Invalid split
       out_splits[nidx_in_set] = {};
       return;
@@ -316,7 +313,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
     auto right_weight = d_weights.subspan(nidx_in_set * n_targets * 3 + n_targets * 2, n_targets);
 
     auto d_roundings = shared_inputs.roundings;
-    auto node_sum = best_split.node_sum;
+    auto node_sum = best_split.child_sum;
 
     float parent_gain = 0;
     for (bst_target_t t = 0; t < n_targets; ++t) {
@@ -353,9 +350,6 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
       }
     }
 
-    s_sum_zeros[nidx_in_set * 2] = l;
-    s_sum_zeros[nidx_in_set * 2 + 1] = r;
-
     // Set up the output entry
     out_splits[nidx_in_set] = {input.nidx,  input.depth, best_split,
                                base_weight, left_weight, right_weight};
@@ -384,7 +378,7 @@ void MultiHistEvaluator::ApplyTreeSplit(Context const *ctx, RegTree const *p_tre
   // TODO(jiamingy): We need to batch the nodes
   auto best_split = candidate.split;
 
-  auto node_sum = best_split.node_sum;
+  auto node_sum = best_split.child_sum;
   dh::LaunchN(n_targets, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t t) {
     auto sibling_sum = parent_sum[t] - node_sum[t];
     if (best_split.dir == kRightDir) {

diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2017-2024, XGBoost contributors
+ * Copyright 2017-2025, XGBoost contributors
  */
 #include <thrust/sequence.h>  // for sequence
 
@@ -13,7 +13,6 @@ namespace xgboost::tree {
 void RowPartitioner::Reset(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid) {
   ridx_segments_.clear();
   ridx_.resize(n_samples);
-  ridx_tmp_.resize(n_samples);
   tmp_.clear();
   n_nodes_ = 1;  // Root
 

diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
@@ -174,21 +174,24 @@ void SortPositionBatch(Context const* ctx, common::Span<const PerNodeData<OpData
     auto ret =
         cub::DispatchScan<decltype(input_iterator), decltype(discard_write_iterator), IndexFlagOp,
                           cub::NullType, std::uint64_t>::Dispatch(nullptr, n_bytes, input_iterator,
-                                                                 discard_write_iterator,
-                                                                 IndexFlagOp{}, cub::NullType{},
-                                                                 static_cast<std::uint64_t>(total_rows),
-                                                                 ctx->CUDACtx()->Stream());
+                                                                  discard_write_iterator,
+                                                                  IndexFlagOp{}, cub::NullType{},
+                                                                  static_cast<std::uint64_t>(
+                                                                      total_rows),
+                                                                  ctx->CUDACtx()->Stream());
     dh::safe_cuda(ret);
     tmp->resize(n_bytes);
   }
   n_bytes = tmp->size();
   auto ret =
       cub::DispatchScan<decltype(input_iterator), decltype(discard_write_iterator), IndexFlagOp,
-                        cub::NullType, std::uint64_t>::Dispatch(tmp->data(), n_bytes, input_iterator,
-                                                               discard_write_iterator,
-                                                               IndexFlagOp{}, cub::NullType{},
-                                                               static_cast<std::uint64_t>(total_rows),
-                                                               ctx->CUDACtx()->Stream());
+                        cub::NullType, std::uint64_t>::Dispatch(tmp->data(), n_bytes,
+                                                                input_iterator,
+                                                                discard_write_iterator,
+                                                                IndexFlagOp{}, cub::NullType{},
+                                                                static_cast<std::uint64_t>(
+                                                                    total_rows),
+                                                                ctx->CUDACtx()->Stream());
   dh::safe_cuda(ret);
 
   constexpr int kBlockSize = 256;
@@ -272,8 +275,6 @@ class RowPartitioner {
    * rows idx | 3, 5, 1 | 13, 31 |
    */
   dh::DeviceUVector<RowIndexT> ridx_;
-  // Staging area for sorting ridx
-  dh::DeviceUVector<RowIndexT> ridx_tmp_;
   dh::DeviceUVector<int8_t> tmp_;
   dh::PinnedMemory pinned_;
   dh::PinnedMemory pinned2_;
@@ -343,7 +344,8 @@ class RowPartitioner {
   void UpdatePositionBatch(Context const* ctx, std::vector<bst_node_t> const& nidx,
                            std::vector<bst_node_t> const& left_nidx,
                            std::vector<bst_node_t> const& right_nidx,
-                           std::vector<OpDataT> const& op_data, UpdatePositionOpT op) {
+                           std::vector<OpDataT> const& op_data, common::Span<RowIndexT> ridx_tmp,
+                           UpdatePositionOpT op) {
     if (nidx.empty()) {
       return;
     }
@@ -366,20 +368,21 @@ class RowPartitioner {
     auto h_counts = pinned_.GetSpan<RowIndexT>(nidx.size());
     // Must initialize with 0 as 0 count is not written in the kernel.
     dh::TemporaryArray<RowIndexT> d_counts(nidx.size(), 0);
+    CHECK_EQ(ridx_tmp.size(), this->Size());
 
     // Process a sub-batch
-    auto sub_batch_impl = [ctx, op, this](common::Span<bst_node_t const> nidx,
-                                          common::Span<PerNodeData<OpDataT>> d_batch_info,
-                                          common::Span<RowIndexT> d_counts) {
+    auto sub_batch_impl = [&](common::Span<bst_node_t const> nidx,
+                              common::Span<PerNodeData<OpDataT>> d_batch_info,
+                              common::Span<RowIndexT> d_counts) {
       std::size_t total_rows = 0;
       for (bst_node_t i : nidx) {
         total_rows += this->ridx_segments_[i].segment.Size();
       }
 
       // Partition the rows according to the operator
       SortPositionBatch<UpdatePositionOpT, OpDataT>(ctx, d_batch_info, dh::ToSpan(this->ridx_),
-                                                    dh::ToSpan(this->ridx_tmp_), d_counts,
-                                                    total_rows, op, &this->tmp_);
+                                                    ridx_tmp, d_counts, total_rows, op,
+                                                    &this->tmp_);
     };
 
     // Divide inputs into sub-batches.
@@ -441,4 +444,59 @@ class RowPartitioner {
         base_ridx, d_ridx, d_out_position, op);
   }
 };
+
+// Partitioner for all batches, used for external memory training.
+class RowPartitionerBatches {
+ private:
+  // Temporary buffer for sorting the samples.
+  dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp_;
+  // Partitioners for each batch.
+  std::vector<std::unique_ptr<RowPartitioner>> partitioners_;
+
+ public:
+  void Reset(Context const* ctx, std::vector<bst_idx_t> const& batch_ptr) {
+    CHECK_GE(batch_ptr.size(), 2);
+    std::size_t n_batches = batch_ptr.size() - 1;
+    if (partitioners_.size() != n_batches) {
+      partitioners_.clear();
+    }
+
+    bst_idx_t n_max_samples = 0;
+    for (std::size_t k = 0; k < n_batches; ++k) {
+      if (partitioners_.size() != n_batches) {
+        // First run.
+        partitioners_.emplace_back(std::make_unique<RowPartitioner>());
+      }
+      auto base_ridx = batch_ptr[k];
+      auto n_samples = batch_ptr.at(k + 1) - base_ridx;
+      partitioners_[k]->Reset(ctx, n_samples, base_ridx);
+      CHECK_LE(n_samples, std::numeric_limits<cuda_impl::RowIndexT>::max());
+      n_max_samples = std::max(n_samples, n_max_samples);
+    }
+    this->ridx_tmp_.resize(n_max_samples);
+  }
+
+  // Accessors
+  [[nodiscard]] decltype(auto) operator[](std::size_t i) { return partitioners_[i]; }
+  decltype(auto) At(std::size_t i) { return partitioners_.at(i); }
+  [[nodiscard]] std::size_t Size() const { return this->partitioners_.size(); }
+  decltype(auto) cbegin() const { return this->partitioners_.cbegin(); }  // NOLINT
+  decltype(auto) cend() const { return this->partitioners_.cend(); }      // NOLINT
+  decltype(auto) begin() const { return this->partitioners_.cbegin(); }   // NOLINT
+  decltype(auto) end() const { return this->partitioners_.cend(); }       // NOLINT
+
+  [[nodiscard]] decltype(auto) Front() { return this->partitioners_.front(); }
+  [[nodiscard]] bool Empty() const { return this->partitioners_.empty(); }
+
+  template <typename UpdatePositionOpT, typename OpDataT>
+  void UpdatePositionBatch(Context const* ctx, std::int32_t batch_idx,
+                           std::vector<bst_node_t> const& nidx,
+                           std::vector<bst_node_t> const& left_nidx,
+                           std::vector<bst_node_t> const& right_nidx,
+                           std::vector<OpDataT> const& op_data, UpdatePositionOpT op) {
+    auto& part = this->At(batch_idx);
+    auto ridx_tmp = dh::ToSpan(this->ridx_tmp_).subspan(0, part->Size());
+    part->UpdatePositionBatch(ctx, nidx, left_nidx, right_nidx, op_data, ridx_tmp, op);
+  }
+};
 };  // namespace xgboost::tree