From 5943efbef78e0702d175e34213a352ef37785bed Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 22 Oct 2025 06:05:55 +0800
Subject: [PATCH 01/24] [mt] Split up gradient types for the GPU hist.

- Use reduced gradient for tree structure exploration.

This PR adds a gradient container that holds two different gradient types: one
for finding tree splits and one for computing leaf values. This is an
optimization for vector leaf that reduces the overhead of finding the tree
structure.
---
 include/xgboost/gbm.h                       |   9 +-
 include/xgboost/gradient.h                  |  52 ++++++++
 include/xgboost/learner.h                   |  44 +++---
 include/xgboost/linalg.h                    |  13 +-
 include/xgboost/multi_target_tree_model.h   |   5 +-
 include/xgboost/objective.h                 |   2 +-
 include/xgboost/tree_model.h                |  19 ++-
 include/xgboost/tree_updater.h              |  33 ++---
 python-package/xgboost/core.py              |  80 ++++++++---
 python-package/xgboost/testing/__init__.py  |   2 +-
 .../xgboost/testing/multi_target.py         |  78 ++++++++++-
 python-package/xgboost/training.py          |   3 +-
 src/c_api/c_api.cc                          |  44 +++++-
 src/c_api/c_api.cu                          |   2 +-
 src/common/algorithm.h                      |   1 -
 src/common/device_debug.cuh                 |  38 ++++++
 src/common/device_helpers.cuh               |  12 --
 src/gbm/gblinear.cc                         |   9 +-
 src/gbm/gbtree.cc                           |  40 ++++--
 src/gbm/gbtree.h                            |   6 +-
 src/learner.cc                              |  43 ++++--
 src/objective/adaptive.cc                   |   2 +-
 src/objective/adaptive.cu                   |   2 +-
 src/objective/adaptive.h                    |   6 +-
 src/objective/quantile_obj.cu               |   2 +-
 src/objective/regression_obj.cu             |   2 +-
 src/tree/gpu_hist/multi_evaluate_splits.cu  |  10 +-
 src/tree/gpu_hist/multi_evaluate_splits.cuh |   8 +-
 src/tree/gpu_hist/row_partitioner.cuh       |   6 +-
 src/tree/leaf_sum.cu                        |  79 +++++++++++
 src/tree/leaf_sum.cuh                       |  29 ++++
 src/tree/multi_target_tree_model.cc         |  35 +++--
 src/tree/tree_model.cc                      |   6 +
 src/tree/updater_approx.cc                  |   8 +-
 src/tree/updater_colmaker.cc                |   8 +-
 src/tree/updater_gpu_hist.cu                |  43 ++++--
 src/tree/updater_gpu_hist.cuh               | 125 ++++++++++++++----
 src/tree/updater_prune.cc                   |   8 +-
 src/tree/updater_quantile_hist.cc           |   7 +-
 src/tree/updater_refresh.cc                 |   9 +-
 src/tree/updater_sync.cc                    |   5 +-
 tests/cpp/gbm/test_gbtree.cc                |  11 +-
 tests/cpp/helpers.cc                        |   5 +-
 tests/cpp/helpers.h                         |  11 +-
 tests/cpp/predictor/test_cpu_predictor.cc   |   5 +-
 tests/cpp/tree/hist/test_evaluate_splits.cc |   2 +-
 tests/cpp/tree/test_approx.cc               |   8 +-
 tests/cpp/tree/test_gpu_approx.cu           |   6 +-
 tests/cpp/tree/test_gpu_hist.cu             |  22 ++-
 tests/cpp/tree/test_prune.cc                |   8 +-
 tests/cpp/tree/test_quantile_hist.cc        |  13 +-
 tests/cpp/tree/test_refresh.cc              |  18 ++-
 tests/cpp/tree/test_tree_stat.cc            |  16 +--
 tests/python-gpu/test_gpu_multi_target.py   |  10 +-
 54 files changed, 808 insertions(+), 262 deletions(-)
 create mode 100644 include/xgboost/gradient.h
 create mode 100644 src/common/device_debug.cuh
 create mode 100644 src/tree/leaf_sum.cu
 create mode 100644 src/tree/leaf_sum.cuh

diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index 9a74594ae8cc..61f4b1a62fe6 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -11,13 +11,14 @@
 #include
 #include
 #include
+#include <xgboost/gradient.h>  // for GradientContainer
 #include
 #include
-#include
-#include
 #include
 #include
+#include
+#include

 namespace xgboost {

@@ -78,8 +79,8 @@ class GradientBooster : public Model, public Configurable {
    *   the booster may change content of gpair
    * @param obj The objective function used for boosting.
    */
-  virtual void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
-                       PredictionCacheEntry*, ObjFunction const* obj) = 0;
+  virtual void DoBoost(DMatrix* /*p_fmat*/, GradientContainer* /*in_gpair*/,
+                       PredictionCacheEntry* /*prediction*/, ObjFunction const* /*obj*/) = 0;

   /**
    * \brief Generate predictions for given feature matrix
diff --git a/include/xgboost/gradient.h b/include/xgboost/gradient.h
new file mode 100644
index 000000000000..da4ffe9741ec
--- /dev/null
+++ b/include/xgboost/gradient.h
@@ -0,0 +1,52 @@
+/**
+ * Copyright 2025, XGBoost Contributors
+ */
+#pragma once
+
+#include <xgboost/base.h>    // for GradientPair
+#include <xgboost/linalg.h>  // for Matrix
+#include
+
+#include <cstddef>  // for size_t
+
+namespace xgboost {
+/**
+ * @brief Container for gradient produced by objective.
+ */
+struct GradientContainer {
+  /** @brief Gradient used for multi-target tree split and linear model. */
+  linalg::Matrix<GradientPair> gpair;
+  /** @brief Gradient used for tree leaf value, optional. */
+  linalg::Matrix<GradientPair> value_gpair;
+
+  [[nodiscard]] bool HasValueGrad() const noexcept { return !value_gpair.Empty(); }
+
+  [[nodiscard]] std::size_t NumSplitTargets() const noexcept { return gpair.Shape(1); }
+  [[nodiscard]] std::size_t NumTargets() const noexcept {
+    return HasValueGrad() ? value_gpair.Shape(1) : this->gpair.Shape(1);
+  }
+
+  linalg::MatrixView<GradientPair const> ValueGrad(Context const* ctx) const {
+    if (HasValueGrad()) {
+      return this->value_gpair.View(ctx->Device());
+    }
+    return this->gpair.View(ctx->Device());
+  }
+
+  [[nodiscard]] linalg::Matrix<GradientPair> const* Grad() const { return &gpair; }
+  [[nodiscard]] linalg::Matrix<GradientPair>* Grad() { return &gpair; }
+
+  [[nodiscard]] linalg::Matrix<GradientPair> const* FullGradOnly() const {
+    if (this->HasValueGrad()) {
+      LOG(FATAL) << "Reduced gradient is not yet supported.";
+    }
+    return this->Grad();
+  }
+  [[nodiscard]] linalg::Matrix<GradientPair>* FullGradOnly() {
+    if (this->HasValueGrad()) {
+      LOG(FATAL) << "Reduced gradient is not yet supported.";
+    }
+    return this->Grad();
+  }
+};
+}  // namespace xgboost
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 24ff72f77d8d..5fab2ff2b574 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -8,22 +8,23 @@
 #ifndef XGBOOST_LEARNER_H_
 #define XGBOOST_LEARNER_H_

-#include <dmlc/io.h>         // for Serializable
-#include <xgboost/base.h>    // for bst_feature_t, bst_target_t, bst_float, Args, GradientPair, ..
-#include <xgboost/context.h> // for Context
-#include <xgboost/linalg.h>  // for Vector, VectorView
-#include <xgboost/metric.h>  // for Metric
-#include <xgboost/model.h>   // for Configurable, Model
-#include <xgboost/span.h>    // for Span
-#include <xgboost/task.h>    // for ObjInfo
+#include <dmlc/io.h>           // for Serializable
+#include <xgboost/base.h>      // for bst_feature_t, bst_target_t, bst_float, Args, GradientPair, ..
+#include <xgboost/context.h>   // for Context
+#include <xgboost/gradient.h>  // for GradientContainer
+#include <xgboost/linalg.h>    // for Vector, VectorView
+#include <xgboost/metric.h>    // for Metric
+#include <xgboost/model.h>     // for Configurable, Model
+#include <xgboost/span.h>      // for Span
+#include <xgboost/task.h>      // for ObjInfo

-#include <algorithm>  // for max
-#include <cstdint>    // for int32_t, uint32_t, uint8_t
-#include <map>        // for map
-#include <memory>     // for shared_ptr, unique_ptr
-#include <string>     // for string
-#include <utility>    // for move
-#include <vector>     // for vector
+#include <algorithm>  // for max
+#include <cstdint>    // for int32_t, uint32_t, uint8_t
+#include <map>        // for map
+#include <memory>     // for shared_ptr, unique_ptr
+#include <string>     // for string
+#include <utility>    // for move
+#include <vector>     // for vector

 namespace xgboost {
 class FeatureMap;
@@ -47,25 +48,24 @@ enum class PredictionType : std::uint8_t {  // NOLINT
   kLeaf = 6
 };

-/*!
- * \brief Learner class that does training and prediction.
+/**
+ * @brief Learner class that does training and prediction.
  *  This is the user facing module of xgboost training.
  *  The Load/Save function corresponds to the model used in python/R.
- * \code
+ * @code
  *
- * std::unique_ptr<Learner> learner(new Learner::Create(cache_mats));
- * learner.Configure(configs);
+ * std::unique_ptr<Learner> learner{Learner::Create(cache_mats)};
+ * learner->Configure(configs);
  *
  * for (int iter = 0; iter < max_iter; ++iter) {
  *   learner->UpdateOneIter(iter, train_mat);
  *   LOG(INFO) << learner->EvalOneIter(iter, data_sets, data_names);
  * }
  *
- * \endcode
+ * @endcode
 */
 class Learner : public Model, public Configurable, public dmlc::Serializable {
  public:
-  /*! \brief virtual destructor */
   ~Learner() override;
   /*!
    * \brief Configure Learner based on set parameters.
@@ -88,7 +88,7 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
    * @param in_gpair The input gradient statistics.
    */
   virtual void BoostOneIter(std::int32_t iter, std::shared_ptr<DMatrix> train,
-                            linalg::Matrix<GradientPair>* in_gpair) = 0;
+                            GradientContainer* in_gpair) = 0;
   /*!
    * \brief evaluate the model for specific iteration using the configured metrics.
    * \param iter iteration number
diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h
index 23a47dea45fe..2ce60ffdcb68 100644
--- a/include/xgboost/linalg.h
+++ b/include/xgboost/linalg.h
@@ -957,7 +957,7 @@ template <typename T>
 using Vector = Tensor<T, 1>;

 /**
- * \brief Create an array without initialization.
+ * @brief Create an array without initialization.
  */
 template <typename T, typename... Index>
 auto Empty(Context const *ctx, Index &&...index) {
@@ -967,6 +967,17 @@ auto Empty(Context const *ctx, Index &&...index) {
   return t;
 }

+/**
+ * @brief Create an array with the same shape and dtype as the input.
+ */
+template <typename T, std::int32_t kDim>
+auto EmptyLike(Context const *ctx, Tensor<T, kDim> const &in) {
+  Tensor<T, kDim> t;
+  t.SetDevice(ctx->Device());
+  t.Reshape(in.Shape());
+  return t;
+}
+
 /**
  * \brief Create an array with value v.
  */
diff --git a/include/xgboost/multi_target_tree_model.h b/include/xgboost/multi_target_tree_model.h
index 2fc110f02b73..3f5645e46c43 100644
--- a/include/xgboost/multi_target_tree_model.h
+++ b/include/xgboost/multi_target_tree_model.h
@@ -60,9 +60,9 @@ class MultiTargetTree : public Model {
   MultiTargetTree& operator=(MultiTargetTree&& that) = delete;

   /**
-   * @brief Set the weight for a leaf.
+   * @brief Set the weight for the root.
    */
-  void SetLeaf(bst_node_t nidx, linalg::VectorView<float const> weight);
+  void SetRoot(linalg::VectorView<float const> weight);
   /**
    * @brief Expand a leaf into split node.
*/ @@ -70,6 +70,7 @@ class MultiTargetTree : public Model { linalg::VectorView base_weight, linalg::VectorView left_weight, linalg::VectorView right_weight); + void SetLeaves(std::vector leaves, common::Span weights); [[nodiscard]] bool IsLeaf(bst_node_t nidx) const { return left_.ConstHostVector()[nidx] == InvalidNodeId(); diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h index 624218e22123..497821590bc9 100644 --- a/include/xgboost/objective.h +++ b/include/xgboost/objective.h @@ -129,7 +129,7 @@ class ObjFunction : public Configurable { virtual void UpdateTreeLeaf(HostDeviceVector const& /*position*/, MetaInfo const& /*info*/, float /*learning_rate*/, HostDeviceVector const& /*prediction*/, - std::int32_t /*group_idx*/, RegTree* /*p_tree*/) const {} + bst_target_t /*group_idx*/, RegTree* /*p_tree*/) const {} /** * @brief Create an objective function according to the name. * diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 18656ac23b59..bc8d4ade6d76 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -321,12 +321,23 @@ class RegTree : public Model { float right_sum, bst_node_t leaf_right_child = kInvalidNodeId); /** - * \brief Expands a leaf node into two additional leaf nodes for a multi-target tree. + * @brief Expands a leaf node into two additional leaf nodes for a multi-target tree. */ void ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split_cond, bool default_left, linalg::VectorView base_weight, linalg::VectorView left_weight, linalg::VectorView right_weight); + /** + * @brief Set all leaf weights for a multi-target tree. + * + * The leaf weight can be different from the internal weight stored by @ref ExpandNode + * This function is used to set the leaf at the end of tree construction. + * + * @param leaves The node indices for all leaves. This must contain all the leaves in this tree. + * @param weights Row-major matrix for leaf weights, each row contains a leaf specified by the + * leaves parameter. + */ + void SetLeaves(std::vector leaves, common::Span weights); /** * \brief Expands a leaf node with categories @@ -396,11 +407,11 @@ class RegTree : public Model { */ [[nodiscard]] bst_node_t GetDepth(bst_node_t nidx) const; /** - * @brief Set the leaf weight for a multi-target tree. + * @brief Set the root weight for a multi-target tree. */ - void SetLeaf(bst_node_t nidx, linalg::VectorView weight) { + void SetRoot(linalg::VectorView weight) { CHECK(IsMultiTarget()); - return this->p_mt_tree_->SetLeaf(nidx, weight); + return this->p_mt_tree_->SetRoot(weight); } /** * @brief Get the maximum depth. diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h index 477c8e4a1785..7a96d71c5231 100644 --- a/include/xgboost/tree_updater.h +++ b/include/xgboost/tree_updater.h @@ -1,7 +1,7 @@ /** - * Copyright 2014-2023 by XGBoost Contributors - * \file tree_updater.h - * \brief General primitive for tree learning, + * Copyright 2014-2025, XGBoost Contributors + * + * @brief General primitive for tree learning, * Updating a collection of trees given the information. 
* \author Tianqi Chen */ @@ -10,16 +10,17 @@ #include #include // for Args, GradientPair -#include // DMatrix +#include // for DMatrix +#include // for GradientContainer #include // for HostDeviceVector #include // for VectorView #include // for Configurable #include // for Span #include // for RegTree -#include // for function -#include // for string -#include // for vector +#include // for function +#include // for string +#include // for vector namespace xgboost { namespace tree { @@ -59,21 +60,21 @@ class TreeUpdater : public Configurable { */ [[nodiscard]] virtual bool HasNodePosition() const { return false; } /** - * \brief perform update to the tree models + * @brief perform update to the tree models * - * \param param Hyper-parameter for constructing trees. - * \param gpair the gradient pair statistics of the data - * \param data The data matrix passed to the updater. - * \param out_position The leaf index for each row. The index is negated if that row is + * @param param Hyper-parameter for constructing trees. + * @param gpair The gradient pair statistics of the data + * @param p_fmat The data matrix passed to the updater. + * @param out_position The leaf index for each row. The index is negated if that row is * removed during sampling. So the 3th node is ~3. - * \param out_trees references the trees to be updated, updater will change the content of trees + * @param out_trees references the trees to be updated, updater will change the content of trees * note: all the trees in the vector are updated, with the same statistics, * but maybe different random seeds, usually one tree is passed in at a time, * there can be multiple trees when we train random forest style model */ - virtual void Update(tree::TrainParam const* param, linalg::Matrix* gpair, - DMatrix* data, common::Span> out_position, - const std::vector& out_trees) = 0; + virtual void Update(tree::TrainParam const* param, GradientContainer* gpair, DMatrix* p_fmat, + common::Span> out_position, + std::vector const& out_trees) = 0; /*! 
* \brief determines whether updater has enough knowledge about a given dataset diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index acfa6e8bd3bf..4066cd84a718 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1966,7 +1966,7 @@ def __init__( cache = cache if cache is not None else [] for d in cache: if not isinstance(d, DMatrix): - raise TypeError(f"invalid cache item: {type(d).__name__}", cache) + raise TypeError(f"Invalid cache item: {type(d).__name__}", cache) dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) self.handle: Optional[ctypes.c_void_p] = ctypes.c_void_p() @@ -2068,7 +2068,7 @@ def __del__(self) -> None: self.handle = None def __getstate__(self) -> Dict: - # can't pickle ctypes pointers, put model content in bytearray + # can't pickle ctypes pointers, put model content in a bytearray this = self.__dict__.copy() handle = this["handle"] if handle is not None: @@ -2084,7 +2084,7 @@ def __getstate__(self) -> Dict: return this def __setstate__(self, state: Dict) -> None: - # reconstruct handle from raw data + # reconstruct the handle from raw data handle = state["handle"] if handle is not None: buf = handle @@ -2385,7 +2385,11 @@ def set_param( ) def update( - self, dtrain: DMatrix, iteration: int, fobj: Optional[Objective] = None + self, + dtrain: DMatrix, + iteration: int, + fobj: Optional[Objective] = None, + fred: Optional[Objective] = None, # fixme: type ) -> None: """Update for one iteration, with objective function calculated internally. This function should not be called directly by users. @@ -2401,22 +2405,50 @@ def update( """ if not isinstance(dtrain, DMatrix): - raise TypeError(f"invalid training matrix: {type(dtrain).__name__}") + raise TypeError(f"Invalid training matrix: {type(dtrain).__name__}") self._assign_dmatrix_features(dtrain) - if fobj is None: + if fobj is None and fred is None: _check_call( _LIB.XGBoosterUpdateOneIter( self.handle, ctypes.c_int(iteration), dtrain.handle ) ) - else: + elif fobj is not None and fred is not None: + pred = self.predict(dtrain, output_margin=True, training=True) + vgrad, vhess = fobj(pred, dtrain) + sgrad, shess = fred(vgrad, vhess, dtrain) + self.boost( + dtrain, + iteration=iteration, + grad=sgrad, + hess=shess, + vgrad=vgrad, + vhess=vhess, + ) + elif fobj is not None: pred = self.predict(dtrain, output_margin=True, training=True) grad, hess = fobj(pred, dtrain) - self.boost(dtrain, iteration=iteration, grad=grad, hess=hess) + self.boost( + dtrain, + iteration=iteration, + grad=grad, + hess=hess, + ) + else: + raise NotImplementedError( + "A custom gradient reducer with built-in objective is not yet" + " implemented." + ) def boost( - self, dtrain: DMatrix, iteration: int, grad: NumpyOrCupy, hess: NumpyOrCupy + self, + dtrain: DMatrix, + iteration: int, + grad: NumpyOrCupy, + hess: NumpyOrCupy, + vgrad: Optional[NumpyOrCupy] = None, + vhess: Optional[NumpyOrCupy] = None, ) -> None: """Boost the booster for one iteration with customized gradient statistics. 
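+
+        When ``vgrad`` and ``vhess`` are provided, ``grad`` and ``hess`` hold the
+        (possibly reduced) gradient used for finding tree splits, while ``vgrad``
+        and ``vhess`` hold the per-target gradient used for computing leaf values.
+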
Like :py:func:`xgboost.Booster.update`, this function should not be called @@ -2467,15 +2499,29 @@ def grad_arrinf(array: NumpyOrCupy) -> bytes: return interface - _check_call( - _LIB.XGBoosterTrainOneIter( - self.handle, - dtrain.handle, - iteration, - grad_arrinf(grad), - grad_arrinf(hess), + if vgrad is not None or vhess is not None: + assert vhess is not None and vgrad is not None + _check_call( + _LIB.XGBoosterTrainOneIterWithObj( + self.handle, + dtrain.handle, + iteration, + grad_arrinf(grad), + grad_arrinf(hess), + grad_arrinf(vgrad), + grad_arrinf(vhess), + ) + ) + else: + _check_call( + _LIB.XGBoosterTrainOneIter( + self.handle, + dtrain.handle, + iteration, + grad_arrinf(grad), + grad_arrinf(hess), + ) ) - ) def eval_set( self, diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 765d6ff0443f..85804d9434e6 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -621,7 +621,7 @@ def ls_obj( ) -> Tuple[np.ndarray, np.ndarray]: """Least squared error.""" grad = y_pred - y_true - hess = np.ones(len(y_true)) + hess = np.ones(grad.shape) if sample_weight is not None: grad *= sample_weight hess *= sample_weight diff --git a/python-package/xgboost/testing/multi_target.py b/python-package/xgboost/testing/multi_target.py index 4558916fb0b4..3a932e3e478f 100644 --- a/python-package/xgboost/testing/multi_target.py +++ b/python-package/xgboost/testing/multi_target.py @@ -1,19 +1,28 @@ """Tests for multi-target training.""" -from typing import Optional +from typing import Callable, Dict, Optional, Tuple -from sklearn.datasets import make_classification, make_multilabel_classification +import numpy as np +from sklearn.datasets import ( + make_classification, + make_multilabel_classification, + make_regression, +) import xgboost.testing as tm +from ..core import Booster, QuantileDMatrix from ..sklearn import XGBClassifier +from ..training import train from .updater import ResetStrategy from .utils import Device def run_multiclass(device: Device, learning_rate: Optional[float]) -> None: """Use vector leaf for multi-class models.""" - X, y = make_classification(128, n_features=12, n_informative=10, n_classes=4) + X, y = make_classification( + 128, n_features=12, n_informative=10, n_classes=4, random_state=2025 + ) clf = XGBClassifier( multi_strategy="multi_output_tree", callbacks=[ResetStrategy()], @@ -24,6 +33,8 @@ def run_multiclass(device: Device, learning_rate: Optional[float]) -> None: clf.fit(X, y, eval_set=[(X, y)]) assert clf.objective == "multi:softprob" assert tm.non_increasing(clf.evals_result()["validation_0"]["mlogloss"]) + if learning_rate is not None and abs(learning_rate - 1.0) < 1e-5: + assert clf.evals_result()["validation_0"]["mlogloss"][-1] < 0.045 proba = clf.predict_proba(X) assert proba.shape == (y.shape[0], 4) @@ -31,7 +42,7 @@ def run_multiclass(device: Device, learning_rate: Optional[float]) -> None: def run_multilabel(device: Device, learning_rate: Optional[float]) -> None: """Use vector leaf for multi-label classification models.""" - X, y = make_multilabel_classification(128) + X, y = make_multilabel_classification(128, random_state=2025) clf = XGBClassifier( multi_strategy="multi_output_tree", callbacks=[ResetStrategy()], @@ -42,6 +53,65 @@ def run_multilabel(device: Device, learning_rate: Optional[float]) -> None: clf.fit(X, y, eval_set=[(X, y)]) assert clf.objective == "binary:logistic" assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"]) + 
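+    # With learning_rate == 1.0 the model is expected to fit the training set
+    # tightly; the loss bound below is an empirical regression guard.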
if learning_rate is not None and abs(learning_rate - 1.0) < 1e-5: + assert clf.evals_result()["validation_0"]["logloss"][-1] < 0.065 proba = clf.predict_proba(X) assert proba.shape == y.shape + + +def run_reduced_grad(device: Device) -> None: + """Basic test for using reduced gradient for tree splits.""" + import cupy as cp + + def reducer( + grad: np.ndarray, hess: np.ndarray, dtrain: QuantileDMatrix + ) -> tuple[cp.ndarray, cp.ndarray]: + return cp.array(grad), cp.array(hess) + + def ls_obj( + y_pred: np.ndarray, dtrain: QuantileDMatrix + ) -> Tuple[cp.ndarray, cp.ndarray]: + # no weight yet + y_true = dtrain.get_label().reshape(y_pred.shape) + grad, hess = tm.ls_obj(y_true, y_pred, None) + return cp.array(grad), cp.array(hess) + + X, y = make_regression( + n_samples=1024, n_features=16, random_state=1994, n_targets=5 + ) + Xy = QuantileDMatrix(X, y) + + def run_test(reducer: Optional[Callable]) -> Booster: + evals_result: Dict[str, Dict] = {} + booster = train( + { + "device": device, + "multi_strategy": "multi_output_tree", + "learning_rate": 1, + }, + Xy, + evals=[(Xy, "Train")], + red=reducer, + obj=ls_obj, + num_boost_round=8, + evals_result=evals_result, + ) + assert tm.non_increasing(evals_result["Train"]["rmse"]) + return booster + + booster_0 = run_test(reducer) + booster_1 = run_test(None) + np.testing.assert_allclose( + booster_0.inplace_predict(X), booster_1.inplace_predict(X) + ) + + # Use mean gradient, should still converge. + def reducer_1( + grad: np.ndarray, hess: np.ndarray, dtrain: QuantileDMatrix + ) -> tuple[cp.ndarray, cp.ndarray]: + sgrad = cp.mean(grad, axis=1) + shess = cp.mean(hess, axis=1) + return sgrad, shess + + run_test(reducer_1) diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 75cebeff60d4..233b62331bb0 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -56,6 +56,7 @@ def train( *, evals: Optional[Sequence[Tuple[DMatrix, str]]] = None, obj: Optional[Objective] = None, + red: Optional[Objective] = None, # fixme: type maximize: Optional[bool] = None, early_stopping_rounds: Optional[int] = None, evals_result: Optional[TrainingCallback.EvalsLog] = None, @@ -196,7 +197,7 @@ def train( for i in range(start_iteration, num_boost_round): if cb_container.before_iteration(bst, i, dtrain, evals): break - bst.update(dtrain, iteration=i, fobj=obj) + bst.update(dtrain, iteration=i, fobj=obj, fred=red) if cb_container.after_iteration(bst, i, dtrain, evals): break diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 074d1ab76e4f..35bc730044ed 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1205,7 +1205,7 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs namespace xgboost { // copy user-supplied CUDA gradient arrays -void CopyGradientFromCUDAArrays(Context const *, ArrayInterface<2, false> const &, +void CopyGradientFromCudaArrays(Context const *, ArrayInterface<2, false> const &, ArrayInterface<2, false> const &, linalg::Matrix *) #if !defined(XGBOOST_USE_CUDA) { @@ -1228,7 +1228,7 @@ XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, in StringView msg{"Mismatched shape between the gradient and hessian."}; CHECK_EQ(i_grad.Shape<0>(), i_hess.Shape<0>()) << msg; CHECK_EQ(i_grad.Shape<1>(), i_hess.Shape<1>()) << msg; - linalg::Matrix gpair; + GradientContainer gpair; auto grad_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_grad.data); auto hess_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_hess.data); 
CHECK_EQ(i_grad.Shape<0>(), p_fmat->Info().num_row_) @@ -1237,8 +1237,8 @@ XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, in auto *learner = static_cast(handle); auto ctx = learner->Ctx(); if (!grad_is_cuda) { - gpair.Reshape(i_grad.Shape<0>(), i_grad.Shape<1>()); - auto h_gpair = gpair.HostView(); + gpair.gpair.Reshape(i_grad.Shape<0>(), i_grad.Shape<1>()); + auto h_gpair = gpair.gpair.HostView(); DispatchDType(i_grad, DeviceOrd::CPU(), [&](auto &&t_grad) { DispatchDType(i_hess, DeviceOrd::CPU(), [&](auto &&t_hess) { common::ParallelFor(h_gpair.Size(), ctx->Threads(), @@ -1246,12 +1246,46 @@ XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, in }); }); } else { - CopyGradientFromCUDAArrays(ctx, i_grad, i_hess, &gpair); + CopyGradientFromCudaArrays(ctx, i_grad, i_hess, &gpair.gpair); } learner->BoostOneIter(iter, p_fmat, &gpair); API_END(); } +// Hidden, experimental +// fixme: find a better way to consume gradients, maybe expose the objective. +// +// We can not obtain the gradient from built-in objectives without making copy due to +// array-of-structs. +XGB_DLL int XGBoosterTrainOneIterWithObj(BoosterHandle handle, DMatrixHandle dtrain, int iter, + char const *split_grad, char const *split_hess, + char const *value_grad, char const *value_hess) { + API_BEGIN(); + CHECK_HANDLE(); + auto *learner = static_cast(handle); + GradientContainer gpair; + auto ctx = learner->Ctx(); + + { + ArrayInterface<2, false> i_grad{StringView{split_grad}}; + ArrayInterface<2, false> i_hess{StringView{split_hess}}; + CHECK(ArrayInterfaceHandler::IsCudaPtr(i_grad.data)) + << "Reduced gradient with CPU" << MTNotImplemented(); + CopyGradientFromCudaArrays(ctx, i_grad, i_hess, &gpair.gpair); + } + { + ArrayInterface<2, false> i_grad{StringView{value_grad}}; + ArrayInterface<2, false> i_hess{StringView{value_hess}}; + CHECK(ArrayInterfaceHandler::IsCudaPtr(i_grad.data)) + << "Reduced gradient with CPU" << MTNotImplemented(); + CopyGradientFromCudaArrays(ctx, i_grad, i_hess, &gpair.value_gpair); + } + auto p_fmat = CastDMatrixHandle(dtrain); + learner->BoostOneIter(iter, p_fmat, &gpair); + + API_END(); +} + XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle, int iter, DMatrixHandle dmats[], diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 4a0d02107c21..999d3dfb36d5 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -84,7 +84,7 @@ void XGBoostAPIGuard::RestoreGPUAttribute() { cudaSetDevice(device_id_); } -void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad, +void CopyGradientFromCudaArrays(Context const *ctx, ArrayInterface<2, false> const &grad, ArrayInterface<2, false> const &hess, linalg::Matrix *out_gpair) { auto grad_dev = dh::CudaGetPointerDevice(grad.data); diff --git a/src/common/algorithm.h b/src/common/algorithm.h index 19afaf3cc51e..10d23d05cc08 100644 --- a/src/common/algorithm.h +++ b/src/common/algorithm.h @@ -78,7 +78,6 @@ void Sort(Context const *ctx, Iter begin, Iter end, Comp comp) { template ::value_type, typename Comp = std::less> std::vector ArgSort(Context const *ctx, Iter begin, Iter end, Comp comp = std::less{}) { - CHECK(!ctx->IsCUDA()); auto n = std::distance(begin, end); std::vector result(n); Iota(ctx, result.begin(), result.end(), 0); diff --git a/src/common/device_debug.cuh b/src/common/device_debug.cuh new file mode 100644 index 000000000000..fa36a3f3c8dd --- /dev/null +++ b/src/common/device_debug.cuh @@ -0,0 +1,38 @@ +/** + * Copyright 2025, XGBoost contributors + */ 
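+#pragma once
+//
+// Debugging utilities for device code. Usage sketch (illustrative only; the
+// kernel below is hypothetical):
+//
+//   MyKernel<<<grid, block>>>(args...);
+//   xgboost::debug::SyncDevice();  // surface async CUDA errors with the caller's file/line
+//
+// PrintDeviceSpan(values, "name") dumps a device span to stdout for inspection.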
+#include // for size_t +#include // for vector + +#include "common.h" +#include "device_helpers.cuh" // for CopyDeviceSpanToVector +#include "xgboost/span.h" // for Span +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::debug { +// debug::SyncDevice(__FILE__, __LINE__); +inline void SyncDevice(char const *file = __builtin_FILE(), int32_t line = __builtin_LINE()) { + { + auto err = cudaDeviceSynchronize(); + dh::ThrowOnCudaError(err, file, line); + } + { + auto err = cudaGetLastError(); + dh::ThrowOnCudaError(err, file, line); + } +} + +template +void PrintDeviceSpan(common::Span values, StringView name) { + std::cout << name << std::endl; + std::vector> h_values(values.size()); + dh::CopyDeviceSpanToVector(&h_values, values); + for (std::size_t i = 0; i < values.size(); ++i) { + if (i != 0 && i % 16 == 0) { + std::cout << std::endl; + } + std::cout << h_values[i] << ", "; + } + std::cout << std::endl; +} +} // namespace xgboost::debug diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 8cc936419856..dbea513ee4ca 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -257,18 +257,6 @@ void Iota(Container array, cudaStream_t stream) { LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; }); } -// dh::DebugSyncDevice(__FILE__, __LINE__); -inline void DebugSyncDevice(char const *file = __builtin_FILE(), int32_t line = __builtin_LINE()) { - { - auto err = cudaDeviceSynchronize(); - ThrowOnCudaError(err, file, line); - } - { - auto err = cudaGetLastError(); - ThrowOnCudaError(err, file, line); - } -} - // Faster to instantiate than caching_device_vector and invokes no synchronisation // Use this where vector functionality (e.g. resize) is not required template diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index ecca3f3267e4..3bd03a3b4a41 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -123,8 +123,13 @@ class GBLinear : public GradientBooster { this->updater_->SaveConfig(&j_updater); } - void DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, PredictionCacheEntry*, + void DoBoost(DMatrix* p_fmat, GradientContainer* in_gpair, PredictionCacheEntry*, ObjFunction const*) override { + if (in_gpair->HasValueGrad()) { + LOG(FATAL) + << "Multi-target with reduced gradient is not implemented for the current booster."; + } + monitor_.Start("DoBoost"); CHECK(!p_fmat->Info().HasCategorical()) << error::NoCategorical("`gblinear`"); @@ -132,7 +137,7 @@ class GBLinear : public GradientBooster { this->LazySumWeights(p_fmat); if (!this->CheckConvergence()) { - updater_->Update(in_gpair, p_fmat, &model_, sum_instance_weight_); + updater_->Update(in_gpair->Grad(), p_fmat, &model_, sum_instance_weight_); } model_.num_boosted_rounds++; monitor_.Stop("DoBoost"); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 05f4fd0d14ff..219d14bba2cf 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -210,8 +210,8 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector const } } -void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, - PredictionCacheEntry* predt, ObjFunction const* obj) { +void GBTree::DoBoost(DMatrix* p_fmat, GradientContainer* in_gpair, PredictionCacheEntry* predt, + ObjFunction const* obj) { if (model_.learner_model_param->IsVectorLeaf()) { CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto) << "Only the hist tree method is supported for building multi-target trees with vector " @@ -243,6 +243,7 @@ void 
GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, std::vector> node_position; if (model_.learner_model_param->IsVectorLeaf()) { + // Multi-target, vector leaf TreesOneGroup ret; BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret); UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret); @@ -253,6 +254,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, predt->Update(1); } } else if (model_.learner_model_param->OutputLength() == 1u) { + // Single target TreesOneGroup ret; BoostNewTrees(in_gpair, p_fmat, 0, &node_position, &ret); UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, node_position, &ret); @@ -263,13 +265,15 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, predt->Update(1); } } else { - CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs"; - linalg::Matrix tmp{{in_gpair->Shape(0), static_cast(1ul)}, - ctx_->Device()}; + // Multi-target, scalar leaf + CHECK_EQ(in_gpair->gpair.Size() % n_groups, 0U) << "Must have exactly n_groups * n_samples gpairs."; + GradientContainer tmp; + tmp.gpair = linalg::Matrix{ + {in_gpair->gpair.Shape(0), static_cast(1ul)}, ctx_->Device()}; bool update_predict = true; for (bst_target_t gid = 0; gid < n_groups; ++gid) { node_position.clear(); - CopyGradient(ctx_, in_gpair, gid, &tmp); + CopyGradient(ctx_, &in_gpair->gpair, gid, &tmp.gpair); TreesOneGroup ret; BoostNewTrees(&tmp, p_fmat, gid, &node_position, &ret); UpdateTreeLeaf(p_fmat, predt->predictions, obj, gid, node_position, &ret); @@ -290,9 +294,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, this->CommitModel(std::move(new_trees)); } -void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, int bst_group, - std::vector>* out_position, - TreesOneGroup* ret) { +std::vector GBTree::InitNewTrees(bst_target_t bst_group, TreesOneGroup* ret) { std::vector new_trees; ret->clear(); // create the trees @@ -326,20 +328,30 @@ void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, ret->push_back(std::move(t)); } } + return new_trees; +} + +void GBTree::BoostNewTrees(GradientContainer* gpair, DMatrix* p_fmat, int bst_group, + std::vector>* out_position, + TreesOneGroup* ret) { + std::vector new_trees = this->InitNewTrees(bst_group, ret); // update the trees auto n_out = model_.learner_model_param->OutputLength() * p_fmat->Info().num_row_; StringView msg{ "Mismatching size between number of rows from input data and size of gradient vector."}; if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) { - CHECK_EQ(n_out % gpair->Size(), 0) << msg; - } else { - CHECK_EQ(gpair->Size(), n_out) << msg; + CHECK_EQ(n_out % gpair->gpair.Size(), 0) << msg; + } else if (model_.learner_model_param->IsVectorLeaf()){ + // vector leaf + if (!gpair->HasValueGrad()) { + CHECK_EQ(gpair->gpair.Size(), n_out) << msg; + } } out_position->resize(new_trees.size()); - // Rescale learning rate according to the size of trees + // Rescale learning rate according to the number of trees auto lr = tree_param_.learning_rate; tree_param_.learning_rate /= static_cast(new_trees.size()); for (auto& up : updaters_) { @@ -1005,7 +1017,7 @@ DMLC_REGISTER_PARAMETER(DartTrainParam); XGBOOST_REGISTER_GBM(GBTree, "gbtree") .describe("Tree booster, gradient boosted trees.") .set_body([](LearnerModelParam const* booster_config, Context const* ctx) { - auto* p = new GBTree(booster_config, ctx); + auto* p = new GBTree{booster_config, ctx}; return p; }); XGBOOST_REGISTER_GBM(Dart, "dart") diff 
--git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 46b54ce2c3f2..20975b610d5e 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -188,7 +188,7 @@ class GBTree : public GradientBooster {
   /**
    * @brief Carry out one iteration of boosting.
    */
-  void DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, PredictionCacheEntry* predt,
+  void DoBoost(DMatrix* p_fmat, GradientContainer* in_gpair, PredictionCacheEntry* predt,
                ObjFunction const* obj) override;

   [[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; }
@@ -326,10 +326,12 @@ class GBTree : public GradientBooster {
   }

  protected:
-  void BoostNewTrees(linalg::Matrix<GradientPair>* gpair, DMatrix* p_fmat, int bst_group,
+  void BoostNewTrees(GradientContainer* gpair, DMatrix* p_fmat, int bst_group,
                      std::vector<HostDeviceVector<bst_node_t>>* out_position,
                      std::vector<std::unique_ptr<RegTree>>* ret);

+  std::vector<RegTree*> InitNewTrees(bst_target_t bst_group, TreesOneGroup* ret);
+
   [[nodiscard]] std::unique_ptr<Predictor> const& GetPredictor(
       bool is_training, HostDeviceVector<bst_float> const* out_pred = nullptr,
       DMatrix* f_dmat = nullptr) const;
diff --git a/src/learner.cc b/src/learner.cc
index 1424ac471e18..8faf0506c5d5 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1142,17 +1142,36 @@ class LearnerImpl : public LearnerIO {
     monitor_.Stop("PredictRaw");

     monitor_.Start("GetGradient");
-    GetGradient(predt->predictions, train->Info(), iter, &gpair_);
+    GetGradient(predt->predictions, train->Info(), iter, &gpair_.gpair);
     monitor_.Stop("GetGradient");
-    TrainingObserver::Instance().Observe(*gpair_.Data(), "Gradients");
+    TrainingObserver::Instance().Observe(gpair_.Grad()->Data(), "Gradients");

     gbm_->DoBoost(train.get(), &gpair_, predt.get(), obj_.get());
     monitor_.Stop("UpdateOneIter");
   }

-  void BoostOneIter(int iter, std::shared_ptr<DMatrix> train,
-                    linalg::Matrix<GradientPair>* in_gpair) override {
-    monitor_.Start("BoostOneIter");
+  void BoostOneIter(std::int32_t iter, std::shared_ptr<DMatrix> train,
+                    GradientContainer* in_gpair) override {
+    this->monitor_.Start(__func__);
     this->Configure();

     if (ctx_.seed_per_iteration) {
@@ -1160,13 +1179,13 @@ class LearnerImpl : public LearnerIO {
       common::GlobalRandom().seed(ctx_.seed * kRandSeedMagic + iter);
     }

     this->ValidateDMatrix(train.get(), true);
-
-    CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1))
-        << "The number of columns in gradient should be equal to the number of targets/classes in "
-           "the model.";
+    // fixme: avoid duplicated code, including the error message
+    CHECK_GE(this->learner_model_param_.OutputLength(), in_gpair->value_gpair.Shape(1))
+        << "The number of columns in the gradient should be no more than the number of "
+           "targets/classes in the model.";
     auto predt = prediction_container_.Cache(train, ctx_.Device());
-    gbm_->DoBoost(train.get(), in_gpair, predt.get(), obj_.get());
-    monitor_.Stop("BoostOneIter");
+    this->gbm_->DoBoost(train.get(), in_gpair, predt.get(), obj_.get());
+    this->monitor_.Stop(__func__);
   }

   std::string
EvalOneIter(int iter, @@ -1338,7 +1357,7 @@ class LearnerImpl : public LearnerIO { /*! \brief random number transformation seed. */ static int32_t constexpr kRandSeedMagic = 127; // gradient pairs - linalg::Matrix gpair_; + GradientContainer gpair_; /*! \brief Temporary storage to prediction. Useful for storing data transformed by * objective function */ PredictionContainer output_predictions_; diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc index 8e4060bea7b7..abbde7f3c0e9 100644 --- a/src/objective/adaptive.cc +++ b/src/objective/adaptive.cc @@ -80,7 +80,7 @@ void EncodeTreeLeafHost(Context const* ctx, RegTree const& tree, } void UpdateTreeLeafHost(Context const* ctx, std::vector const& position, - std::int32_t group_idx, MetaInfo const& info, float learning_rate, + bst_target_t group_idx, MetaInfo const& info, float learning_rate, HostDeviceVector const& predt, float alpha, RegTree* p_tree) { auto& tree = *p_tree; diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu index 4b404259d485..81ebbcb6b9a5 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -144,7 +144,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos } void UpdateTreeLeafDevice(Context const* ctx, common::Span position, - std::int32_t group_idx, MetaInfo const& info, float learning_rate, + bst_target_t group_idx, MetaInfo const& info, float learning_rate, HostDeviceVector const& predt, float alpha, RegTree* p_tree) { dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); dh::device_vector ridx; diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h index 1a7aef0516d1..5f0b1c8ad11c 100644 --- a/src/objective/adaptive.h +++ b/src/objective/adaptive.h @@ -87,16 +87,16 @@ inline std::size_t IdxY(MetaInfo const& info, bst_group_t group_idx) { } void UpdateTreeLeafDevice(Context const* ctx, common::Span position, - std::int32_t group_idx, MetaInfo const& info, float learning_rate, + bst_target_t group_idx, MetaInfo const& info, float learning_rate, HostDeviceVector const& predt, float alpha, RegTree* p_tree); void UpdateTreeLeafHost(Context const* ctx, std::vector const& position, - std::int32_t group_idx, MetaInfo const& info, float learning_rate, + bst_target_t group_idx, MetaInfo const& info, float learning_rate, HostDeviceVector const& predt, float alpha, RegTree* p_tree); } // namespace detail inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector const& position, - std::int32_t group_idx, MetaInfo const& info, float learning_rate, + bst_target_t group_idx, MetaInfo const& info, float learning_rate, HostDeviceVector const& predt, float alpha, RegTree* p_tree) { if (ctx->IsCUDA()) { position.SetDevice(ctx->Device()); diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index 34a82e808310..05b4627ea85d 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -167,7 +167,7 @@ class QuantileRegression : public ObjFunction { void UpdateTreeLeaf(HostDeviceVector const& position, MetaInfo const& info, float learning_rate, HostDeviceVector const& prediction, - std::int32_t group_idx, RegTree* p_tree) const override { + bst_target_t group_idx, RegTree* p_tree) const override { auto alpha = param_.quantile_alpha[group_idx]; ::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, alpha, p_tree); diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index aa071c19cbc0..26b660d31554 100644 --- a/src/objective/regression_obj.cu +++ 
b/src/objective/regression_obj.cu @@ -784,7 +784,7 @@ class MeanAbsoluteError : public ObjFunction { void UpdateTreeLeaf(HostDeviceVector const& position, MetaInfo const& info, float learning_rate, HostDeviceVector const& prediction, - std::int32_t group_idx, RegTree* p_tree) const override { + bst_target_t group_idx, RegTree* p_tree) const override { ::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, 0.5, p_tree); } diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu index a6a9e3b24b72..6157e5a8ac03 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cu +++ b/src/tree/gpu_hist/multi_evaluate_splits.cu @@ -5,6 +5,7 @@ #include // for KeyValuePair #include // for WarpReduce #include // for vector +#include "../../common/device_debug.cuh" #include "../../common/cuda_context.cuh" #include "../updater_gpu_common.cuh" // for SumCallbackOp @@ -232,7 +233,6 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, MultiEvaluateSplitSharedInputs const &shared_inputs, common::Span out_splits) { auto n_targets = shared_inputs.Targets(); - CHECK_GE(n_targets, 2); auto n_bins_per_feat_tar = shared_inputs.n_bins_per_feat_tar; CHECK_GE(n_bins_per_feat_tar, 1); auto n_features = shared_inputs.Features(); @@ -260,6 +260,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, scans[nidx_in_set] = dh::ToSpan(this->scan_buffer_) .subspan(nidx_in_set * node_hist_size * 2, node_hist_size * 2); } + // fixme: make sure root sum is copied. // Launch histogram scan kernel dim3 grid{n_nodes, n_features, n_targets}; @@ -274,6 +275,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, dh::ToSpan(d_splits)); // Find best split for each node + // * 3 because of three nodes, parent, left, right this->weights_.resize(n_nodes * n_targets * 3); auto d_weights = dh::ToSpan(this->weights_); @@ -362,21 +364,21 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, void MultiHistEvaluator::ApplyTreeSplit(Context const *ctx, RegTree const *p_tree, MultiExpandEntry const &candidate) { - auto n_targets = p_tree->NumTargets(); - auto left_child = p_tree->LeftChild(candidate.nidx); auto right_child = p_tree->RightChild(candidate.nidx); bst_node_t max_node = std::max(left_child, right_child); + auto n_targets = candidate.base_weight.size(); + this->AllocNodeSum(max_node, n_targets); auto parent_sum = this->GetNodeSum(candidate.nidx, n_targets); - auto left_sum = this->GetNodeSum(left_child, n_targets); auto right_sum = this->GetNodeSum(right_child, n_targets); // Calculate node sums // TODO(jiamingy): We need to batch the targets and nodes auto best_split = candidate.split; + auto node_sum = best_split.node_sum; dh::LaunchN(1, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t) { for (bst_target_t t = 0; t < n_targets; ++t) { diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cuh b/src/tree/gpu_hist/multi_evaluate_splits.cuh index 3df0b1f736b5..d3c369e4276a 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cuh +++ b/src/tree/gpu_hist/multi_evaluate_splits.cuh @@ -10,6 +10,7 @@ #include "xgboost/context.h" // for Context namespace xgboost::tree::cuda_impl { +/** @brief Evaluator for vector leaf. */ class MultiHistEvaluator { dh::device_vector weights_; @@ -17,10 +18,15 @@ class MultiHistEvaluator { dh::device_vector node_sums_; public: + /** + * @brief Run evaluation for the root node. 
+ */ [[nodiscard]] MultiExpandEntry EvaluateSingleSplit( Context const *ctx, MultiEvaluateSplitInputs const &input, MultiEvaluateSplitSharedInputs const &shared_inputs); - + /** + * @brief Run evaluation for multiple nodes. + */ void EvaluateSplits(Context const *ctx, common::Span d_inputs, MultiEvaluateSplitSharedInputs const &shared_inputs, common::Span out_splits); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 27eb040afa1e..bc22e80cec16 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -205,7 +205,7 @@ struct NodePositionInfo { Segment segment; bst_node_t left_child = -1; bst_node_t right_child = -1; - __device__ bool IsLeaf() { return left_child == -1; } + [[nodiscard]] XGBOOST_DEVICE bool IsLeaf() const { return left_child == -1; } }; XGBOOST_DEV_INLINE int GetPositionFromSegments(std::size_t idx, @@ -304,10 +304,12 @@ class RowPartitioner { [[nodiscard]] bst_node_t GetNumNodes() const { return n_nodes_; } /** - * \brief Convenience method for testing + * @brief Convenient method for testing. */ std::vector GetRowsHost(bst_node_t nidx); + std::vector const& GetSegmentsHost() const { return this->ridx_segments_; } + /** * \brief Updates the tree position for set of training instances being split * into left and right child nodes. Accepts a user-defined lambda specifying diff --git a/src/tree/leaf_sum.cu b/src/tree/leaf_sum.cu new file mode 100644 index 000000000000..b81515f29b64 --- /dev/null +++ b/src/tree/leaf_sum.cu @@ -0,0 +1,79 @@ +/** + * Copyright 2025, XGBoost contributors + */ +#include // for vector + +#include "../common/linalg_op.cuh" // for tbegin +#include "gpu_hist/quantiser.cuh" // for GradientQuantiser +#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, NodePositionInfo +#include "leaf_sum.cuh" +#include "updater_gpu_common.cuh" // for GPUTrainingParam +#include "xgboost/base.h" // for GradientPairInt64 +#include "xgboost/context.h" // for Context +#include "xgboost/linalg.h" // for MatrixView +#include "xgboost/span.h" // for Span + +namespace xgboost::tree::cuda_impl { +void LeafGradSum(Context const* ctx, std::vector const& h_segments, + common::Span roundings, + common::Span sorted_ridx, + linalg::MatrixView grad, + linalg::MatrixView out_sum) { + std::vector h_leaves; + for (auto const& node : h_segments) { + if (node.IsLeaf()) { + h_leaves.push_back(node); + } + } + CHECK_EQ(h_leaves.size(), out_sum.Shape(0)); + + dh::device_vector leaves(h_leaves); + auto d_leaves = dh::ToSpan(leaves); + + std::vector h_indptr{0}; + for (auto const& node : h_leaves) { + h_indptr.push_back(node.segment.Size()); + } + // leaves form a complete partition + dh::device_vector indptr{h_indptr}; + thrust::inclusive_scan(ctx->CUDACtx()->CTP(), indptr.cbegin(), indptr.cend(), indptr.begin()); + CHECK_EQ(roundings.size(), grad.Shape(1)); + CHECK_EQ(roundings.size(), out_sum.Shape(1)); + CHECK_EQ(out_sum.Shape(0), indptr.size() - 1); + CHECK_EQ(indptr.size(), h_leaves.size() + 1); + auto d_indptr = dh::ToSpan(indptr); + + for (bst_target_t t = 0, n_targets = grad.Shape(1); t < n_targets; ++t) { + // TODO(jiamingy): Avoid additional allocation for d_sum + auto out_t = out_sum.Slice(linalg::All(), t); // len == n_leaves + std::size_t n_bytes = 0; + auto it = dh::MakeIndexTransformIter([=] XGBOOST_DEVICE(std::size_t i) { + auto nidx_in_set = dh::SegmentId(d_indptr, i); + auto k = i - d_indptr[nidx_in_set]; + auto j = d_leaves[nidx_in_set].segment.begin + k; + auto g = grad(sorted_ridx[j], 
t); + return roundings[t].ToFixedPoint(g); + }); + dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, it, linalg::tbegin(out_t), + h_leaves.size(), indptr.data(), indptr.data() + 1, + ctx->CUDACtx()->Stream())); + dh::TemporaryArray alloc(n_bytes); + dh::safe_cuda(cub::DeviceSegmentedReduce::Sum( + alloc.data().get(), n_bytes, it, linalg::tbegin(out_t), h_leaves.size(), indptr.data(), + indptr.data() + 1, ctx->CUDACtx()->Stream())); + } +} + +void LeafWeight(Context const* ctx, GPUTrainingParam const& param, + common::Span roundings, + linalg::MatrixView grad_sum, + linalg::MatrixView out_weights) { + CHECK(grad_sum.Contiguous()); + auto s_grad_sum = grad_sum.Values(); + dh::LaunchN(grad_sum.Size(), ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t i) mutable { + auto [nidx_in_set, t] = linalg::UnravelIndex(i, grad_sum.Shape()); + auto g = roundings[t].ToFloatingPoint(grad_sum(nidx_in_set, t)); + out_weights(nidx_in_set, t) = CalcWeight(param, g.GetGrad(), g.GetHess()); + }); +} +} // namespace xgboost::tree::cuda_impl diff --git a/src/tree/leaf_sum.cuh b/src/tree/leaf_sum.cuh new file mode 100644 index 000000000000..4829eae1ed3e --- /dev/null +++ b/src/tree/leaf_sum.cuh @@ -0,0 +1,29 @@ +/** + * Copyright 2025, XGBoost contributors + */ +#pragma once + +#include // for vector + +#include "gpu_hist/quantiser.cuh" // for GradientQuantiser +#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, NodePositionInfo +#include "updater_gpu_common.cuh" // for GPUTrainingParam +#include "xgboost/context.h" // for Context +#include "xgboost/linalg.h" // for MatrixView +#include "xgboost/span.h" // for Span + +namespace xgboost::tree::cuda_impl { +// shape(out_sum) == (n_leaves, n_targets) +void LeafGradSum(Context const* ctx, std::vector const& h_segments, + common::Span roundings, + common::Span sorted_ridx, + linalg::MatrixView grad, + linalg::MatrixView out_sum); + +// shape(grad_sum) == (n_leaves, n_targets) +// shape(out_weights) == (n_leaves, n_targets) +void LeafWeight(Context const* ctx, GPUTrainingParam const& param, + common::Span roundings, + linalg::MatrixView grad_sum, + linalg::MatrixView out_weights); +} // namespace xgboost::tree::cuda_impl diff --git a/src/tree/multi_target_tree_model.cc b/src/tree/multi_target_tree_model.cc index eeae410803c7..c34cc82c42aa 100644 --- a/src/tree/multi_target_tree_model.cc +++ b/src/tree/multi_target_tree_model.cc @@ -48,13 +48,12 @@ MultiTargetTree::MultiTargetTree(MultiTargetTree const& that) this->weights_.Copy(that.weights_); } -void MultiTargetTree::SetLeaf(bst_node_t nidx, linalg::VectorView weight) { - CHECK(this->IsLeaf(nidx)) << "Collapsing a split node to leaf " << MTNotImplemented(); - auto const next_nidx = nidx + 1; - CHECK_EQ(weight.Size(), this->NumTargets()); +void MultiTargetTree::SetRoot(linalg::VectorView weight) { + auto const next_nidx = RegTree::kRoot + 1; + CHECK_LE(weight.Size(), this->NumTargets()); CHECK_GE(weights_.Size(), next_nidx * weight.Size()); - auto out_weight = weights_.HostSpan().subspan(nidx * weight.Size(), weight.Size()); - for (std::size_t i = 0; i < weight.Size(); ++i) { + auto out_weight = weights_.HostSpan().subspan(RegTree::kRoot * weight.Size(), weight.Size()); + for (std::size_t i = 0, n = weight.Size(); i < n; ++i) { out_weight[i] = weight(i); } } @@ -100,19 +99,35 @@ void MultiTargetTree::Expand(bst_node_t nidx, bst_feature_t split_idx, float spl weights_.Resize(n * this->NumTargets()); auto p_weight = this->NodeWeight(nidx); - CHECK_EQ(p_weight.Size(), base_weight.Size()); + 
CHECK_GE(p_weight.Size(), base_weight.Size()); auto l_weight = this->NodeWeight(left_child); - CHECK_EQ(l_weight.Size(), left_weight.Size()); + CHECK_GE(l_weight.Size(), left_weight.Size()); auto r_weight = this->NodeWeight(right_child); - CHECK_EQ(r_weight.Size(), right_weight.Size()); + CHECK_GE(r_weight.Size(), right_weight.Size()); - for (std::size_t i = 0; i < base_weight.Size(); ++i) { + CHECK_EQ(base_weight.Size(), left_weight.Size()); + CHECK_EQ(base_weight.Size(), right_weight.Size()); + + for (std::size_t i = 0, n = base_weight.Size(); i < n; ++i) { p_weight(i) = base_weight(i); l_weight(i) = left_weight(i); r_weight(i) = right_weight(i); } } +void MultiTargetTree::SetLeaves(std::vector leaves, common::Span weights) { + auto n_targets = this->NumTargets(); + auto h_weights = this->weights_.HostSpan(); + std::int32_t nidx_in_set = 0; + for (auto nidx : leaves) { + CHECK(this->IsLeaf(nidx)); + auto w_in = weights.subspan(nidx_in_set * n_targets, n_targets); + auto w_out = h_weights.subspan(nidx * n_targets, n_targets); + std::copy(w_in.cbegin(), w_in.cend(), w_out.begin()); + nidx_in_set++; + } +} + template void LoadModelImpl(Json const& in, HostDeviceVector* p_weights, HostDeviceVector* p_lefts, HostDeviceVector* p_rights, diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index e6d83e181308..96883d16a60b 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -889,6 +889,12 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split this->param_.num_nodes = this->p_mt_tree_->Size(); } +void RegTree::SetLeaves(std::vector leaves, common::Span weights) { + // fixme: cleanup + CHECK(IsMultiTarget()); + this->p_mt_tree_->SetLeaves(std::move(leaves), weights); +} + void RegTree::ExpandCategorical(bst_node_t nidx, bst_feature_t split_index, common::Span split_cat, bool default_left, bst_float base_weight, diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index dadcc94a1a24..91560a2564a6 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2024, XGBoost contributors + * Copyright 2021-2025, XGBoost contributors * * \brief Implementation for the approx tree method. 
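+ * The approx updater consumes the full gradient only; a reduced split gradient
+ * is rejected through GradientContainer::FullGradOnly().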
*/ @@ -21,13 +21,14 @@ #include "driver.h" // for Driver #include "hist/evaluate_splits.h" // for HistEvaluator, UpdatePredictionCacheImpl #include "hist/expand_entry.h" // for CPUExpandEntry -#include "hist/histogram.h" // for MultiHistogramBuilder #include "hist/hist_param.h" // for HistMakerTrainParam +#include "hist/histogram.h" // for MultiHistogramBuilder #include "hist/sampler.h" // for SampleGradient #include "param.h" // for GradStats, TrainParam #include "xgboost/base.h" // for Args, GradientPair, bst_node_t, bst_bin_t #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for DMatrix, BatchSet, BatchIterator, MetaInfo +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get #include "xgboost/linalg.h" // for Matrix, MakeTensorView, Empty, MatrixView @@ -284,7 +285,7 @@ class GlobalApproxUpdater : public TreeUpdater { [[nodiscard]] char const *Name() const override { return "grow_histmaker"; } - void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *m, + void Update(TrainParam const *param, GradientContainer *in_gpair, DMatrix *m, common::Span> out_position, const std::vector &trees) override { CHECK(hist_param_.GetInitialised()); @@ -293,6 +294,7 @@ class GlobalApproxUpdater : public TreeUpdater { } pimpl_ = std::make_unique(param, &hist_param_, m->Info(), ctx_, column_sampler_, task_, &monitor_); + auto gpair = in_gpair->FullGradOnly(); linalg::Matrix h_gpair; // Obtain the hessian values for weighted sketching diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 807e1089ee8d..c12bb5ad7aaf 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2024, XGBoost Contributors + * Copyright 2014-2025, XGBoost Contributors * \file updater_colmaker.cc * \brief use columnwise update to construct a tree * \author Tianqi Chen @@ -14,7 +14,8 @@ #include "param.h" #include "sample_position.h" // for SamplePosition #include "split_evaluator.h" -#include "tree_view.h" // for ScalarTreeView +#include "tree_view.h" // for ScalarTreeView +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/json.h" #include "xgboost/logging.h" #include "xgboost/parameter.h" @@ -94,7 +95,7 @@ class ColMaker: public TreeUpdater { } } - void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *dmat, + void Update(TrainParam const *param, GradientContainer *in_gpair, DMatrix *dmat, common::Span> /*out_position*/, const std::vector &trees) override { if (collective::IsDistributed()) { @@ -115,6 +116,7 @@ class ColMaker: public TreeUpdater { // rescale learning rate according to size of trees interaction_constraints_.Configure(*param, dmat->Info().num_row_); // build tree + auto gpair = in_gpair->FullGradOnly(); CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); for (auto tree : trees) { CHECK(ctx_); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index be639d013cf1..0bd4791ccb3f 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -3,13 +3,18 @@ */ #include // for transform -#include // for max +#include // for max, none_of #include // for isnan +#include // for int32_t, uint32_t #include // for plus #include // for unique_ptr, make_unique +#include // for partial_sum +#include // for string +#include // for is_trivially_copyable_v #include // for move #include // for vector +#include 
"../../src/collective/comm.h" // for Op #include "../collective/aggregator.h" #include "../common/categorical.h" // for KCatBitField #include "../common/cuda_context.cuh" // for CUDAContext @@ -30,6 +35,7 @@ #include "gpu_hist/feature_groups.cuh" // for FeatureGroups #include "gpu_hist/gradient_based_sampler.cuh" // for GradientBasedSampler #include "gpu_hist/histogram.cuh" +#include "gpu_hist/quantiser.cuh" // for GradientQuantiser #include "gpu_hist/row_partitioner.cuh" // for RowPartitioner #include "hist/hist_param.h" // for HistMakerTrainParam #include "param.h" // for TrainParam @@ -38,10 +44,14 @@ #include "updater_gpu_common.cuh" // for HistBatch #include "updater_gpu_hist.cuh" // for MultiTargetHistMaker #include "xgboost/base.h" // for bst_idx_t +#include "xgboost/collective/result.h" // for Success, SafeColl #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for DMatrix +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/json.h" // for Json +#include "xgboost/linalg.h" // for MakeVec +#include "xgboost/logging.h" // for CHECK_EQ, CHECK_LE, CHECK_GE #include "xgboost/span.h" // for Span #include "xgboost/task.h" // for ObjInfo #include "xgboost/tree_model.h" // for RegTree @@ -847,19 +857,26 @@ class GPUHistMaker : public TreeUpdater { ~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); } - void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* dmat, + void Update(TrainParam const* param, GradientContainer* in_gpair, DMatrix* p_fmat, common::Span> out_position, - const std::vector& trees) override { - monitor_.Start(__func__); + std::vector const& trees) override { + if (in_gpair->HasValueGrad() || in_gpair->gpair.Shape(1) > 1) { + CHECK(!this->task_->UpdateTreeLeaf()) << "Adaptive tree" << MTNotImplemented(); + } + in_gpair->gpair.SetDevice(this->ctx_->Device()); // build tree std::size_t t_idx{0}; - for (xgboost::RegTree* tree : trees) { - this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]); - this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree); + for (xgboost::RegTree* p_tree : trees) { + this->InitData(param, p_fmat, p_tree); + if (p_tree->IsMultiTarget()) { + p_mtimpl_->UpdateTree(in_gpair, p_fmat, task_, p_tree); + } else { + p_scimpl_->UpdateTree(in_gpair->gpair.Data(), p_fmat, task_, p_tree, &out_position[t_idx]); + } + this->hist_maker_param_.CheckTreesSynchronized(ctx_, p_tree); ++t_idx; } - dh::safe_cuda(cudaGetLastError()); monitor_.Stop(__func__); } @@ -903,11 +920,8 @@ class GPUHistMaker : public TreeUpdater { this->InitData(param, p_fmat, p_tree); gpair->SetDevice(ctx_->Device()); auto gpair_hdv = gpair->Data(); - if (p_tree->IsMultiTarget()) { - p_mtimpl_->UpdateTree(gpair_hdv, p_fmat, task_, p_tree, p_out_position); - } else { - p_scimpl_->UpdateTree(gpair_hdv, p_fmat, task_, p_tree, p_out_position); - } + CHECK(!p_tree->IsMultiTarget()); + p_scimpl_->UpdateTree(gpair_hdv, p_fmat, task_, p_tree, p_out_position); } bool UpdatePredictionCache(const DMatrix* data, linalg::MatrixView p_out_preds) override { @@ -972,12 +986,13 @@ class GPUGlobalApproxMaker : public TreeUpdater { } ~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); } - void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* p_fmat, + void Update(TrainParam const* param, GradientContainer* in_gpair, DMatrix* p_fmat, common::Span> out_position, const std::vector& trees) override { monitor_.Start(__func__); 
this->InitDataOnce(p_fmat); + auto gpair = in_gpair->FullGradOnly(); // build tree hess_.resize(gpair->Size()); auto hess = dh::ToSpan(hess_); diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index 47275b20fe6b..0965d1f9ace8 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -14,9 +14,11 @@ #include "gpu_hist/multi_evaluate_splits.cuh" // for MultiHistEvaluator #include "gpu_hist/row_partitioner.cuh" // for RowPartitioner #include "hist/hist_param.h" // for HistMakerTrainParam +#include "leaf_sum.cuh" // for LeafGradSum #include "tree_view.h" // for MultiTargetTreeView #include "xgboost/base.h" // for bst_idx_t #include "xgboost/context.h" // for Context +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/tree_model.h" // for RegTree @@ -49,21 +51,25 @@ class MultiTargetHistMaker { std::shared_ptr const cuts_; std::unique_ptr feature_groups_; DeviceHistogramBuilder histogram_; - std::unique_ptr quantiser_; + std::unique_ptr split_quantizer_; + std::unique_ptr value_quantizer_; MultiHistEvaluator evaluator_; - linalg::Matrix dh_gpair_; + // Gradient used for building the tree structure + linalg::Matrix split_gpair_; + // Gradient used for calculating the leaf values + linalg::Matrix value_gpair_; std::vector const batch_ptr_; dh::PinnedMemory pinned_; void BuildHist(EllpackPage const& page, std::int32_t k, bst_node_t nidx) { - auto d_gpair = this->dh_gpair_.View(this->ctx_->Device()); + auto d_gpair = this->split_gpair_.View(this->ctx_->Device()); CHECK(!this->partitioners_.empty()); auto d_ridx = this->partitioners_.at(k)->GetRows(nidx); auto hist = histogram_.GetNodeHistogram(nidx); - auto roundings = this->quantiser_->Quantizers(); + auto roundings = this->split_quantizer_->Quantizers(); auto acc = page.Impl()->GetDeviceEllpack(this->ctx_, {}); histogram_.BuildHistogram(this->ctx_->CUDACtx(), acc, this->feature_groups_->DeviceAccessor(this->ctx_->Device()), d_gpair, @@ -71,9 +77,9 @@ class MultiTargetHistMaker { } public: - void Reset(HostDeviceVector* gpair_all, DMatrix* p_fmat, RegTree* p_tree) { - bst_idx_t n_targets = p_tree->NumTargets(); - auto in_gpair = linalg::MakeTensorView(ctx_, gpair_all, p_fmat->Info().num_row_, n_targets); + void Reset(linalg::Matrix* gpair_all, DMatrix* p_fmat) { + bst_idx_t n_targets = gpair_all->Shape(1); + auto in_gpair = gpair_all->View(ctx_->Device()); /** * Initialize the partitioners @@ -97,11 +103,16 @@ class MultiTargetHistMaker { * Initialize the histogram */ std::size_t shape[2]{p_fmat->Info().num_row_, n_targets}; - dh_gpair_ = linalg::Matrix{shape, ctx_->Device(), linalg::kF}; - TransposeGradient(this->ctx_, in_gpair, dh_gpair_.View(ctx_->Device())); + split_gpair_ = linalg::Matrix{shape, ctx_->Device(), linalg::kF}; + TransposeGradient(this->ctx_, in_gpair, split_gpair_.View(ctx_->Device())); - this->quantiser_ = std::make_unique( - this->ctx_, dh_gpair_.View(ctx_->Device()), p_fmat->Info()); + this->split_quantizer_ = std::make_unique( + this->ctx_, split_gpair_.View(ctx_->Device()), p_fmat->Info()); + + if (!this->value_gpair_.Empty()) { + this->value_quantizer_ = std::make_unique( + this->ctx_, value_gpair_.View(ctx_->Device()), p_fmat->Info()); + } bool force_global = true; histogram_.Reset(this->ctx_, this->hist_param_->MaxCachedHistNodes(ctx_->Device()), @@ -109,11 +120,12 @@ class MultiTargetHistMaker { cuts_->TotalBins() * n_targets, force_global); } - [[nodiscard]] MultiExpandEntry 
InitRoot(DMatrix* p_fmat, RegTree* p_tree) { - auto d_gpair = dh_gpair_.View(ctx_->Device()); + dh::device_vector CalcRootSum( + linalg::MatrixView d_gpair, + common::Span roundings) const { + // fixme: merge with fit stump. auto n_samples = d_gpair.Shape(0); auto n_targets = d_gpair.Shape(1); - // Calculate the root sum dh::device_vector root_sum(n_targets); @@ -121,16 +133,29 @@ class MultiTargetHistMaker { auto cidx = i / n_samples; return cidx; }); - auto d_roundings = quantiser_->Quantizers(); auto val_it = dh::MakeIndexTransformIter([=] XGBOOST_DEVICE(std::size_t i) -> GradientPairInt64 { auto cidx = i / n_samples; auto ridx = i % n_samples; auto g = d_gpair(ridx, cidx); - return d_roundings[cidx].ToFixedPoint(g); + return roundings[cidx].ToFixedPoint(g); }); thrust::reduce_by_key(ctx_->CUDACtx()->CTP(), key_it, key_it + d_gpair.Size(), val_it, thrust::make_discard_iterator(), root_sum.begin()); + return root_sum; + } + + [[nodiscard]] MultiExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree) { + auto d_gpair = split_gpair_.View(ctx_->Device()); + auto n_samples = d_gpair.Shape(0); + auto n_targets = d_gpair.Shape(1); + + // Calculate the root sum + auto root_sum = this->CalcRootSum(d_gpair, this->split_quantizer_->Quantizers()); + this->evaluator_.AllocNodeSum(RegTree::kRoot, n_targets); + auto d_root_sum = this->evaluator_.GetNodeSum(RegTree::kRoot, n_targets); + dh::safe_cuda(cudaMemcpyAsync(d_root_sum.data(), root_sum.data().get(), d_root_sum.size_bytes(), + cudaMemcpyDefault, this->ctx_->CUDACtx()->Stream())); // Build the root histogram. histogram_.AllocateHistograms(ctx_, {RegTree::kRoot}); @@ -146,6 +171,7 @@ class MultiTargetHistMaker { auto node_hist = this->histogram_.GetNodeHistogram(RegTree::kRoot); MultiEvaluateSplitInputs input{RegTree::kRoot, p_tree->GetDepth(RegTree::kRoot), dh::ToSpan(root_sum), node_hist}; + auto d_roundings = split_quantizer_->Quantizers(); GPUTrainingParam param{this->param_}; MultiEvaluateSplitSharedInputs shared_inputs{d_roundings, this->cuts_->cut_ptrs_.ConstDeviceSpan(), @@ -156,9 +182,12 @@ class MultiTargetHistMaker { auto entry = this->evaluator_.EvaluateSingleSplit(ctx_, input, shared_inputs); // TODO(jiamingy): Support learning rate. + // TODO(jiamingy): We need to modify the tree structure to account for internal reduced weight + // size. std::vector h_base_weight(entry.base_weight.size()); dh::CopyDeviceSpanToVector(&h_base_weight, entry.base_weight); - p_tree->SetLeaf(RegTree::kRoot, linalg::MakeVec(h_base_weight)); + p_tree->SetRoot(linalg::MakeVec(h_base_weight)); + return entry; } @@ -171,6 +200,7 @@ class MultiTargetHistMaker { dh::CopyDeviceSpanToVector(&h_base_weight, candidate.base_weight); dh::CopyDeviceSpanToVector(&h_left_weight, candidate.left_weight); dh::CopyDeviceSpanToVector(&h_right_weight, candidate.right_weight); + p_tree->ExpandNode(candidate.nidx, candidate.split.findex, candidate.split.fvalue, candidate.split.dir == kLeftDir, linalg::MakeVec(h_base_weight), linalg::MakeVec(h_left_weight), linalg::MakeVec(h_right_weight)); @@ -178,6 +208,31 @@ class MultiTargetHistMaker { this->evaluator_.ApplyTreeSplit(this->ctx_, p_tree, candidate); } + void UpdateTreeLeaf(linalg::Matrix const& full_grad, RegTree* p_tree) const { + auto const& h_segments = this->partitioners_.front()->GetSegmentsHost(); + std::vector leaves; + bst_node_t nidx = 0; + for (auto const& node : h_segments) { + if (node.IsLeaf()) { + leaves.push_back(nidx); + } + nidx += 1; + } + // Calculate the leaf weight based on the node sum for each leaf. 
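+    // The leaves partition the sorted row indices held by the partitioner, so
+    // segmented reductions over GetRows() yield one quantized gradient sum per
+    // (leaf, target) pair; the sums are then converted into leaf values.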
+ // Update the leaf weight, with learning rate. + linalg::Matrix out_sum( + {leaves.size(), static_cast(p_tree->NumTargets())}, this->ctx_->Device()); + LeafGradSum(this->ctx_, this->partitioners_.front()->GetSegmentsHost(), + this->value_quantizer_->Quantizers(), this->partitioners_.front()->GetRows(), + full_grad.View(this->ctx_->Device()), out_sum.View(this->ctx_->Device())); + auto param = GPUTrainingParam{this->param_}; + linalg::Matrix out_weight = + linalg::Empty(this->ctx_, leaves.size(), p_tree->NumTargets()); + LeafWeight(this->ctx_, param, this->value_quantizer_->Quantizers(), + out_sum.View(this->ctx_->Device()), out_weight.View(this->ctx_->Device())); + p_tree->SetLeaves(leaves, out_weight.Data()->ConstHostSpan()); + } + struct NodeSplitData { bst_node_t nidx; }; @@ -258,8 +313,6 @@ class MultiTargetHistMaker { } histogram_.AllocateHistograms(ctx_, build_nidx); - // Use a device view. - mt_tree = MultiTargetTreeView{this->ctx_->Device(), p_tree}; std::int32_t k{0}; // TODO(jiamingy): Support external memory. @@ -288,7 +341,7 @@ class MultiTargetHistMaker { } GPUTrainingParam param{this->param_}; MultiEvaluateSplitSharedInputs shared_inputs{ - this->quantiser_->Quantizers(), + this->split_quantizer_->Quantizers(), this->cuts_->cut_ptrs_.ConstDeviceSpan(), this->cuts_->cut_values_.ConstDeviceSpan(), this->cuts_->min_vals_.ConstDeviceSpan(), @@ -307,17 +360,18 @@ class MultiTargetHistMaker { bst_node_t right_nidx = mt_tree.RightChild(candidate.nidx); max_nidx = std::max({max_nidx, left_nidx, right_nidx}); } - + auto n_targets = this->split_gpair_.Shape(1); for (std::size_t i = 0; i < candidates.size(); i++) { auto candidate = candidates.at(i); bst_node_t left_nidx = mt_tree.LeftChild(candidate.nidx); bst_node_t right_nidx = mt_tree.RightChild(candidate.nidx); // Make sure no allocation is happening. // The parent sum is calculated in the last apply tree split. 
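+      // Each child reads its own node sum, prepared by the preceding apply
+      // tree split, so no new allocation should happen here.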
- auto parent_sum = this->evaluator_.GetNodeSum(candidate.nidx, mt_tree.NumTargets()); - auto left = MultiEvaluateSplitInputs{left_nidx, candidate.depth + 1, parent_sum, + auto left = MultiEvaluateSplitInputs{left_nidx, candidate.depth + 1, + this->evaluator_.GetNodeSum(left_nidx, n_targets), histogram_.GetNodeHistogram(left_nidx)}; - auto right = MultiEvaluateSplitInputs{right_nidx, candidate.depth + 1, parent_sum, + auto right = MultiEvaluateSplitInputs{right_nidx, candidate.depth + 1, + this->evaluator_.GetNodeSum(right_nidx, n_targets), histogram_.GetNodeHistogram(right_nidx)}; h_node_inputs[i * 2] = left; h_node_inputs[i * 2 + 1] = right; @@ -332,20 +386,37 @@ class MultiTargetHistMaker { ctx_->CUDACtx()->Stream())); } - void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, ObjInfo const*, - RegTree* p_tree, HostDeviceVector*) { + void UpdateTree(GradientContainer* gpair, DMatrix* p_fmat, ObjInfo const* task, RegTree* p_tree) { + auto* split_grad = gpair->Grad(); + if (gpair->HasValueGrad()) { + this->value_gpair_ = + linalg::Matrix{gpair->value_gpair.Shape(), ctx_->Device(), linalg::kF}; + TransposeGradient(this->ctx_, gpair->value_gpair.View(this->ctx_->Device()), + value_gpair_.View(this->ctx_->Device())); + } + + this->GrowTree(split_grad, p_fmat, task, p_tree); + + if (gpair->HasValueGrad()) { + this->UpdateTreeLeaf(gpair->value_gpair, p_tree); + } + } + + void GrowTree(linalg::Matrix* splti_gpair, DMatrix* p_fmat, ObjInfo const*, + RegTree* p_tree) { if (this->param_.learning_rate - 1.0 != 0.0) { LOG(FATAL) << "GPU" << MTNotImplemented(); } Driver driver{param_, kMaxNodeBatchSize}; - this->Reset(gpair_all, p_fmat, p_tree); + this->Reset(splti_gpair, p_fmat); driver.Push({this->InitRoot(p_fmat, p_tree)}); // The set of leaves that can be expanded asynchronously auto expand_set = driver.Pop(); while (!expand_set.empty()) { for (auto& candidate : expand_set) { + // fixme: prevent node size == 0 this->ApplySplit(candidate, p_tree); } diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc index 2c2d1a2f0d93..2e95885b88d7 100644 --- a/src/tree/updater_prune.cc +++ b/src/tree/updater_prune.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by XGBoost Contributors + * Copyright 2014-2025, XGBoost Contributors * \file updater_prune.cc * \brief prune a tree given the statistics * \author Tianqi Chen @@ -11,7 +11,9 @@ #include "../common/timer.h" #include "./param.h" #include "xgboost/base.h" +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/json.h" + namespace xgboost::tree { DMLC_REGISTRY_FILE_TAG(updater_prune); @@ -31,14 +33,14 @@ class TreePruner : public TreeUpdater { [[nodiscard]] bool CanModifyTree() const override { return true; } // update the tree, do pruning - void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* p_fmat, + void Update(TrainParam const* param, GradientContainer* in_gpair, DMatrix* p_fmat, common::Span> out_position, const std::vector& trees) override { pruner_monitor_.Start("PrunerUpdate"); for (auto tree : trees) { this->DoPrune(param, tree); } - syncher_->Update(param, gpair, p_fmat, out_position, trees); + syncher_->Update(param, in_gpair, p_fmat, out_position, trees); pruner_monitor_.Stop("PrunerUpdate"); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 25b54a37e7de..08541f01a435 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -33,6 +33,7 @@ #include "xgboost/base.h" // for Args, GradientPairPrecise, GradientPair, Gra... 
#include "xgboost/context.h" // for Context #include "xgboost/data.h" // for BatchSet, DMatrix, BatchIterator, MetaInfo +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get #include "xgboost/linalg.h" // for MatrixView, TensorView, All, Matrix, Empty @@ -219,7 +220,7 @@ class MultiTargetHistBuilder { std::transform(linalg::cbegin(weight_t), linalg::cend(weight_t), linalg::begin(weight_t), [&](float w) { return w * param_->learning_rate; }); - p_tree->SetLeaf(RegTree::kRoot, weight_t); + p_tree->SetRoot(weight_t); std::vector hists; std::vector nodes{{RegTree::kRoot, 0}}; for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) { @@ -516,7 +517,7 @@ class QuantileHistMaker : public TreeUpdater { [[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; } - void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *p_fmat, + void Update(TrainParam const *param, GradientContainer *in_gpair, DMatrix *p_fmat, common::Span> out_position, const std::vector &trees) override { if (!column_sampler_) { @@ -539,7 +540,7 @@ class QuantileHistMaker : public TreeUpdater { } bst_target_t n_targets = trees.front()->NumTargets(); - auto h_gpair = gpair->HostView(); + auto h_gpair = in_gpair->FullGradOnly()->HostView(); linalg::Matrix sample_out; auto h_sample_out = h_gpair; diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index af1af2fb76f1..62639e3542a3 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -4,8 +4,6 @@ * \brief refresh the statistics and leaf value on the tree on the dataset * \author Tianqi Chen */ -#include - #include #include @@ -14,7 +12,9 @@ #include "../predictor/predict_fn.h" #include "../tree/tree_view.h" // for ScalarTreeView #include "./param.h" +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/json.h" +#include "xgboost/tree_updater.h" namespace xgboost::tree { @@ -30,13 +30,14 @@ class TreeRefresher : public TreeUpdater { [[nodiscard]] char const *Name() const override { return "refresh"; } [[nodiscard]] bool CanModifyTree() const override { return true; } - // update the tree, do pruning - void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *p_fmat, + // Update the tree, do pruning + void Update(TrainParam const *param, GradientContainer *in_gpair, DMatrix *p_fmat, common::Span> /*out_position*/, const std::vector &trees) override { if (trees.size() == 0) { return; } + auto gpair = in_gpair->FullGradOnly(); CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); const std::vector &gpair_h = gpair->Data()->ConstHostVector(); // Thread local variables. 
diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc index fd1eb943781e..17244bc2c6a7 100644 --- a/src/tree/updater_sync.cc +++ b/src/tree/updater_sync.cc @@ -9,6 +9,7 @@ #include "../collective/broadcast.h" // for Broadcast #include "../collective/communicator-inl.h" // for GetRank, GetWorldSize #include "xgboost/context.h" // for Context +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/json.h" // for Json, Object #include "xgboost/linalg.h" // for Matrix #include "xgboost/tree_updater.h" // for TreeUpdater @@ -31,9 +32,9 @@ class TreeSyncher : public TreeUpdater { [[nodiscard]] char const* Name() const override { return "sync"; } - void Update(TrainParam const*, linalg::Matrix*, DMatrix*, + void Update(TrainParam const*, GradientContainer*, DMatrix*, common::Span> /*out_position*/, - const std::vector& trees) override { + std::vector const& trees) override { if (collective::GetWorldSize() == 1) { return; } diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index abe0abccba8a..c62b3deaf0d7 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -65,8 +65,10 @@ TEST(GBTree, PredictionCache) { gbtree.Configure({{"tree_method", "hist"}}); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - linalg::Matrix gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); + + GradientContainer gpair; + gpair.gpair = linalg::Matrix{{kRows}, ctx.Device()}; + gpair.gpair.Data()->Copy(GenerateRandomGradients(kRows)); PredictionCacheEntry out_predictions; gbtree.DoBoost(p_m.get(), &gpair, &out_predictions, nullptr); @@ -208,8 +210,9 @@ TEST(GBTree, ChooseTreeMethod) { } learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { - linalg::Matrix gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()}; - gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); + GradientContainer gpair; + gpair.gpair = linalg::Matrix{{Xy->Info().num_row_}, DeviceOrd::CPU()}; + gpair.gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); learner->BoostOneIter(0, Xy, &gpair); } diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index dcf0a694c897..83e0c17c4819 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -666,8 +666,9 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, } p_dmat->Info().labels = linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()}; - linalg::Matrix gpair({kRows}, ctx->Device()); - auto h_gpair = gpair.HostView(); + GradientContainer gpair; + gpair.gpair = linalg::Matrix{{kRows}, ctx->Device()}; + auto h_gpair = gpair.gpair.HostView(); for (size_t i = 0; i < kRows; ++i) { h_gpair(i) = GradientPair{static_cast(i), 1}; } diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index c5b9e0153135..04e0bf806cd2 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -394,13 +394,12 @@ inline HostDeviceVector GenerateRandomGradients(const size_t n_row return gpair; } -inline linalg::Matrix GenerateRandomGradients(Context const* ctx, bst_idx_t n_rows, - bst_target_t n_targets, - float lower = 0.0f, - float upper = 1.0f) { +inline auto GenerateRandomGradients(Context const* ctx, bst_idx_t n_rows, bst_target_t n_targets, + float lower = 0.0f, float upper = 1.0f) { auto g = GenerateRandomGradients(n_rows * n_targets, lower, upper); - linalg::Matrix gpair({n_rows, static_cast(n_targets)}, ctx->Device()); - gpair.Data()->Copy(g); + GradientContainer gpair; + gpair.gpair = linalg::Matrix{{n_rows, 
static_cast(n_targets)}, ctx->Device()}; + gpair.gpair.Data()->Copy(g); return gpair; } diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 66d3e312c76b..bda5d43d58e9 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -202,8 +202,9 @@ void TestUpdatePredictionCache(bool use_subsampling) { auto dmat = RandomDataGenerator(kRows, kCols, 0).Classes(kClasses).GenerateDMatrix(true); - linalg::Matrix gpair({kRows, kClasses}, ctx.Device()); - auto h_gpair = gpair.HostView(); + GradientContainer gpair; + gpair.gpair = linalg::Matrix({kRows, kClasses}, ctx.Device()); + auto h_gpair = gpair.gpair.HostView(); for (size_t i = 0; i < kRows * kClasses; ++i) { std::apply(h_gpair, linalg::UnravelIndex(i, kRows, kClasses)) = {static_cast(i), 1}; } diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 367ec382e98f..a5858bb3e890 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -204,7 +204,7 @@ TEST(HistMultiEvaluator, Evaluate) { RegTree tree{n_targets, n_features}; auto weight = evaluator.InitRoot(root_sum.HostView()); - tree.SetLeaf(RegTree::kRoot, weight.HostView()); + tree.SetRoot(weight.HostView()); auto w = weight.HostView(); ASSERT_EQ(w.Size(), n_targets); ASSERT_EQ(w(0), -1.5); diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index 29d734c1fce1..a7623a658770 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -1,7 +1,9 @@ /** - * Copyright 2021-2024, XGBoost contributors. + * Copyright 2021-2025, XGBoost contributors. */ #include +#include // for GradientContainer +#include // for RegTree #include // for TreeUpdater #include // for transform @@ -14,7 +16,6 @@ #include "../helpers.h" #include "test_column_split.h" // for TestColumnSplit #include "test_partitioner.h" -#include "xgboost/tree_model.h" // for RegTree namespace xgboost::tree { namespace { @@ -89,8 +90,7 @@ TEST(Approx, InteractionConstraint) { auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.6f, false); Context ctx; - linalg::Matrix gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); + GradientContainer gpair = GenerateRandomGradients(&ctx, kRows, 1); ObjInfo task{ObjInfo::kRegression}; { diff --git a/tests/cpp/tree/test_gpu_approx.cu b/tests/cpp/tree/test_gpu_approx.cu index 7df60b8cbcd2..295d79c0a477 100644 --- a/tests/cpp/tree/test_gpu_approx.cu +++ b/tests/cpp/tree/test_gpu_approx.cu @@ -1,7 +1,8 @@ /** - * Copyright 2024, XGBoost contributors + * Copyright 2024-2025, XGBoost contributors */ #include +#include // for GradientContainer #include // for Json #include // for ObjInfo #include // for RegTree @@ -21,8 +22,7 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { TrainParam param; param.UpdateAllowUnknown(Args{}); - linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); - gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); + auto gpair = GenerateRandomGradients(ctx, dmat->Info().num_row_, 1); std::vector> position(1); RegTree tree; diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index d0f546e6134e..01e378172959 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -1,9 +1,10 @@ /** - * Copyright 2017-2024, XGBoost contributors + * Copyright 2017-2025, XGBoost contributors */ #include #include // for Args #include // for Context 
+#include // for GradientContainer #include // for HostDeviceVector #include // for Json #include // for ObjInfo @@ -21,7 +22,7 @@ namespace xgboost::tree { namespace { -void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix* dmat, +void UpdateTree(Context const* ctx, GradientContainer* gpair, DMatrix* dmat, RegTree* tree, HostDeviceVector* preds, float subsample, const std::string& sampling_method, bst_bin_t max_bin, bool concat_pages) { Args args{ @@ -67,8 +68,7 @@ TEST(GpuHist, UniformSampling) { auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true); ASSERT_TRUE(p_fmat->SingleColBlock()); - linalg::Matrix gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); + auto gpair = GenerateRandomGradients(&ctx, kRows, 1); // Build a tree using the in-memory DMatrix. RegTree tree; @@ -97,9 +97,7 @@ TEST(GpuHist, GradientBasedSampling) { // Create an in-memory DMatrix. auto p_fmat = RandomDataGenerator{kRows, kCols, 0.0f}.GenerateDMatrix(true); - - linalg::Matrix gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); + auto gpair = GenerateRandomGradients(&ctx, kRows, 1); // Build a tree using the in-memory DMatrix. RegTree tree; @@ -135,8 +133,7 @@ TEST(GpuHist, ExternalMemory) { ASSERT_TRUE(p_fmat->SingleColBlock()); auto ctx = MakeCUDACtx(0); - linalg::Matrix gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); + auto gpair = GenerateRandomGradients(&ctx, kRows, 1); // Build a tree using the in-memory DMatrix. RegTree tree; @@ -177,8 +174,7 @@ TEST(GpuHist, ExternalMemoryWithSampling) { .GenerateSparsePageDMatrix("temp", true); ASSERT_FALSE(p_fmat_ext->SingleColBlock()); - linalg::Matrix gpair({kRows}, ctx.Device()); - gpair.Data()->Copy(GenerateRandomGradients(kRows)); + auto gpair = GenerateRandomGradients(&ctx, kRows, 1); // Build a tree using the in-memory DMatrix. auto rng = common::GlobalRandom(); @@ -276,9 +272,7 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) { TrainParam param; param.UpdateAllowUnknown(Args{}); - - linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); - gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); + auto gpair = GenerateRandomGradients(ctx, dmat->Info().num_row_, 1); std::vector> position(1); RegTree tree; diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc index 1a3ec532e18b..0b8e3258c9cd 100644 --- a/tests/cpp/tree/test_prune.cc +++ b/tests/cpp/tree/test_prune.cc @@ -1,9 +1,10 @@ /** - * Copyright 2018-2023 by XGBoost Contributors + * Copyright 2018-2025, XGBoost Contributors */ #include #include -#include +#include // for GradientContainer +#include // for HostDeviceVector #include #include @@ -24,7 +25,8 @@ TEST(Updater, Prune) { Context ctx; // These data are just place holders. 
- linalg::Matrix gpair + GradientContainer gpair; + gpair.gpair = linalg::Matrix {{ {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} }, {8, 1}, ctx.Device()}; std::shared_ptr p_dmat{RandomDataGenerator{32, 10, 0}.GenerateDMatrix()}; diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index d8e1e2c016ee..03763d179e87 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -1,7 +1,8 @@ /** - * Copyright 2018-2024, XGBoost Contributors + * Copyright 2018-2025, XGBoost Contributors */ #include +#include // for GradientContainer #include #include #include @@ -290,8 +291,9 @@ void TestPartitionerOverrun(bst_target_t n_targets) { "part_resize_big_first", true); std::size_t shape_large[2]{dmat_large->Info().num_row_, n_targets_size}; - linalg::Matrix gpair_large(shape_large, ctx.Device()); - FillGradients(&gpair_large); + GradientContainer gpair_large; + gpair_large.gpair = linalg::Matrix{shape_large, ctx.Device()}; + FillGradients(&gpair_large.gpair); RegTree tree_large{n_targets, static_cast(kCols)}; std::vector trees_large{&tree_large}; @@ -318,8 +320,9 @@ void TestPartitionerOverrun(bst_target_t n_targets) { std::memcpy(tail_before.data(), hv.data() + hv.size(), tail_elems * sizeof(bst_node_t)); std::size_t shape_small[2]{dmat_small->Info().num_row_, n_targets_size}; - linalg::Matrix gpair_small(shape_small, ctx.Device()); - FillGradients(&gpair_small); + GradientContainer gpair_small; + gpair_small.gpair = linalg::Matrix{shape_small, ctx.Device()}; + FillGradients(&gpair_small.gpair); RegTree tree_small{n_targets, static_cast(kCols)}; std::vector trees_small{&tree_small}; diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc index bbd274a08d0f..01052861ab03 100644 --- a/tests/cpp/tree/test_refresh.cc +++ b/tests/cpp/tree/test_refresh.cc @@ -1,7 +1,8 @@ /** - * Copyright 2018-2023 by XGBoost Contributors + * Copyright 2018-2025, XGBoost Contributors */ #include +#include // for GradientContainer #include #include // for ObjInfo #include @@ -19,9 +20,18 @@ TEST(Updater, Refresh) { bst_feature_t constexpr kCols = 16; Context ctx; - linalg::Matrix gpair - {{ {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, - {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }, {8, 1}, ctx.Device()}; + GradientContainer gpair; + gpair.gpair = linalg::Matrix{{{0.23f, 0.24f}, + {0.23f, 0.24f}, + {0.23f, 0.24f}, + {0.23f, 0.24f}, + {0.27f, 0.29f}, + {0.27f, 0.29f}, + {0.27f, 0.29f}, + {0.27f, 0.29f}}, + {8, 1}, + ctx.Device()}; + std::shared_ptr p_dmat{ RandomDataGenerator{kRows, kCols, 0.4f}.Seed(3).GenerateDMatrix()}; std::vector> cfg{ diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc index 920c3e6af99c..7654c04a2125 100644 --- a/tests/cpp/tree/test_tree_stat.cc +++ b/tests/cpp/tree/test_tree_stat.cc @@ -3,6 +3,7 @@ */ #include #include // for Context +#include // for GradientContainer #include // for ObjInfo #include // for RegTree #include // for TreeUpdater @@ -21,16 +22,15 @@ namespace xgboost { class UpdaterTreeStatTest : public ::testing::Test { protected: std::shared_ptr p_dmat_; - linalg::Matrix gpairs_; + GradientContainer gpairs_; size_t constexpr static kRows = 10; size_t constexpr static kCols = 10; protected: void SetUp() override { p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true); - auto g = GenerateRandomGradients(kRows); - gpairs_.Reshape(kRows, 
1); - gpairs_.Data()->Copy(g); + Context ctx; + gpairs_ = GenerateRandomGradients(&ctx, kRows, 1); } void RunTest(Context const* ctx, std::string updater) { @@ -99,7 +99,7 @@ class TestSplitWithEta : public ::testing::Test { updater->Configure({}); auto grad = GenerateRandomGradients(ctx, Xy->Info().num_row_, n_targets); - CHECK_EQ(grad.Shape(1), n_targets); + CHECK_EQ(grad.gpair.Shape(1), n_targets); tree::TrainParam param; param.Init(Args{{"learning_rate", std::to_string(eta)}}); HostDeviceVector position; @@ -192,15 +192,15 @@ TEST_F(TestSplitWithEta, GpuApprox) { class TestMinSplitLoss : public ::testing::Test { std::shared_ptr dmat_; - linalg::Matrix gpair_; + GradientContainer gpair_; void SetUp() override { constexpr size_t kRows = 32; constexpr size_t kCols = 16; constexpr float kSparsity = 0.6; dmat_ = RandomDataGenerator(kRows, kCols, kSparsity).Seed(3).GenerateDMatrix(); - gpair_.Reshape(kRows, 1); - gpair_.Data()->Copy(GenerateRandomGradients(kRows)); + Context ctx; + gpair_ = GenerateRandomGradients(&ctx, kRows, 1); } std::int32_t Update(Context const* ctx, std::string updater, float gamma) { diff --git a/tests/python-gpu/test_gpu_multi_target.py b/tests/python-gpu/test_gpu_multi_target.py index 70acdf34924e..96c4d2c0cc18 100644 --- a/tests/python-gpu/test_gpu_multi_target.py +++ b/tests/python-gpu/test_gpu_multi_target.py @@ -1,4 +1,8 @@ -from xgboost.testing.multi_target import run_multiclass, run_multilabel +from xgboost.testing.multi_target import ( + run_multiclass, + run_multilabel, + run_reduced_grad, +) def test_multiclass() -> None: @@ -9,3 +13,7 @@ def test_multiclass() -> None: def test_multilabel() -> None: # learning_rate is not yet supported. run_multilabel("cuda", 1.0) + + +def test_reduced_grad() -> None: + run_reduced_grad("cuda") From 3e3c2ce7015e5d082615ed90c02cdb5994ae1f81 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 19:25:35 +0800 Subject: [PATCH 02/24] a class interface. --- python-package/xgboost/core.py | 45 +++++++-------- python-package/xgboost/objective.py | 49 +++++++++++++++++ .../xgboost/testing/multi_target.py | 55 ++++++++++--------- python-package/xgboost/training.py | 3 +- 4 files changed, 99 insertions(+), 53 deletions(-) create mode 100644 python-package/xgboost/objective.py diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 4066cd84a718..2a40d19beef7 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -77,6 +77,7 @@ py_str, ) from .libpath import find_lib_path +from .objective import TreeObjective if TYPE_CHECKING: from pandas import DataFrame as PdDataFrame @@ -2389,7 +2390,6 @@ def update( dtrain: DMatrix, iteration: int, fobj: Optional[Objective] = None, - fred: Optional[Objective] = None, # fixme: type ) -> None: """Update for one iteration, with objective function calculated internally. This function should not be called directly by users. 
@@ -2408,37 +2408,29 @@ def update( raise TypeError(f"Invalid training matrix: {type(dtrain).__name__}") self._assign_dmatrix_features(dtrain) - if fobj is None and fred is None: + if fobj is None: _check_call( _LIB.XGBoosterUpdateOneIter( self.handle, ctypes.c_int(iteration), dtrain.handle ) ) - elif fobj is not None and fred is not None: + else: pred = self.predict(dtrain, output_margin=True, training=True) + vgrad: Optional[ArrayLike] + vhess: Optional[ArrayLike] vgrad, vhess = fobj(pred, dtrain) - sgrad, shess = fred(vgrad, vhess, dtrain) + if isinstance(fobj, TreeObjective): + sgrad, shess = fobj.split_grad(vgrad, vhess) + else: + sgrad, shess = vgrad, vhess + vgrad, vhess = None, None self.boost( dtrain, iteration=iteration, grad=sgrad, hess=shess, - vgrad=vgrad, - vhess=vhess, - ) - elif fobj is not None: - pred = self.predict(dtrain, output_margin=True, training=True) - grad, hess = fobj(pred, dtrain) - self.boost( - dtrain, - iteration=iteration, - grad=grad, - hess=hess, - ) - else: - raise NotImplementedError( - "A custom gradient reducer with built-in objective is not yet" - " implemented." + _vgrad=vgrad, + _vhess=vhess, ) def boost( @@ -2447,8 +2439,8 @@ def boost( iteration: int, grad: NumpyOrCupy, hess: NumpyOrCupy, - vgrad: Optional[NumpyOrCupy] = None, - vhess: Optional[NumpyOrCupy] = None, + _vgrad: Optional[NumpyOrCupy] = None, # WIP vector-leaf support + _vhess: Optional[NumpyOrCupy] = None, # WIP vector-leaf support ) -> None: """Boost the booster for one iteration with customized gradient statistics. Like :py:func:`xgboost.Booster.update`, this function should not be called @@ -2499,8 +2491,9 @@ def grad_arrinf(array: NumpyOrCupy) -> bytes: return interface - if vgrad is not None or vhess is not None: - assert vhess is not None and vgrad is not None + if _vgrad is not None or _vhess is not None: + assert _vhess is not None and _vgrad is not None + print("use with obj") _check_call( _LIB.XGBoosterTrainOneIterWithObj( self.handle, @@ -2508,8 +2501,8 @@ def grad_arrinf(array: NumpyOrCupy) -> bytes: iteration, grad_arrinf(grad), grad_arrinf(hess), - grad_arrinf(vgrad), - grad_arrinf(vhess), + grad_arrinf(_vgrad), + grad_arrinf(_vhess), ) ) else: diff --git a/python-package/xgboost/objective.py b/python-package/xgboost/objective.py new file mode 100644 index 000000000000..756d7d4b96eb --- /dev/null +++ b/python-package/xgboost/objective.py @@ -0,0 +1,49 @@ +"""Experimental support for a new objective interface with target dimension +reduction. + +.. warning:: + + Do not use this module unless you want to participate in development. + +.. versionadded:: 3.2.0 + +""" + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Tuple + +from ._typing import ArrayLike + +if TYPE_CHECKING: + from .core import DMatrix + + +class Objective(ABC): + """Base class for custom objective function. + + .. warning:: + + Do not use this class unless you want to participate in development. + + """ + + @abstractmethod + def __call__( + self, y_pred: ArrayLike, dtrain: "DMatrix" + ) -> Tuple[ArrayLike, ArrayLike]: ... + + +class TreeObjective(Objective): + """Base class for tree-specific custom objective function. + + .. warning:: + + Do not use this class unless you want to participate in development. 
+ + """ + + def split_grad( + self, grad: ArrayLike, hess: ArrayLike + ) -> Tuple[ArrayLike, ArrayLike]: + """Provide different gradient type for finding tree structure.""" + return grad, hess diff --git a/python-package/xgboost/testing/multi_target.py b/python-package/xgboost/testing/multi_target.py index 3a932e3e478f..212f4c82b7ce 100644 --- a/python-package/xgboost/testing/multi_target.py +++ b/python-package/xgboost/testing/multi_target.py @@ -1,6 +1,6 @@ """Tests for multi-target training.""" -from typing import Callable, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple import numpy as np from sklearn.datasets import ( @@ -11,7 +11,9 @@ import xgboost.testing as tm -from ..core import Booster, QuantileDMatrix +from .._typing import ArrayLike +from ..core import Booster, DMatrix, QuantileDMatrix +from ..objective import TreeObjective from ..sklearn import XGBClassifier from ..training import train from .updater import ResetStrategy @@ -64,25 +66,25 @@ def run_reduced_grad(device: Device) -> None: """Basic test for using reduced gradient for tree splits.""" import cupy as cp - def reducer( - grad: np.ndarray, hess: np.ndarray, dtrain: QuantileDMatrix - ) -> tuple[cp.ndarray, cp.ndarray]: - return cp.array(grad), cp.array(hess) + class LsObj(TreeObjective): + def __call__( + self, y_pred: ArrayLike, dtrain: DMatrix + ) -> Tuple[cp.ndarray, cp.ndarray]: + y_true = dtrain.get_label().reshape(y_pred.shape) + grad, hess = tm.ls_obj(y_true, y_pred, None) + return cp.array(grad), cp.array(hess) - def ls_obj( - y_pred: np.ndarray, dtrain: QuantileDMatrix - ) -> Tuple[cp.ndarray, cp.ndarray]: - # no weight yet - y_true = dtrain.get_label().reshape(y_pred.shape) - grad, hess = tm.ls_obj(y_true, y_pred, None) - return cp.array(grad), cp.array(hess) + def split_grad( + self, grad: ArrayLike, hess: ArrayLike + ) -> Tuple[ArrayLike, ArrayLike]: + return cp.array(grad), cp.array(hess) X, y = make_regression( n_samples=1024, n_features=16, random_state=1994, n_targets=5 ) Xy = QuantileDMatrix(X, y) - def run_test(reducer: Optional[Callable]) -> Booster: + def run_test(obj: Optional[TreeObjective]) -> Booster: evals_result: Dict[str, Dict] = {} booster = train( { @@ -92,26 +94,29 @@ def run_test(reducer: Optional[Callable]) -> Booster: }, Xy, evals=[(Xy, "Train")], - red=reducer, - obj=ls_obj, + obj=obj, num_boost_round=8, evals_result=evals_result, ) assert tm.non_increasing(evals_result["Train"]["rmse"]) return booster - booster_0 = run_test(reducer) + booster_0 = run_test(LsObj()) booster_1 = run_test(None) np.testing.assert_allclose( booster_0.inplace_predict(X), booster_1.inplace_predict(X) ) # Use mean gradient, should still converge. 
- def reducer_1( - grad: np.ndarray, hess: np.ndarray, dtrain: QuantileDMatrix - ) -> tuple[cp.ndarray, cp.ndarray]: - sgrad = cp.mean(grad, axis=1) - shess = cp.mean(hess, axis=1) - return sgrad, shess - - run_test(reducer_1) + class LsObj1(LsObj): + def split_grad( + self, grad: ArrayLike, hess: ArrayLike + ) -> Tuple[cp.ndarray, cp.ndarray]: + sgrad = cp.mean(grad, axis=1) + shess = cp.mean(hess, axis=1) + print(shess.shape) + assert False + return sgrad, shess + + # booster_2 = run_test(LsObj1()) + # print(booster_2.get_dump()) diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 233b62331bb0..75cebeff60d4 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -56,7 +56,6 @@ def train( *, evals: Optional[Sequence[Tuple[DMatrix, str]]] = None, obj: Optional[Objective] = None, - red: Optional[Objective] = None, # fixme: type maximize: Optional[bool] = None, early_stopping_rounds: Optional[int] = None, evals_result: Optional[TrainingCallback.EvalsLog] = None, @@ -197,7 +196,7 @@ def train( for i in range(start_iteration, num_boost_round): if cb_container.before_iteration(bst, i, dtrain, evals): break - bst.update(dtrain, iteration=i, fobj=obj, fred=red) + bst.update(dtrain, iteration=i, fobj=obj) if cb_container.after_iteration(bst, i, dtrain, evals): break From 849248edc71ecb9ed3ca424a80ae5b80d0dc45e0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 19:36:35 +0800 Subject: [PATCH 03/24] tests. --- python-package/xgboost/core.py | 1 - .../xgboost/testing/multi_target.py | 42 ++++++++++++++----- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 2a40d19beef7..84ac51751073 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2493,7 +2493,6 @@ def grad_arrinf(array: NumpyOrCupy) -> bytes: if _vgrad is not None or _vhess is not None: assert _vhess is not None and _vgrad is not None - print("use with obj") _check_call( _LIB.XGBoosterTrainOneIterWithObj( self.handle, diff --git a/python-package/xgboost/testing/multi_target.py b/python-package/xgboost/testing/multi_target.py index 212f4c82b7ce..8d914217d4fb 100644 --- a/python-package/xgboost/testing/multi_target.py +++ b/python-package/xgboost/testing/multi_target.py @@ -3,6 +3,7 @@ from typing import Dict, Optional, Tuple import numpy as np +import pytest from sklearn.datasets import ( make_classification, make_multilabel_classification, @@ -13,7 +14,7 @@ from .._typing import ArrayLike from ..core import Booster, DMatrix, QuantileDMatrix -from ..objective import TreeObjective +from ..objective import Objective, TreeObjective from ..sklearn import XGBClassifier from ..training import train from .updater import ResetStrategy @@ -66,7 +67,7 @@ def run_reduced_grad(device: Device) -> None: """Basic test for using reduced gradient for tree splits.""" import cupy as cp - class LsObj(TreeObjective): + class LsObj0(TreeObjective): def __call__( self, y_pred: ArrayLike, dtrain: DMatrix ) -> Tuple[cp.ndarray, cp.ndarray]: @@ -79,18 +80,29 @@ def split_grad( ) -> Tuple[ArrayLike, ArrayLike]: return cp.array(grad), cp.array(hess) + class LsObj1(Objective): + def __call__( + self, y_pred: ArrayLike, dtrain: DMatrix + ) -> Tuple[cp.ndarray, cp.ndarray]: + y_true = dtrain.get_label().reshape(y_pred.shape) + grad, hess = tm.ls_obj(y_true, y_pred, None) + return cp.array(grad), cp.array(hess) + X, y = make_regression( n_samples=1024, n_features=16, 
random_state=1994, n_targets=5 ) Xy = QuantileDMatrix(X, y) - def run_test(obj: Optional[TreeObjective]) -> Booster: + def run_test( + obj: Optional[Objective], base_score: Optional[list[float]] = None + ) -> Booster: evals_result: Dict[str, Dict] = {} booster = train( { "device": device, "multi_strategy": "multi_output_tree", "learning_rate": 1, + "base_score": base_score, }, Xy, evals=[(Xy, "Train")], @@ -101,22 +113,32 @@ def run_test(obj: Optional[TreeObjective]) -> Booster: assert tm.non_increasing(evals_result["Train"]["rmse"]) return booster - booster_0 = run_test(LsObj()) - booster_1 = run_test(None) + booster_0 = run_test(LsObj0()) + booster_1 = run_test(LsObj1()) np.testing.assert_allclose( booster_0.inplace_predict(X), booster_1.inplace_predict(X) ) + booster_2 = run_test(LsObj0(), [0.5] * y.shape[1]) + booster_3 = run_test(None, [0.5] * y.shape[1]) + np.testing.assert_allclose( + booster_2.inplace_predict(X), booster_3.inplace_predict(X) + ) + # Use mean gradient, should still converge. - class LsObj1(LsObj): + class LsObj2(LsObj0): + def __init__(self, check_used: bool): + self._chk = check_used + def split_grad( self, grad: ArrayLike, hess: ArrayLike ) -> Tuple[cp.ndarray, cp.ndarray]: + if self._chk: + assert False sgrad = cp.mean(grad, axis=1) shess = cp.mean(hess, axis=1) - print(shess.shape) - assert False return sgrad, shess - # booster_2 = run_test(LsObj1()) - # print(booster_2.get_dump()) + run_test(LsObj2(False)) + with pytest.raises(AssertionError): + run_test(LsObj2(True)) From ac833ec576e3f8bb908522904976a82a84efcc8d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 20:27:42 +0800 Subject: [PATCH 04/24] cpu build. --- src/objective/adaptive.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc index abbde7f3c0e9..3f8fe4f2a17e 100644 --- a/src/objective/adaptive.cc +++ b/src/objective/adaptive.cc @@ -163,7 +163,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit } #if !defined(XGBOOST_USE_CUDA) -void UpdateTreeLeafDevice(Context const*, common::Span, std::int32_t, +void UpdateTreeLeafDevice(Context const*, common::Span, bst_target_t, MetaInfo const&, float, HostDeviceVector const&, float, RegTree*) { common::AssertGPUSupport(); } From bf83fe2d88c09557d80e2e3142e6c7ac749b4dfc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 20:29:29 +0800 Subject: [PATCH 05/24] lint. 
--- src/gbm/gbtree.cc | 5 +++-- src/learner.cc | 19 ------------------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 219d14bba2cf..3239742452f1 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -266,7 +266,8 @@ void GBTree::DoBoost(DMatrix* p_fmat, GradientContainer* in_gpair, PredictionCac } } else { // Multi-target, scalar leaf - CHECK_EQ(in_gpair->gpair.Size() % n_groups, 0U) << "Must have exactly n_groups * n_samples gpairs."; + CHECK_EQ(in_gpair->gpair.Size() % n_groups, 0U) + << "Must have exactly n_groups * n_samples gpairs."; GradientContainer tmp; tmp.gpair = linalg::Matrix{ {in_gpair->gpair.Shape(0), static_cast(1ul)}, ctx_->Device()}; @@ -342,7 +343,7 @@ void GBTree::BoostNewTrees(GradientContainer* gpair, DMatrix* p_fmat, int bst_gr "Mismatching size between number of rows from input data and size of gradient vector."}; if (!model_.learner_model_param->IsVectorLeaf() && p_fmat->Info().num_row_ != 0) { CHECK_EQ(n_out % gpair->gpair.Size(), 0) << msg; - } else if (model_.learner_model_param->IsVectorLeaf()){ + } else if (model_.learner_model_param->IsVectorLeaf()) { // vector leaf if (!gpair->HasValueGrad()) { CHECK_EQ(gpair->gpair.Size(), n_out) << msg; diff --git a/src/learner.cc b/src/learner.cc index 8faf0506c5d5..92d871f4ff9f 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1150,25 +1150,6 @@ class LearnerImpl : public LearnerIO { monitor_.Stop("UpdateOneIter"); } - // void BoostOneIter(int iter, std::shared_ptr train, - // linalg::Matrix* in_gpair) override { - // monitor_.Start("BoostOneIter"); - // this->Configure(); - - // if (ctx_.seed_per_iteration) { - // common::GlobalRandom().seed(ctx_.seed * kRandSeedMagic + iter); - // } - - // this->ValidateDMatrix(train.get(), true); - - // CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1)) - // << "The number of columns in gradient should be equal to the number of targets/classes in " - // "the model."; - // auto predt = prediction_container_.Cache(train, ctx_.Device()); - // gbm_->DoBoost(train.get(), in_gpair, predt.get(), obj_.get()); - // monitor_.Stop("BoostOneIter"); - // } - void BoostOneIter(std::int32_t iter, std::shared_ptr train, GradientContainer* in_gpair) override { this->monitor_.Start(__func__); From 37b5474d43a1d6a5331c691ed3fa2ba228e77165 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 20:40:58 +0800 Subject: [PATCH 06/24] cleanup. --- src/learner.cc | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 2 +- src/tree/updater_gpu_hist.cuh | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/learner.cc b/src/learner.cc index 92d871f4ff9f..a5a28550bdb7 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1160,7 +1160,7 @@ class LearnerImpl : public LearnerIO { } this->ValidateDMatrix(train.get(), true); - // fixme: avoid duplicated code, including the error message + CHECK_GE(this->learner_model_param_.OutputLength(), in_gpair->value_gpair.Shape(1)) << "The number of columns in gradient should be equal to or lesser than the number of " "targets/classes in the model."; diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index bc22e80cec16..1d51eda307ed 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -304,7 +304,7 @@ class RowPartitioner { [[nodiscard]] bst_node_t GetNumNodes() const { return n_nodes_; } /** - * @brief Convenient method for testing. + * @brief Convenience method for testing. 
*/ std::vector GetRowsHost(bst_node_t nidx); diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index 0965d1f9ace8..74b017219c2a 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -123,7 +123,6 @@ class MultiTargetHistMaker { dh::device_vector CalcRootSum( linalg::MatrixView d_gpair, common::Span roundings) const { - // fixme: merge with fit stump. auto n_samples = d_gpair.Shape(0); auto n_targets = d_gpair.Shape(1); // Calculate the root sum @@ -402,14 +401,14 @@ class MultiTargetHistMaker { } } - void GrowTree(linalg::Matrix* splti_gpair, DMatrix* p_fmat, ObjInfo const*, + void GrowTree(linalg::Matrix* split_gpair, DMatrix* p_fmat, ObjInfo const*, RegTree* p_tree) { if (this->param_.learning_rate - 1.0 != 0.0) { LOG(FATAL) << "GPU" << MTNotImplemented(); } Driver driver{param_, kMaxNodeBatchSize}; - this->Reset(splti_gpair, p_fmat); + this->Reset(split_gpair, p_fmat); driver.Push({this->InitRoot(p_fmat, p_tree)}); // The set of leaves that can be expanded asynchronously From db2c7638a0de5ea36882ce613c5983764321cc9b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 20:45:02 +0800 Subject: [PATCH 07/24] sycl. --- plugin/sycl/tree/updater_quantile_hist.cc | 5 +++-- plugin/sycl/tree/updater_quantile_hist.h | 22 ++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/plugin/sycl/tree/updater_quantile_hist.cc b/plugin/sycl/tree/updater_quantile_hist.cc index a8fe602e6399..5aa89f6222a1 100644 --- a/plugin/sycl/tree/updater_quantile_hist.cc +++ b/plugin/sycl/tree/updater_quantile_hist.cc @@ -8,6 +8,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wtautological-constant-compare" #pragma GCC diagnostic ignored "-W#pragma-messages" +#include "xgboost/gradient.h" // for GradientContainer #include "xgboost/tree_updater.h" #pragma GCC diagnostic pop @@ -72,11 +73,11 @@ void QuantileHistMaker::CallUpdate( } } -void QuantileHistMaker::Update(xgboost::tree::TrainParam const *param, - linalg::Matrix* gpair, +void QuantileHistMaker::Update(xgboost::tree::TrainParam const *param, GradientContainer *in_gpair, DMatrix *dmat, xgboost::common::Span> out_position, const std::vector &trees) { + auto gpair = in_gpair->FullGradOnly(); gpair->Data()->SetDevice(ctx_->Device()); if (dmat != p_last_dmat_ || is_gmat_initialized_ == false) { updater_monitor_.Start("GmatInitialization"); diff --git a/plugin/sycl/tree/updater_quantile_hist.h b/plugin/sycl/tree/updater_quantile_hist.h index e60153fa7d32..d89b07d80ccf 100644 --- a/plugin/sycl/tree/updater_quantile_hist.h +++ b/plugin/sycl/tree/updater_quantile_hist.h @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2017-2024 by Contributors
+/**
+ * Copyright 2017-2025, XGBoost Contributors
  * \file updater_quantile_hist.h
  */
 #ifndef PLUGIN_SYCL_TREE_UPDATER_QUANTILE_HIST_H_
@@ -8,21 +8,21 @@
 #include
 #include
-#include
 #include
+#include
-#include "../data/gradient_index.h"
+#include "../../src/common/random.h"
+#include "../../src/tree/constraints.h"
 #include "../common/hist_util.h"
-#include "../common/row_set.h"
 #include "../common/partition_builder.h"
-#include "split_evaluator.h"
+#include "../common/row_set.h"
+#include "../data/gradient_index.h"
 #include "../device_manager.h"
 #include "hist_updater.h"
+#include "split_evaluator.h"
 #include "xgboost/data.h"
-
+#include "xgboost/gradient.h"  // for GradientContainer
 #include "xgboost/json.h"
-#include "../../src/tree/constraints.h"
-#include "../../src/common/random.h"

 namespace xgboost {
 namespace sycl {
@@ -48,9 +48,7 @@ class QuantileHistMaker: public TreeUpdater {
   }
   void Configure(const Args& args) override;

-  void Update(xgboost::tree::TrainParam const *param,
-              linalg::Matrix* gpair,
-              DMatrix* dmat,
+  void Update(xgboost::tree::TrainParam const* param, GradientContainer* in_gpair, DMatrix* dmat,
               xgboost::common::Span> out_position,
               const std::vector& trees) override;

From 009c14d1ea2b47c97a64c6c2b6641ad5da0bd47e0 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 7 Nov 2025 20:49:26 +0800
Subject: [PATCH 08/24] rename.

---
 python-package/xgboost/core.py |  2 +-
 src/c_api/c_api.cc             | 26 +++++++++++++++++---------
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 84ac51751073..016bb4254d0a 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2494,7 +2494,7 @@ def grad_arrinf(array: NumpyOrCupy) -> bytes:
         if _vgrad is not None or _vhess is not None:
             assert _vhess is not None and _vgrad is not None
             _check_call(
-                _LIB.XGBoosterTrainOneIterWithObj(
+                _LIB.XGBoosterTrainOneIterWithSplitGrad(
                     self.handle,
                     dtrain.handle,
                     iteration,
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 35bc730044ed..3d99dbf89d9e 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1252,20 +1252,27 @@ XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, in
   API_END();
 }

-// Hidden, experimental
-// fixme: find a better way to consume gradients, maybe expose the objective.
-//
-// We can not obtain the gradient from built-in objectives without making copy due to
-// array-of-structs.
-XGB_DLL int XGBoosterTrainOneIterWithObj(BoosterHandle handle, DMatrixHandle dtrain, int iter,
-                                         char const *split_grad, char const *split_hess,
-                                         char const *value_grad, char const *value_hess) {
+typedef char const *JArrayStr;  // NOLINT(modernize-use-using)
+
+// Hidden, work-in-progress support for reduced gradient. CUDA-only at the moment.
+/**
+ * @brief Use a different type of gradient for tree splits.
+ *
+ * @param split_grad Gradient for finding tree splits.
+ * @param split_hess Hessian for finding tree splits.
+ * @param value_grad Gradient for calculating tree leaf weight.
+ * @param value_hess Hessian for calculating tree leaf weight.
+ */
+XGB_DLL int XGBoosterTrainOneIterWithSplitGrad(BoosterHandle handle, DMatrixHandle dtrain,
+                                               int iter, JArrayStr split_grad,
+                                               JArrayStr split_hess, JArrayStr value_grad,
+                                               JArrayStr value_hess) {
   API_BEGIN();
   CHECK_HANDLE();
   auto *learner = static_cast(handle);
   GradientContainer gpair;
   auto ctx = learner->Ctx();
-
+  CHECK(ctx->IsCUDA()) << "Reduced gradient with CPU" << MTNotImplemented();
   {
     ArrayInterface<2, false> i_grad{StringView{split_grad}};
     ArrayInterface<2, false> i_hess{StringView{split_hess}};
@@ -1280,6 +1287,7 @@ XGB_DLL int XGBoosterTrainOneIterWithObj(BoosterHandle handle, DMatrixHandle dtr
         << "Reduced gradient with CPU" << MTNotImplemented();
     CopyGradientFromCudaArrays(ctx, i_grad, i_hess, &gpair.value_gpair);
   }
+
   auto p_fmat = CastDMatrixHandle(dtrain);
   learner->BoostOneIter(iter, p_fmat, &gpair);

From 165c36a2d8f4a7f4f98487f0f04bd6e66120847b Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 7 Nov 2025 21:13:39 +0800
Subject: [PATCH 09/24] Cleanup.

---
 src/common/device_debug.cuh                |  5 +++--
 src/tree/gpu_hist/multi_evaluate_splits.cu |  2 +-
 src/tree/gpu_hist/row_partitioner.cuh      | 17 ++++++++++++++++-
 src/tree/leaf_sum.cu                       | 16 +++++-----------
 src/tree/leaf_sum.cuh                      |  4 ++--
 src/tree/tree_model.cc                     |  1 -
 src/tree/updater_gpu_hist.cuh              | 22 +++++++++------------
 7 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/src/common/device_debug.cuh b/src/common/device_debug.cuh
index fa36a3f3c8dd..9341f2fe0a78 100644
--- a/src/common/device_debug.cuh
+++ b/src/common/device_debug.cuh
@@ -1,8 +1,9 @@
 /**
  * Copyright 2025, XGBoost contributors
  */
-#include  // for size_t
-#include  // for vector
+#include  // for size_t
+#include  // for cout
+#include  // for vector

 #include "common.h"
 #include "device_helpers.cuh"  // for CopyDeviceSpanToVector
diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu
index 6157e5a8ac03..a9fd02fcb829 100644
--- a/src/tree/gpu_hist/multi_evaluate_splits.cu
+++ b/src/tree/gpu_hist/multi_evaluate_splits.cu
@@ -275,7 +275,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
                                         dh::ToSpan(d_splits));

   // Find best split for each node
-  // * 3 because of three nodes, parent, left, right
+  // * 3 because of base, left, right weights.
this->weights_.resize(n_nodes * n_targets * 3); auto d_weights = dh::ToSpan(this->weights_);
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 1d51eda307ed..5e0dc68661d6 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -208,6 +208,11 @@ struct NodePositionInfo { [[nodiscard]] XGBOOST_DEVICE bool IsLeaf() const { return left_child == -1; } }; +struct LeafInfo { + bst_node_t nidx; + NodePositionInfo node; +}; + XGBOOST_DEV_INLINE int GetPositionFromSegments(std::size_t idx, const NodePositionInfo* d_node_info) { int position = 0; @@ -308,7 +313,17 @@ class RowPartitioner { */ std::vector<RowIndexT> GetRowsHost(bst_node_t nidx); - std::vector<NodePositionInfo> const& GetSegmentsHost() const { return this->ridx_segments_; } + [[nodiscard]] std::vector<LeafInfo> GetLeaves() const { + std::vector<LeafInfo> leaves; + bst_node_t nidx = 0; + for (auto const& node : this->ridx_segments_) { + if (node.IsLeaf()) { + leaves.emplace_back(LeafInfo{nidx, node}); + } + nidx += 1; + } + return leaves; + } /** * \brief Updates the tree position for set of training instances being split
diff --git a/src/tree/leaf_sum.cu b/src/tree/leaf_sum.cu index b81515f29b64..a83ab076259c 100644 --- a/src/tree/leaf_sum.cu +++ b/src/tree/leaf_sum.cu @@ -5,7 +5,7 @@ #include "../common/linalg_op.cuh" // for tbegin #include "gpu_hist/quantiser.cuh" // for GradientQuantiser -#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, NodePositionInfo +#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, LeafInfo #include "leaf_sum.cuh" #include "updater_gpu_common.cuh" // for GPUTrainingParam #include "xgboost/base.h" // for GradientPairInt64 @@ -14,25 +14,19 @@ #include "xgboost/span.h" // for Span namespace xgboost::tree::cuda_impl { -void LeafGradSum(Context const* ctx, std::vector<NodePositionInfo> const& h_segments, +void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves, common::Span<GradientQuantiser const> roundings, common::Span<RowIndexT const> sorted_ridx, linalg::MatrixView<GradientPair const> grad, linalg::MatrixView<GradientPairInt64> out_sum) { - std::vector<NodePositionInfo> h_leaves; - for (auto const& node : h_segments) { - if (node.IsLeaf()) { - h_leaves.push_back(node); - } - } CHECK_EQ(h_leaves.size(), out_sum.Shape(0)); - dh::device_vector<NodePositionInfo> leaves(h_leaves); + dh::device_vector<LeafInfo> leaves(h_leaves); auto d_leaves = dh::ToSpan(leaves); std::vector<std::size_t> h_indptr{0}; for (auto const& node : h_leaves) { - h_indptr.push_back(node.segment.Size()); + h_indptr.push_back(node.node.segment.Size()); } // leaves form a complete partition dh::device_vector<std::size_t> indptr{h_indptr}; @@ -50,7 +44,7 @@ void LeafGradSum(Context const* ctx, std::vector<NodePositionInfo> const& h_segm auto it = dh::MakeIndexTransformIter([=] XGBOOST_DEVICE(std::size_t i) { auto nidx_in_set = dh::SegmentId(d_indptr, i); auto k = i - d_indptr[nidx_in_set]; - auto j = d_leaves[nidx_in_set].segment.begin + k; + auto j = d_leaves[nidx_in_set].node.segment.begin + k; auto g = grad(sorted_ridx[j], t); return roundings[t].ToFixedPoint(g); });
diff --git a/src/tree/leaf_sum.cuh b/src/tree/leaf_sum.cuh index 4829eae1ed3e..4b880567f3f2 100644 --- a/src/tree/leaf_sum.cuh +++ b/src/tree/leaf_sum.cuh @@ -6,7 +6,7 @@ #include <vector> // for vector #include "gpu_hist/quantiser.cuh" // for GradientQuantiser -#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, NodePositionInfo +#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, LeafInfo #include "updater_gpu_common.cuh" // for GPUTrainingParam #include "xgboost/context.h" // for Context #include "xgboost/linalg.h" // for MatrixView namespace xgboost::tree::cuda_impl { //
shape(out_sum) == (n_leaves, n_targets) -void LeafGradSum(Context const* ctx, std::vector<NodePositionInfo> const& h_segments, +void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves, common::Span<GradientQuantiser const> roundings, common::Span<RowIndexT const> sorted_ridx, linalg::MatrixView<GradientPair const> grad,
diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 96883d16a60b..6e8176fce21e 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -890,7 +890,6 @@ void RegTree::ExpandNode(bst_node_t nidx, bst_feature_t split_index, float split } void RegTree::SetLeaves(std::vector<bst_node_t> leaves, common::Span<float const> weights) { - // fixme: cleanup CHECK(IsMultiTarget()); this->p_mt_tree_->SetLeaves(std::move(leaves), weights); }
diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index 74b017219c2a..bb9dceaca065 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -208,28 +208,24 @@ class MultiTargetHistMaker { } void UpdateTreeLeaf(linalg::Matrix<GradientPair> const& full_grad, RegTree* p_tree) const { - auto const& h_segments = this->partitioners_.front()->GetSegmentsHost(); - std::vector<bst_node_t> leaves; - bst_node_t nidx = 0; - for (auto const& node : h_segments) { - if (node.IsLeaf()) { - leaves.push_back(nidx); - } - nidx += 1; - } + // TODO(jiamingy): Need to iterate through partitioners for external memory support. + auto leaves = this->partitioners_.front()->GetLeaves(); // Calculate the leaf weight based on the node sum for each leaf. // Update the leaf weight, with learning rate. linalg::Matrix<GradientPairInt64> out_sum( {leaves.size(), static_cast<std::size_t>(p_tree->NumTargets())}, this->ctx_->Device()); - LeafGradSum(this->ctx_, this->partitioners_.front()->GetSegmentsHost(), - this->value_quantizer_->Quantizers(), this->partitioners_.front()->GetRows(), - full_grad.View(this->ctx_->Device()), out_sum.View(this->ctx_->Device())); + LeafGradSum(this->ctx_, leaves, this->value_quantizer_->Quantizers(), + this->partitioners_.front()->GetRows(), full_grad.View(this->ctx_->Device()), + out_sum.View(this->ctx_->Device())); auto param = GPUTrainingParam{this->param_}; linalg::Matrix<float> out_weight = linalg::Empty<float>(this->ctx_, leaves.size(), p_tree->NumTargets()); LeafWeight(this->ctx_, param, this->value_quantizer_->Quantizers(), out_sum.View(this->ctx_->Device()), out_weight.View(this->ctx_->Device())); - p_tree->SetLeaves(leaves, out_weight.Data()->ConstHostSpan()); + std::vector<bst_node_t> leaves_idx(leaves.size()); + std::transform(leaves.begin(), leaves.end(), leaves_idx.begin(), + [](LeafInfo const& leaf) { return leaf.nidx; }); + p_tree->SetLeaves(leaves_idx, out_weight.Data()->ConstHostSpan()); } struct NodeSplitData {
From 354f08ccaffc1ff6e3af4b2f0395845da0bd03a7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 21:17:57 +0800 Subject: [PATCH 10/24] Cleanup.
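This cleanup also tightens the gradient shape validation in the learner: the
split gradient may have at most as many columns as the model has targets,
while the value gradient, when present, must match the model's output length
exactly. A minimal host-side sketch of these invariants follows;
GradientShapes and ValidateGradientShapes are illustrative names, not the
actual GradientContainer API:

    #include <cstdint>
    #include <stdexcept>

    // Mirrors the CHECKs added to LearnerImpl::BoostOneIter below.
    struct GradientShapes {
      std::int64_t n_model_targets;
      std::int64_t n_split_targets;
      std::int64_t n_value_targets;  // 0 when no value gradient is supplied
    };

    void ValidateGradientShapes(GradientShapes const& s) {
      if (s.n_value_targets != 0 && s.n_value_targets != s.n_model_targets) {
        throw std::invalid_argument(
            "Value gradient should have the same number of targets as the overall model.");
      }
      if (s.n_split_targets > s.n_model_targets) {
        throw std::invalid_argument(
            "The split gradient cannot have more targets than the model.");
      }
    }

    int main() {
      // A reduced split gradient (1 column) for a 5-target model is accepted.
      ValidateGradientShapes({/*model=*/5, /*split=*/1, /*value=*/5});
      return 0;
    }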
--- src/common/device_debug.cuh | 2 ++ src/learner.cc | 6 +++++- src/tree/gpu_hist/multi_evaluate_splits.cu | 1 - src/tree/updater_gpu_hist.cuh | 1 - 4 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/src/common/device_debug.cuh b/src/common/device_debug.cuh index 9341f2fe0a78..6a2dfd285ea4 100644 --- a/src/common/device_debug.cuh +++ b/src/common/device_debug.cuh @@ -1,6 +1,8 @@ /** * Copyright 2025, XGBoost contributors */ +#pragma once + #include <cstddef> // for size_t #include <iostream> // for cout #include <vector> // for vector
diff --git a/src/learner.cc b/src/learner.cc index a5a28550bdb7..ceb8303d77d4 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1161,9 +1161,13 @@ class LearnerImpl : public LearnerIO { this->ValidateDMatrix(train.get(), true); - CHECK_GE(this->learner_model_param_.OutputLength(), in_gpair->value_gpair.Shape(1)) + CHECK_GE(this->learner_model_param_.OutputLength(), in_gpair->NumSplitTargets()) << "The number of columns in gradient should be equal to or lesser than the number of " "targets/classes in the model."; + if (in_gpair->HasValueGrad()) { + CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->NumTargets()) + << "Value gradient should have the same number of targets as the overall model."; + } auto predt = prediction_container_.Cache(train, ctx_.Device()); this->gbm_->DoBoost(train.get(), in_gpair, predt.get(), obj_.get()); this->monitor_.Stop(__func__);
diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu index a9fd02fcb829..22bc31118164 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cu +++ b/src/tree/gpu_hist/multi_evaluate_splits.cu @@ -260,7 +260,6 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, scans[nidx_in_set] = dh::ToSpan(this->scan_buffer_) .subspan(nidx_in_set * node_hist_size * 2, node_hist_size * 2); } - // fixme: make sure root sum is copied. // Launch histogram scan kernel dim3 grid{n_nodes, n_features, n_targets};
diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index bb9dceaca065..ca8e9fcfe69d 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -411,7 +411,6 @@ class MultiTargetHistMaker { auto expand_set = driver.Pop(); while (!expand_set.empty()) { for (auto& candidate : expand_set) { - // fixme: prevent node size == 0 this->ApplySplit(candidate, p_tree); }
From e6fdc9a474f2c6b56729ca6fb39874fbfe225dfc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 21:20:28 +0800 Subject: [PATCH 11/24] lint. --- src/c_api/c_api.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 3d99dbf89d9e..b38ea6b8e2c4 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1252,7 +1252,7 @@ XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, in API_END(); } -typedef char const *JArrayStr; // NOLINT(modernize-use-using) +typedef char const *JArrayStr; // NOLINT // Hidden, work-in-progress support for reduced gradient. CUDA-only at the moment. /** * @brief Use a different type of gradient for tree splits. * * @param split_grad Gradient for finding tree splits. * @param split_hess Hessian for finding tree splits. - * @param value_grad Gradient for calculating tree leaf weight. - * @param value_hess Hessian for calculating tree leaf weight. + * @param value_grad Gradient for calculating tree leaf weights. + * @param value_hess Hessian for calculating tree leaf weights.
*/ XGB_DLL int XGBoosterTrainOneIterWithWithSplitGrad(BoosterHandle handle, DMatrixHandle dtrain, int iter, JArrayStr split_grad, From e937028efe2ce8002813e8b02af608c275cd2070 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 22:04:11 +0800 Subject: [PATCH 12/24] Lint. --- src/tree/updater_gpu_hist.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index ca8e9fcfe69d..e348947e7f39 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -146,7 +146,6 @@ class MultiTargetHistMaker { [[nodiscard]] MultiExpandEntry InitRoot(DMatrix* p_fmat, RegTree* p_tree) { auto d_gpair = split_gpair_.View(ctx_->Device()); - auto n_samples = d_gpair.Shape(0); auto n_targets = d_gpair.Shape(1); // Calculate the root sum From efb83ea52c9c135485ba5b85f71d6c4f8569cf0a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 7 Nov 2025 22:09:00 +0800 Subject: [PATCH 13/24] Fix test. --- src/learner.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/learner.cc b/src/learner.cc index ceb8303d77d4..5a10eda78832 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1160,13 +1160,13 @@ class LearnerImpl : public LearnerIO { } this->ValidateDMatrix(train.get(), true); - - CHECK_GE(this->learner_model_param_.OutputLength(), in_gpair->NumSplitTargets()) - << "The number of columns in gradient should be equal to or lesser than the number of " - "targets/classes in the model."; if (in_gpair->HasValueGrad()) { CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->NumTargets()) << "Value gradient should have the same number of targets as the overall model."; + } else { + CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->NumSplitTargets()) + << "The number of columns in gradient should be equal to the number of " + "targets/classes in the model."; } auto predt = prediction_container_.Cache(train, ctx_.Device()); this->gbm_->DoBoost(train.get(), in_gpair, predt.get(), obj_.get()); From bfcdf188bab95ef1268c41131bbe47cdc9fc789d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 00:04:54 +0800 Subject: [PATCH 14/24] Cleanup. --- python-package/xgboost/core.py | 2 +- src/c_api/c_api.cc | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 016bb4254d0a..58aef079a839 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2494,7 +2494,7 @@ def grad_arrinf(array: NumpyOrCupy) -> bytes: if _vgrad is not None or _vhess is not None: assert _vhess is not None and _vgrad is not None _check_call( - _LIB.XGBoosterTrainOneIterWithWithSplitGrad( + _LIB.XGBoosterTrainOneIterWithSplitGrad( self.handle, dtrain.handle, iteration, diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index b38ea6b8e2c4..bb552276f0d8 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1263,10 +1263,9 @@ typedef char const *JArrayStr; // NOLINT * @param value_grad Gradient for calculating tree leaf weights. * @param value_hess Hessian for calculating tree leaf weights. 
*/ -XGB_DLL int XGBoosterTrainOneIterWithWithSplitGrad(BoosterHandle handle, DMatrixHandle dtrain, - int iter, JArrayStr split_grad, - JArrayStr split_hess, JArrayStr value_grad, - JArrayStr value_hess) { +XGB_DLL int XGBoosterTrainOneIterWithSplitGrad(BoosterHandle handle, DMatrixHandle dtrain, int iter, + JArrayStr split_grad, JArrayStr split_hess, + JArrayStr value_grad, JArrayStr value_hess) { API_BEGIN(); CHECK_HANDLE(); auto *learner = static_cast(handle); From 1c96961f20ea44d4a07a21207c88dc943c6a0526 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 00:37:12 +0800 Subject: [PATCH 15/24] Device tree. --- src/tree/updater_gpu_hist.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index e348947e7f39..d71709c6207d 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -308,6 +308,8 @@ class MultiTargetHistMaker { histogram_.AllocateHistograms(ctx_, build_nidx); + // Pull to device + mt_tree = MultiTargetTreeView{this->ctx_->Device(), p_tree}; std::int32_t k{0}; // TODO(jiamingy): Support external memory. bool prefetch_copy = true; From d523a96d1854431e48d27e05118868934d5f21be Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 00:58:27 +0800 Subject: [PATCH 16/24] pylint. --- python-package/xgboost/testing/multi_target.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python-package/xgboost/testing/multi_target.py b/python-package/xgboost/testing/multi_target.py index 8d914217d4fb..ccfa5a83cb1c 100644 --- a/python-package/xgboost/testing/multi_target.py +++ b/python-package/xgboost/testing/multi_target.py @@ -45,6 +45,7 @@ def run_multiclass(device: Device, learning_rate: Optional[float]) -> None: def run_multilabel(device: Device, learning_rate: Optional[float]) -> None: """Use vector leaf for multi-label classification models.""" + # pylint: disable=unbalanced-tuple-unpacking X, y = make_multilabel_classification(128, random_state=2025) clf = XGBClassifier( multi_strategy="multi_output_tree", @@ -68,6 +69,8 @@ def run_reduced_grad(device: Device) -> None: import cupy as cp class LsObj0(TreeObjective): + """Split grad is the same as value grad.""" + def __call__( self, y_pred: ArrayLike, dtrain: DMatrix ) -> Tuple[cp.ndarray, cp.ndarray]: @@ -81,6 +84,8 @@ def split_grad( return cp.array(grad), cp.array(hess) class LsObj1(Objective): + """No split grad.""" + def __call__( self, y_pred: ArrayLike, dtrain: DMatrix ) -> Tuple[cp.ndarray, cp.ndarray]: @@ -88,7 +93,7 @@ def __call__( grad, hess = tm.ls_obj(y_true, y_pred, None) return cp.array(grad), cp.array(hess) - X, y = make_regression( + X, y = make_regression( # pylint: disable=unbalanced-tuple-unpacking n_samples=1024, n_features=16, random_state=1994, n_targets=5 ) Xy = QuantileDMatrix(X, y) @@ -127,6 +132,8 @@ def run_test( # Use mean gradient, should still converge. class LsObj2(LsObj0): + """Use mean as split grad.""" + def __init__(self, check_used: bool): self._chk = check_used From 93cdc00942db0a141b55eb2e344e1e6c5422661d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 02:09:38 +0800 Subject: [PATCH 17/24] Test. 
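The new LeafGradSum test uses identity quantizers and unit gradients: with
two leaf segments of three rows each, the per-leaf, per-target gradient sum
must come out to exactly 3. A host-side sketch of the same segmented
reduction, with illustrative names only (the real implementation runs on
device via cub::DeviceSegmentedReduce):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
      std::vector<double> grad(6, 1.0);          // unit gradient for one target
      std::vector<std::size_t> indptr{0, 3, 6};  // two leaf segments, three rows each
      for (std::size_t leaf = 0; leaf + 1 < indptr.size(); ++leaf) {
        double sum = 0.0;
        for (std::size_t i = indptr[leaf]; i < indptr[leaf + 1]; ++i) {
          sum += grad[i];
        }
        std::cout << "leaf " << leaf << " sum = " << sum << "\n";  // prints 3
      }
      return 0;
    }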
--- tests/cpp/tree/gpu_hist/dummy_quantizer.cuh | 22 +++++++++ .../gpu_hist/test_multi_evaluate_splits.cu | 4 +- .../cpp/tree/gpu_hist/test_multi_histogram.cu | 9 ++-- tests/cpp/tree/test_leaf_sum.cu | 47 +++++++++++++++++++ 4 files changed, 76 insertions(+), 6 deletions(-) create mode 100644 tests/cpp/tree/gpu_hist/dummy_quantizer.cuh create mode 100644 tests/cpp/tree/test_leaf_sum.cu
diff --git a/tests/cpp/tree/gpu_hist/dummy_quantizer.cuh b/tests/cpp/tree/gpu_hist/dummy_quantizer.cuh new file mode 100644 index 000000000000..62619634264f --- /dev/null +++ b/tests/cpp/tree/gpu_hist/dummy_quantizer.cuh @@ -0,0 +1,22 @@ +/** + * Copyright 2025, XGBoost Contributors + */ +#pragma once + +#include <xgboost/base.h> // for bst_target_t + +#include <vector> // for vector + +#include "../../../../src/common/device_vector.cuh" // for device_vector +#include "../../../../src/tree/gpu_hist/quantiser.cuh" // for GradientQuantiser + +namespace xgboost::tree { +inline auto MakeDummyQuantizers(bst_target_t n_targets) { + std::vector<GradientQuantiser> h_quantizers; + for (bst_target_t i = 0; i < n_targets; ++i) { + h_quantizers.emplace_back(GradientPairPrecise{1.0f, 1.0f}, GradientPairPrecise{1.0f, 1.0f}); + } + dh::device_vector<GradientQuantiser> d_quantizers(h_quantizers); + return d_quantizers; +} +} // namespace xgboost::tree
diff --git a/tests/cpp/tree/gpu_hist/test_multi_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_multi_evaluate_splits.cu index 6e08cc420a4e..f0b5083075a2 100644 --- a/tests/cpp/tree/gpu_hist/test_multi_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_multi_evaluate_splits.cu @@ -6,6 +6,7 @@ #include "../../../../src/tree/gpu_hist/evaluate_splits.cuh" #include "../../../../src/tree/gpu_hist/multi_evaluate_splits.cuh" #include "../../helpers.h" +#include "dummy_quantizer.cuh" // for MakeDummyQuantizers namespace xgboost::tree::cuda_impl { class GpuMultiHistEvaluatorBasicTest : public ::testing::Test { @@ -45,8 +46,7 @@ class GpuMultiHistEvaluatorBasicTest : public ::testing::Test { input.parent_sum = dh::ToSpan(parent_sum); input.histogram = dh::ToSpan(histogram); - GradientQuantiser quantizer{{1.0, 1.0}, {1.0, 1.0}}; - quantizers.resize(2, quantizer); + quantizers = MakeDummyQuantizers(2); shared_inputs.roundings = dh::ToSpan(quantizers);
diff --git a/tests/cpp/tree/gpu_hist/test_multi_histogram.cu b/tests/cpp/tree/gpu_hist/test_multi_histogram.cu index e59eb40818cc..f3911200f592 100644 --- a/tests/cpp/tree/gpu_hist/test_multi_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_multi_histogram.cu @@ -7,6 +7,7 @@ #include "../../../../src/tree/gpu_hist/histogram.cuh" #include "../../helpers.h" #include "../../histogram_helpers.h" +#include "dummy_quantizer.cuh" // for MakeDummyQuantizers namespace xgboost::tree::cuda_impl { TEST(GpuMultiHistogram, Basic) { @@ -27,17 +28,17 @@ TEST(GpuMultiHistogram, Basic) { bst_bin_t n_total_bins = n_targets * n_features * n_bins; histogram.Reset(&ctx, /*max_cached_hist_nodes=*/2, fg_acc, n_total_bins, true); - auto gpairs = linalg::Constant(&ctx, GradientPair{1, 1}, n_samples, n_targets); + auto gpairs = linalg::Constant(&ctx, GradientPair{1.0f, 1.0f}, n_samples, n_targets); dh::device_vector<RowIndexT> ridx(n_samples); thrust::sequence(ctx.CUDACtx()->CTP(), ridx.begin(), ridx.end(), 0); histogram.AllocateHistograms(&ctx, {0}); auto node_hist = histogram.GetNodeHistogram(0); - std::vector<GradientQuantiser> h_quantizers(n_targets, GradientQuantiser{{1.0, 1.0}, {1.0, 1.0}}); - dh::device_vector<GradientQuantiser> d_quantizers{h_quantizers}; + auto quantizers = MakeDummyQuantizers(n_targets); + histogram.BuildHistogram(ctx.CUDACtx(),
page->GetDeviceEllpack(&ctx, {}), fg_acc, gpairs.View(ctx.Device()), dh::ToSpan(ridx), node_hist, - dh::ToSpan(d_quantizers)); + dh::ToSpan(quantizers)); std::vector<GradientPairInt64> h_node_hist(node_hist.size()); dh::CopyDeviceSpanToVector(&h_node_hist, node_hist);
diff --git a/tests/cpp/tree/test_leaf_sum.cu b/tests/cpp/tree/test_leaf_sum.cu new file mode 100644 index 000000000000..cf9813ca2dec --- /dev/null +++ b/tests/cpp/tree/test_leaf_sum.cu @@ -0,0 +1,47 @@ +/** + * Copyright 2025, XGBoost contributors + */ +#include <gtest/gtest.h> +#include <thrust/sequence.h> // for sequence +#include <xgboost/linalg.h> // for Constant + +#include <vector> // for vector + +#include "../../../src/common/device_vector.cuh" +#include "../../../src/tree/gpu_hist/row_partitioner.cuh" // for LeafInfo +#include "../../../src/tree/leaf_sum.cuh" +#include "../helpers.h" +#include "gpu_hist/dummy_quantizer.cuh" // for MakeDummyQuantizers + +namespace xgboost::tree::cuda_impl { +TEST(LeafGradSum, Basic) { + auto ctx = MakeCUDACtx(0); + + bst_target_t n_targets = 2; + bst_idx_t n_samples = 6; + bst_idx_t n_leaves = 2; + + // Create leaf information + std::vector<LeafInfo> h_leaves(n_leaves); + h_leaves[0].nidx = 1; + h_leaves[0].node.segment = Segment{0, 3}; + h_leaves[1].nidx = 2; + h_leaves[1].node.segment = Segment{3, 6}; + + auto gpairs = linalg::Constant(&ctx, GradientPair{1.0f, 1.0f}, n_samples, n_targets); + + dh::device_vector<RowIndexT> sorted_ridx(n_samples); + thrust::sequence(ctx.CUDACtx()->CTP(), sorted_ridx.begin(), sorted_ridx.end(), 0); + + auto quantizers = MakeDummyQuantizers(n_targets); + auto out_sum = linalg::Empty<GradientPairInt64>(&ctx, n_leaves, n_targets); + + LeafGradSum(&ctx, h_leaves, dh::ToSpan(quantizers), dh::ToSpan(sorted_ridx), + gpairs.View(ctx.Device()), out_sum.View(ctx.Device())); + + for (auto v : out_sum.HostView()) { + ASSERT_EQ(v.GetQuantisedGrad(), 3); + ASSERT_EQ(v.GetQuantisedHess(), 3); + } +} +} // namespace xgboost::tree::cuda_impl
From b40cc9097105902795695867815e45c92e375daf Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 02:11:52 +0800 Subject: [PATCH 18/24] Move files.
--- src/tree/{ => gpu_hist}/leaf_sum.cu | 16 ++++++++-------- src/tree/{ => gpu_hist}/leaf_sum.cuh | 12 ++++++------ src/tree/updater_gpu_hist.cuh | 2 +- tests/cpp/tree/{ => gpu_hist}/test_leaf_sum.cu | 10 +++++----- 4 files changed, 20 insertions(+), 20 deletions(-) rename src/tree/{ => gpu_hist}/leaf_sum.cu (86%) rename src/tree/{ => gpu_hist}/leaf_sum.cuh (71%) rename tests/cpp/tree/{ => gpu_hist}/test_leaf_sum.cu (82%)
diff --git a/src/tree/leaf_sum.cu b/src/tree/gpu_hist/leaf_sum.cu similarity index 86% rename from src/tree/leaf_sum.cu rename to src/tree/gpu_hist/leaf_sum.cu index a83ab076259c..01c776552e6f 100644 --- a/src/tree/leaf_sum.cu +++ b/src/tree/gpu_hist/leaf_sum.cu @@ -3,15 +3,15 @@ */ #include <vector> // for vector -#include "../common/linalg_op.cuh" // for tbegin -#include "gpu_hist/quantiser.cuh" // for GradientQuantiser -#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, LeafInfo +#include "../../common/linalg_op.cuh" // for tbegin +#include "../updater_gpu_common.cuh" // for GPUTrainingParam #include "leaf_sum.cuh" -#include "updater_gpu_common.cuh" // for GPUTrainingParam -#include "xgboost/base.h" // for GradientPairInt64 -#include "xgboost/context.h" // for Context -#include "xgboost/linalg.h" // for MatrixView -#include "xgboost/span.h" // for Span +#include "quantiser.cuh" // for GradientQuantiser +#include "row_partitioner.cuh" // for RowIndexT, LeafInfo +#include "xgboost/base.h" // for GradientPairInt64 +#include "xgboost/context.h" // for Context +#include "xgboost/linalg.h" // for MatrixView +#include "xgboost/span.h" // for Span namespace xgboost::tree::cuda_impl { void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves,
diff --git a/src/tree/leaf_sum.cuh b/src/tree/gpu_hist/leaf_sum.cuh similarity index 71% rename from src/tree/leaf_sum.cuh rename to src/tree/gpu_hist/leaf_sum.cuh index 4b880567f3f2..633eb746759b 100644 --- a/src/tree/leaf_sum.cuh +++ b/src/tree/gpu_hist/leaf_sum.cuh @@ -5,12 +5,12 @@ #include <vector> // for vector -#include "gpu_hist/quantiser.cuh" // for GradientQuantiser -#include "gpu_hist/row_partitioner.cuh" // for RowIndexT, LeafInfo -#include "updater_gpu_common.cuh" // for GPUTrainingParam -#include "xgboost/context.h" // for Context -#include "xgboost/linalg.h" // for MatrixView -#include "xgboost/span.h" // for Span +#include "../updater_gpu_common.cuh" // for GPUTrainingParam +#include "quantiser.cuh" // for GradientQuantiser +#include "row_partitioner.cuh" // for RowIndexT, LeafInfo +#include "xgboost/context.h" // for Context +#include "xgboost/linalg.h" // for MatrixView +#include "xgboost/span.h" // for Span namespace xgboost::tree::cuda_impl { // shape(out_sum) == (n_leaves, n_targets)
diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index d71709c6207d..0c63b312fbb0 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -11,10 +11,10 @@ #include "driver.h" // for Driver #include "gpu_hist/feature_groups.cuh" // for FeatureGroups #include "gpu_hist/histogram.cuh" // for DeviceHistogramBuilder +#include "gpu_hist/leaf_sum.cuh" // for LeafGradSum #include "gpu_hist/multi_evaluate_splits.cuh" // for MultiHistEvaluator #include "gpu_hist/row_partitioner.cuh" // for RowPartitioner #include "hist/hist_param.h" // for HistMakerTrainParam -#include "leaf_sum.cuh" // for LeafGradSum #include "tree_view.h" // for MultiTargetTreeView #include "xgboost/base.h" // for bst_idx_t #include "xgboost/context.h" // for Context
diff --git a/tests/cpp/tree/test_leaf_sum.cu
b/tests/cpp/tree/gpu_hist/test_leaf_sum.cu similarity index 82% rename from tests/cpp/tree/test_leaf_sum.cu rename to tests/cpp/tree/gpu_hist/test_leaf_sum.cu index cf9813ca2dec..2718741ce491 100644 --- a/tests/cpp/tree/test_leaf_sum.cu +++ b/tests/cpp/tree/gpu_hist/test_leaf_sum.cu @@ -7,11 +7,11 @@ #include <vector> // for vector -#include "../../../src/common/device_vector.cuh" -#include "../../../src/tree/gpu_hist/row_partitioner.cuh" // for LeafInfo -#include "../../../src/tree/leaf_sum.cuh" -#include "../helpers.h" -#include "gpu_hist/dummy_quantizer.cuh" // for MakeDummyQuantizers +#include "../../../../src/common/device_vector.cuh" +#include "../../../../src/tree/gpu_hist/leaf_sum.cuh" +#include "../../../../src/tree/gpu_hist/row_partitioner.cuh" // for LeafInfo +#include "../../helpers.h" +#include "dummy_quantizer.cuh" // for MakeDummyQuantizers namespace xgboost::tree::cuda_impl { TEST(LeafGradSum, Basic) {
From 02b7cb38d1bc842aa354ef2d17b984915ad91359 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 02:14:49 +0800 Subject: [PATCH 19/24] Notes. --- include/xgboost/multi_target_tree_model.h | 1 + src/tree/gpu_hist/leaf_sum.cu | 9 ++++++--- src/tree/gpu_hist/leaf_sum.cuh | 14 +++++++++++--- src/tree/gpu_hist/multi_evaluate_splits.cu | 1 + 4 files changed, 19 insertions(+), 6 deletions(-)
diff --git a/include/xgboost/multi_target_tree_model.h b/include/xgboost/multi_target_tree_model.h index 3f5645e46c43..ca0fa716284c 100644 --- a/include/xgboost/multi_target_tree_model.h +++ b/include/xgboost/multi_target_tree_model.h @@ -70,6 +70,7 @@ class MultiTargetTree : public Model { linalg::VectorView<float const> base_weight, linalg::VectorView<float const> left_weight, linalg::VectorView<float const> right_weight); + /** @see RegTree::SetLeaves */ void SetLeaves(std::vector<bst_node_t> leaves, common::Span<float const> weights); [[nodiscard]] bool IsLeaf(bst_node_t nidx) const {
diff --git a/src/tree/gpu_hist/leaf_sum.cu b/src/tree/gpu_hist/leaf_sum.cu index 01c776552e6f..7c57b9ae8731 100644 --- a/src/tree/gpu_hist/leaf_sum.cu +++ b/src/tree/gpu_hist/leaf_sum.cu @@ -1,7 +1,8 @@ /** * Copyright 2025, XGBoost contributors */ -#include <vector> // for vector +#include <cstddef> // for size_t +#include <vector> // for vector #include "../../common/linalg_op.cuh" // for tbegin #include "../updater_gpu_common.cuh" // for GPUTrainingParam @@ -38,16 +39,18 @@ void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves, auto d_indptr = dh::ToSpan(indptr); for (bst_target_t t = 0, n_targets = grad.Shape(1); t < n_targets; ++t) { - // TODO(jiamingy): Avoid additional allocation for d_sum auto out_t = out_sum.Slice(linalg::All(), t); // len == n_leaves - std::size_t n_bytes = 0; auto it = dh::MakeIndexTransformIter([=] XGBOOST_DEVICE(std::size_t i) { auto nidx_in_set = dh::SegmentId(d_indptr, i); + // Index within segment auto k = i - d_indptr[nidx_in_set]; + // Global index (within a batch).
auto j = d_leaves[nidx_in_set].node.segment.begin + k; + // gradient auto g = grad(sorted_ridx[j], t); return roundings[t].ToFixedPoint(g); }); + std::size_t n_bytes = 0; dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, it, linalg::tbegin(out_t), h_leaves.size(), indptr.data(), indptr.data() + 1, ctx->CUDACtx()->Stream()));
diff --git a/src/tree/gpu_hist/leaf_sum.cuh b/src/tree/gpu_hist/leaf_sum.cuh index 633eb746759b..18c305a89e70 100644 --- a/src/tree/gpu_hist/leaf_sum.cuh +++ b/src/tree/gpu_hist/leaf_sum.cuh @@ -13,15 +13,23 @@ #include "xgboost/span.h" // for Span namespace xgboost::tree::cuda_impl { -// shape(out_sum) == (n_leaves, n_targets) +/** + * @brief Calculate gradient sum for leaf nodes based on row partitions. + * + * shape(out_sum) == (n_leaves, n_targets) + */ void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves, common::Span<GradientQuantiser const> roundings, common::Span<RowIndexT const> sorted_ridx, linalg::MatrixView<GradientPair const> grad, linalg::MatrixView<GradientPairInt64> out_sum); -// shape(grad_sum) == (n_leaves, n_targets) -// shape(out_weights) == (n_leaves, n_targets) +/** + * @brief Calculate leaf weights from gradient sum. + * + * shape(grad_sum) == (n_leaves, n_targets) + * shape(out_weights) == (n_leaves, n_targets) + */ void LeafWeight(Context const* ctx, GPUTrainingParam const& param, common::Span<GradientQuantiser const> roundings, linalg::MatrixView<GradientPairInt64> grad_sum,
diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu index 22bc31118164..49c428445912 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cu +++ b/src/tree/gpu_hist/multi_evaluate_splits.cu @@ -286,6 +286,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, auto s_d_splits = dh::ToSpan(d_splits); // Process results for each node + // TODO(jiamingy): This is terribly slow as we are looping through all features in each thread. dh::LaunchN(n_nodes, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t nidx_in_set) { auto input = d_inputs[nidx_in_set];
From 287b50ecc203b02636fa0e2464d0af282c60d46e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 02:24:58 +0800 Subject: [PATCH 20/24] parallel. --- src/tree/gpu_hist/multi_evaluate_splits.cu | 27 +++++++++++----------- 1 file changed, 13 insertions(+), 14 deletions(-)
diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu index 49c428445912..976910b2cc3f 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cu +++ b/src/tree/gpu_hist/multi_evaluate_splits.cu @@ -286,7 +286,8 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, auto s_d_splits = dh::ToSpan(d_splits); // Process results for each node - // TODO(jiamingy): This is terribly slow as we are looping through all features in each thread. + // TODO(jiamingy): This is terribly slow as we are looping through all features in each thread. We + // need to split this into two kernels, one for reduction, another one for calculating weights.
dh::LaunchN(n_nodes, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t nidx_in_set) { auto input = d_inputs[nidx_in_set]; @@ -376,22 +377,20 @@ void MultiHistEvaluator::ApplyTreeSplit(Context const *ctx, RegTree const *p_tre auto right_sum = this->GetNodeSum(right_child, n_targets); // Calculate node sums - // TODO(jiamingy): We need to batch the targets and nodes + // TODO(jiamingy): We need to batch the nodes auto best_split = candidate.split; auto node_sum = best_split.node_sum; - dh::LaunchN(1, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t) { - for (bst_target_t t = 0; t < n_targets; ++t) { - auto sibling_sum = parent_sum[t] - node_sum[t]; - if (best_split.dir == kRightDir) { - // forward pass, node_sum is the left sum - left_sum[t] = node_sum[t]; - right_sum[t] = sibling_sum; - } else { - // backward pass, node_sum is the right sum - right_sum[t] = node_sum[t]; - left_sum[t] = sibling_sum; - } + dh::LaunchN(n_targets, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t t) { + auto sibling_sum = parent_sum[t] - node_sum[t]; + if (best_split.dir == kRightDir) { + // forward pass, node_sum is the left sum + left_sum[t] = node_sum[t]; + right_sum[t] = sibling_sum; + } else { + // backward pass, node_sum is the right sum + right_sum[t] = node_sum[t]; + left_sum[t] = sibling_sum; } }); }
From 18a629dc8c76ec55f988119c5007df44da2ffad4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 02:36:34 +0800 Subject: [PATCH 21/24] Optimize the reduction. --- src/tree/gpu_hist/multi_evaluate_splits.cu | 30 ++++++++++++---------- 1 file changed, 17 insertions(+), 13 deletions(-)
diff --git a/src/tree/gpu_hist/multi_evaluate_splits.cu b/src/tree/gpu_hist/multi_evaluate_splits.cu index 976910b2cc3f..457c4d6876a3 100644 --- a/src/tree/gpu_hist/multi_evaluate_splits.cu +++ b/src/tree/gpu_hist/multi_evaluate_splits.cu @@ -1,11 +1,12 @@ /** * Copyright 2025, XGBoost contributors */ +#include <thrust/reduce.h> // for reduce_by_key + #include <cub/block/block_scan.cuh> // for BlockScan #include <cub/util_type.cuh> // for KeyValuePair #include <cub/warp/warp_reduce.cuh> // for WarpReduce #include <vector> // for vector -#include "../../common/device_debug.cuh" #include "../../common/cuda_context.cuh" #include "../updater_gpu_common.cuh" // for SumCallbackOp @@ -286,20 +287,23 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx, auto s_d_splits = dh::ToSpan(d_splits); // Process results for each node - // TODO(jiamingy): This is terribly slow as we are looping through all features in each thread. We - // need to split this into two kernels, one for reduction, another one for calculating weights. + // Find best splits among all features for all nodes + auto key_it = dh::MakeIndexTransformIter([=] XGBOOST_DEVICE(std::size_t i) { + // Returns nidx_in_set + return i / n_features; + }); + dh::device_vector<MultiSplitCandidate> best_splits(out_splits.size()); + thrust::reduce_by_key( + ctx->CUDACtx()->CTP(), key_it, key_it + s_d_splits.size(), dh::tcbegin(s_d_splits), + thrust::make_discard_iterator(), best_splits.begin(), std::equal_to{}, + [=] XGBOOST_DEVICE(MultiSplitCandidate const &lhs, MultiSplitCandidate const &rhs) { + return lhs.loss_chg > rhs.loss_chg ?
lhs : rhs; }); + auto d_best_splits = dh::ToSpan(best_splits); + dh::LaunchN(n_nodes, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t nidx_in_set) { auto input = d_inputs[nidx_in_set]; - - // Find best split among all features for this node - MultiSplitCandidate best_split{}; - for (bst_feature_t f = 0; f < n_features; ++f) { - auto candidate = s_d_splits[nidx_in_set * n_features + f]; - if (candidate.loss_chg > best_split.loss_chg) { - best_split = candidate; - } - } - + MultiSplitCandidate best_split = d_best_splits[nidx_in_set]; if (best_split.node_sum.empty()) { // Invalid split out_splits[nidx_in_set] = {};
From 36c17ae0a7c8f8f797241f68430cae2de307df35 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 04:43:36 +0800 Subject: [PATCH 22/24] cleanup. --- src/tree/updater_gpu_hist.cu | 2 +- tests/cpp/gbm/test_gbtree.cc | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-)
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 0bd4791ccb3f..416fac6e7166 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -3,7 +3,7 @@ */ #include // for transform -#include <algorithm> // for max, none_of +#include <algorithm> // for max #include <cmath> // for isnan #include <cstdint> // for int32_t, uint32_t #include // for plus
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index c62b3deaf0d7..fde9bc3d602f 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -66,9 +66,7 @@ TEST(GBTree, PredictionCache) { gbtree.Configure({{"tree_method", "hist"}}); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - GradientContainer gpair; - gpair.gpair = linalg::Matrix<GradientPair>{{kRows}, ctx.Device()}; - gpair.gpair.Data()->Copy(GenerateRandomGradients(kRows)); + GradientContainer gpair = GenerateRandomGradients(&ctx, kRows, 1); PredictionCacheEntry out_predictions; gbtree.DoBoost(p_m.get(), &gpair, &out_predictions, nullptr); @@ -209,10 +207,9 @@ TEST(GBTree, ChooseTreeMethod) { learner->SetParam("device", d); } learner->Configure(); + Context ctx; for (std::int32_t i = 0; i < 3; ++i) { - GradientContainer gpair; - gpair.gpair = linalg::Matrix<GradientPair>{{Xy->Info().num_row_}, DeviceOrd::CPU()}; - gpair.gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); + GradientContainer gpair = GenerateRandomGradients(&ctx, Xy->Info().num_row_, 1); learner->BoostOneIter(0, Xy, &gpair); }
From 78915cec003d7d4f0777160c5e909c49fed58994 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Nov 2025 17:09:27 +0800 Subject: [PATCH 23/24] Small cleanups. --- include/xgboost/gbm.h | 4 ++-- include/xgboost/learner.h | 2 +- python-package/xgboost/objective.py | 6 +++--- src/tree/updater_gpu_hist.cuh | 5 +++-- 4 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index 61f4b1a62fe6..65940773ffee 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -79,8 +79,8 @@ class GradientBooster : public Model, public Configurable { * the booster may change content of gpair * @param obj The objective function used for boosting.
*/ - virtual void DoBoost(DMatrix* /*p_fmat*/, GradientContainer* /*in_gpair*/, - PredictionCacheEntry* /*prediction*/, ObjFunction const* /*obj*/) = 0; + virtual void DoBoost(DMatrix* p_fmat, GradientContainer* in_gpair, + PredictionCacheEntry* prediction, ObjFunction const* obj) = 0; /** * \brief Generate predictions for given feature matrix
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 5fab2ff2b574..ffaddfbe6442 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -54,7 +54,7 @@ enum class PredictionType : std::uint8_t { // NOLINT * The Load/Save function corresponds to the model used in python/R. * @code * - * std::unique_ptr<Learner> learner(new Learner::Create(cache_mats)); + * std::unique_ptr<Learner> learner{Learner::Create(cache_mats)}; * learner->Configure(configs); * * for (int iter = 0; iter < max_iter; ++iter) {
diff --git a/python-package/xgboost/objective.py b/python-package/xgboost/objective.py index 756d7d4b96eb..2a05e15339e3 100644 --- a/python-package/xgboost/objective.py +++ b/python-package/xgboost/objective.py @@ -19,7 +19,7 @@ class Objective(ABC): - """Base class for custom objective function. + """Base class for custom objective functions. .. warning:: @@ -34,7 +34,7 @@ def __call__( class TreeObjective(Objective): - """Base class for tree-specific custom objective function. + """Base class for tree-specific custom objective functions. .. warning:: @@ -45,5 +45,5 @@ def split_grad( self, grad: ArrayLike, hess: ArrayLike ) -> Tuple[ArrayLike, ArrayLike]: - """Provide different gradient type for finding tree structure.""" + """Provide a different gradient type for finding tree structures.""" return grad, hess
diff --git a/src/tree/updater_gpu_hist.cuh b/src/tree/updater_gpu_hist.cuh index 0c63b312fbb0..ed2139399919 100644 --- a/src/tree/updater_gpu_hist.cuh +++ b/src/tree/updater_gpu_hist.cuh @@ -208,6 +208,7 @@ class MultiTargetHistMaker { void UpdateTreeLeaf(linalg::Matrix<GradientPair> const& full_grad, RegTree* p_tree) const { // TODO(jiamingy): Need to iterate through partitioners for external memory support. + CHECK_EQ(this->partitioners_.size(), 1); auto leaves = this->partitioners_.front()->GetLeaves(); // Calculate the leaf weight based on the node sum for each leaf. // Update the leaf weight, with learning rate. @@ -217,8 +218,8 @@ class MultiTargetHistMaker { this->partitioners_.front()->GetRows(), full_grad.View(this->ctx_->Device()), out_sum.View(this->ctx_->Device())); auto param = GPUTrainingParam{this->param_}; - linalg::Matrix<float> out_weight = - linalg::Empty<float>(this->ctx_, leaves.size(), p_tree->NumTargets()); + auto out_weight = linalg::Empty<float>(this->ctx_, leaves.size(), p_tree->NumTargets()); + // Use full value gradient for leaf values. LeafWeight(this->ctx_, param, this->value_quantizer_->Quantizers(), out_sum.View(this->ctx_->Device()), out_weight.View(this->ctx_->Device())); std::vector<bst_node_t> leaves_idx(leaves.size());
From 573c946090576e6260b675b194f03e55ef67461b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 10 Nov 2025 01:53:58 +0800 Subject: [PATCH 24/24] note.
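The multi-target histogram kernel still serializes over targets inside each
thread once the compressed bin is located; the TODO added below records the
follow-up of assigning one thread per (bin, target) pair. A host-side sketch
of the two indexing schemes, with illustrative names (not the actual kernel
code):

    #include <cstddef>
    #include <vector>

    int main() {
      std::size_t n_bins = 8, n_targets = 4;
      std::vector<double> hist(n_bins * n_targets, 0.0);
      // Current scheme: one worker per bin, serial loop over targets.
      for (std::size_t bin = 0; bin < n_bins; ++bin) {
        for (std::size_t t = 0; t < n_targets; ++t) {
          hist[bin * n_targets + t] += 1.0;  // stand-in for AtomicAddGpairGlobal
        }
      }
      // Proposed scheme: flatten (bin, target) so a launch of
      // n_bins * n_targets workers needs no per-thread loop.
      for (std::size_t i = 0; i < n_bins * n_targets; ++i) {
        hist[i] += 1.0;
      }
      return 0;
    }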
--- src/tree/gpu_hist/histogram.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 8402bfbd8f0f..07c00c23b6b6 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -330,6 +330,7 @@ __global__ __launch_bounds__(kBlockThreads) void MultiHistKernel( } bst_target_t n_targets = roundings.size(); compressed_bin *= n_targets; + // TODO(jiamingy): Assign a thread for each target. for (bst_target_t t = 0; t < n_targets; ++t) { auto adjusted = roundings[t].ToFixedPoint(d_gpair(ridx, t)); AtomicAddGpairGlobal(d_node_hist + compressed_bin + t, adjusted);