
Commit 62a3a4b

[mt] Split up the tree model weights, support extmem. (#11814)
1 parent 0e9bc36

20 files changed: +429 -117 lines

include/xgboost/multi_target_tree_model.h

Lines changed: 47 additions & 12 deletions
@@ -25,6 +25,15 @@ struct TreeParam;

 /**
  * @brief Tree structure for multi-target model.
+ *
+ * In order to support reduced gradient, the internal storage distinguishes weights
+ * between base weights and leaf weights. The former is the weight calculated from split
+ * gradient, and the latter is the weight calculated from value gradient and used as
+ * outputs. Every node has a base weight, but only leaves have leaf weights.
+ *
+ * To access the leaf weights, we re-use the right child to store leaf indices. For split
+ * nodes, the `right_` member stores their right child node indices; for leaf nodes, the
+ * `right_` member stores the corresponding leaf weight indices.
  */
 class MultiTargetTree : public Model {
  public:
@@ -33,24 +42,36 @@ class MultiTargetTree : public Model {

  private:
   TreeParam const* param_;
+  // Mapping from node index to its left child. -1 for a leaf node.
   HostDeviceVector<bst_node_t> left_;
+  // Mapping from node index to its right child. Maps to leaf weight for a leaf node.
   HostDeviceVector<bst_node_t> right_;
+  // Mapping from node index to its parent.
   HostDeviceVector<bst_node_t> parent_;
+  // Feature index for node split.
   HostDeviceVector<bst_feature_t> split_index_;
+  // Whether the left child is the default node when split feature is missing.
   HostDeviceVector<std::uint8_t> default_left_;
+  // Threshold for splitting a node.
   HostDeviceVector<float> split_conds_;
+  // Internal base weights.
   HostDeviceVector<float> weights_;
+  // Output weights.
+  HostDeviceVector<float> leaf_weights_;

   [[nodiscard]] linalg::VectorView<float const> NodeWeight(bst_node_t nidx) const {
-    auto beg = nidx * this->NumTargets();
-    auto v = this->weights_.ConstHostSpan().subspan(beg, this->NumTargets());
+    auto beg = nidx * this->NumSplitTargets();
+    auto v = this->weights_.ConstHostSpan().subspan(beg, this->NumSplitTargets());
     return linalg::MakeTensorView(DeviceOrd::CPU(), v, v.size());
   }
-  [[nodiscard]] linalg::VectorView<float> NodeWeight(bst_node_t nidx) {
-    auto beg = nidx * this->NumTargets();
-    auto v = this->weights_.HostSpan().subspan(beg, this->NumTargets());
+  // Unlike the const version, `NumSplitTargets` is not reliable if the tree can change.
+  [[nodiscard]] linalg::VectorView<float> NodeWeight(bst_node_t nidx,
+                                                     bst_target_t n_split_targets) {
+    auto beg = nidx * n_split_targets;
+    auto v = this->weights_.HostSpan().subspan(beg, n_split_targets);
     return linalg::MakeTensorView(DeviceOrd::CPU(), v, v.size());
   }
+  [[nodiscard]] bst_node_t LeafIdx(bst_node_t nidx) const { return this->RightChild(nidx); }

  public:
   explicit MultiTargetTree(TreeParam const* param);
@@ -72,6 +93,8 @@ class MultiTargetTree : public Model {
                  linalg::VectorView<float const> right_weight);
   /** @see RegTree::SetLeaves */
   void SetLeaves(std::vector<bst_node_t> leaves, common::Span<float const> weights);
+  /** @brief Copy base weight into leaf weight for a non-reduced multi-target tree. */
+  void SetLeaves();

   [[nodiscard]] bool IsLeaf(bst_node_t nidx) const {
     return left_.ConstHostVector()[nidx] == InvalidNodeId();
@@ -82,24 +105,36 @@ class MultiTargetTree : public Model {
   [[nodiscard]] bst_node_t RightChild(bst_node_t nidx) const {
     return right_.ConstHostVector().at(nidx);
   }
-
+  /**
+   * @brief Number of targets (size of a leaf).
+   */
   [[nodiscard]] bst_target_t NumTargets() const;
-  [[nodiscard]] auto NumLeaves() const { return this->weights_.Size() / this->NumTargets(); }
+  /**
+   * @brief Number of reduced targets.
+   */
+  [[nodiscard]] bst_target_t NumSplitTargets() const;
+  [[nodiscard]] auto NumLeaves() const { return this->leaf_weights_.Size() / this->NumTargets(); }

   [[nodiscard]] std::size_t Size() const;
   [[nodiscard]] MultiTargetTree* Copy(TreeParam const* param) const;

-  common::Span<float const> Weights(DeviceOrd device) const {
+  common::Span<float const> LeafWeights(DeviceOrd device) const {
     if (device.IsCPU()) {
-      return this->weights_.ConstHostSpan();
+      return this->leaf_weights_.ConstHostSpan();
     }
-    this->weights_.SetDevice(device);
-    return this->weights_.ConstDeviceSpan();
+    this->leaf_weights_.SetDevice(device);
+    return this->leaf_weights_.ConstDeviceSpan();
   }

   [[nodiscard]] linalg::VectorView<float const> LeafValue(bst_node_t nidx) const {
     CHECK(IsLeaf(nidx));
-    return this->NodeWeight(nidx);
+    auto n_targets = this->NumTargets();
+    auto h_leaf_mapping = this->right_.ConstHostSpan();
+    auto h_leaf_weights = this->leaf_weights_.ConstHostSpan();
+    auto lidx = h_leaf_mapping[nidx];
+    CHECK_NE(lidx, InvalidNodeId());
+    auto weight = h_leaf_weights.subspan(lidx * n_targets, n_targets);
+    return linalg::MakeVec(DeviceOrd::CPU(), weight);
   }

   void LoadModel(Json const& in) override;
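
The weight layout described in the class comment can be summarized with a small standalone sketch. This is not XGBoost code: it uses plain std::vector and std::span stand-ins for HostDeviceVector and linalg::VectorView, and the type and member names are hypothetical, but it shows how the right-child slot doubles as the leaf-index mapping and why base weights and leaf weights can have different widths.

// Standalone sketch (hypothetical names, standard-library types only) of the storage
// layout: every node has a base weight of n_split_targets values, while only leaves
// have an output weight of n_targets values, reached through the re-used right slot.
#include <cassert>
#include <cstdint>
#include <span>
#include <vector>

struct ToyMultiTargetTree {
  static constexpr std::int32_t kInvalidNodeId = -1;

  std::vector<std::int32_t> left;    // child index, kInvalidNodeId for leaves
  std::vector<std::int32_t> right;   // child index for splits, leaf index for leaves
  std::vector<float> base_weights;   // n_split_targets values per node
  std::vector<float> leaf_weights;   // n_targets values per leaf
  std::int32_t n_targets{0};
  std::int32_t n_split_targets{0};

  bool IsLeaf(std::int32_t nidx) const { return left[nidx] == kInvalidNodeId; }

  // Base weight, computed from the (possibly reduced) split gradient.
  std::span<float const> NodeWeight(std::int32_t nidx) const {
    return std::span<float const>{base_weights}.subspan(nidx * n_split_targets,
                                                        n_split_targets);
  }

  // Output weight, computed from the value gradient; the right slot holds the index
  // into leaf_weights.
  std::span<float const> LeafValue(std::int32_t nidx) const {
    assert(IsLeaf(nidx));
    auto lidx = right[nidx];
    return std::span<float const>{leaf_weights}.subspan(lidx * n_targets, n_targets);
  }
};

With a reduced-gradient objective, n_split_targets can be smaller than n_targets, which is why NumLeaves() is now derived from leaf_weights_ rather than from the base weight buffer.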

include/xgboost/tree_model.h

Lines changed: 2 additions & 0 deletions
@@ -408,6 +408,8 @@ class RegTree : public Model {
   [[nodiscard]] bst_node_t GetDepth(bst_node_t nidx) const;
   /**
    * @brief Set the root weight for a multi-target tree.
+   *
+   * @param weight Internal split weight, with size equal to the number of reduced targets.
    */
   void SetRoot(linalg::VectorView<float const> weight) {
     CHECK(IsMultiTarget());

python-package/xgboost/testing/data.py

Lines changed: 3 additions & 5 deletions
@@ -30,6 +30,7 @@
 from numpy.random import Generator as RNG
 from scipy import sparse

+from ..compat import concat
 from ..core import DataIter, DMatrix, QuantileDMatrix
 from ..data import is_pd_cat_dtype, pandas_pyarrow_mapper
 from ..sklearn import ArrayLike, XGBRanker
@@ -1150,11 +1151,8 @@ def as_arrays(
         self,
     ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]:
         """Return concatenated arrays."""
-        if isinstance(self.X[0], sparse.csr_matrix):
-            X = sparse.vstack(self.X, format="csr")
-        else:
-            X = np.concatenate(self.X, axis=0)
-        y = np.concatenate(self.y, axis=0)
+        X = concat(self.X)
+        y = concat(self.y)
         if self.w:
             w = np.concatenate(self.w, axis=0)
         else:

python-package/xgboost/testing/multi_target.py

Lines changed: 117 additions & 25 deletions
@@ -13,12 +13,14 @@
 import xgboost.testing as tm

 from .._typing import ArrayLike
-from ..core import Booster, DMatrix, QuantileDMatrix
+from ..compat import import_cupy
+from ..core import Booster, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
 from ..objective import Objective, TreeObjective
 from ..sklearn import XGBClassifier
 from ..training import train
+from .data import IteratorForTest
 from .updater import ResetStrategy
-from .utils import Device
+from .utils import Device, assert_allclose


 def run_multiclass(device: Device, learning_rate: Optional[float]) -> None:
@@ -64,34 +66,42 @@ def run_multilabel(device: Device, learning_rate: Optional[float]) -> None:
     assert proba.shape == y.shape


-def run_reduced_grad(device: Device) -> None:
-    """Basic test for using reduced gradient for tree splits."""
-    import cupy as cp
+class LsObj0(TreeObjective):
+    """Split grad is the same as value grad."""

-    class LsObj0(TreeObjective):
-        """Split grad is the same as value grad."""
+    def __call__(
+        self, y_pred: ArrayLike, dtrain: DMatrix
+    ) -> Tuple[ArrayLike, ArrayLike]:
+        cp = import_cupy()

-        def __call__(
-            self, y_pred: ArrayLike, dtrain: DMatrix
-        ) -> Tuple[cp.ndarray, cp.ndarray]:
-            y_true = dtrain.get_label().reshape(y_pred.shape)
-            grad, hess = tm.ls_obj(y_true, y_pred, None)
-            return cp.array(grad), cp.array(hess)
+        y_true = dtrain.get_label().reshape(y_pred.shape)
+        grad, hess = tm.ls_obj(y_true, y_pred, None)
+        return cp.array(grad), cp.array(hess)

-        def split_grad(
-            self, grad: ArrayLike, hess: ArrayLike
-        ) -> Tuple[ArrayLike, ArrayLike]:
-            return cp.array(grad), cp.array(hess)
+    def split_grad(
+        self, grad: ArrayLike, hess: ArrayLike
+    ) -> Tuple[ArrayLike, ArrayLike]:
+        cp = import_cupy()

-    class LsObj1(Objective):
-        """No split grad."""
+        return cp.array(grad), cp.array(hess)

-        def __call__(
-            self, y_pred: ArrayLike, dtrain: DMatrix
-        ) -> Tuple[cp.ndarray, cp.ndarray]:
-            y_true = dtrain.get_label().reshape(y_pred.shape)
-            grad, hess = tm.ls_obj(y_true, y_pred, None)
-            return cp.array(grad), cp.array(hess)
+
+class LsObj1(Objective):
+    """No split grad."""
+
+    def __call__(
+        self, y_pred: ArrayLike, dtrain: DMatrix
+    ) -> Tuple[ArrayLike, ArrayLike]:
+        cp = import_cupy()
+
+        y_true = dtrain.get_label().reshape(y_pred.shape)
+        grad, hess = tm.ls_obj(y_true, y_pred, None)
+        return cp.array(grad), cp.array(hess)
+
+
+def run_reduced_grad(device: Device) -> None:
+    """Basic test for using reduced gradient for tree splits."""
+    import cupy as cp

     X, y = make_regression(  # pylint: disable=unbalanced-tuple-unpacking
         n_samples=1024, n_features=16, random_state=1994, n_targets=5
@@ -149,3 +159,85 @@ def split_grad(
     run_test(LsObj2(False))
     with pytest.raises(AssertionError):
         run_test(LsObj2(True))
+
+
+def run_with_iter(device: Device) -> None:  # pylint: disable=too-many-locals
+    """Test vector leaf with external memory."""
+    if device == "cuda":
+        from cupy import asarray
+    else:
+        from numpy import asarray
+
+    n_batches = 4
+    n_rounds = 8
+    n_targets = 3
+    intercept = [0.5] * n_targets
+    Xs = []
+    ys = []
+    for i in range(n_batches):
+        X_i, y_i = make_regression(  # pylint: disable=unbalanced-tuple-unpacking
+            n_samples=4096, n_features=8, random_state=(i + 1), n_targets=n_targets
+        )
+        Xs.append(asarray(X_i))
+        ys.append(asarray(y_i))
+    it = IteratorForTest(Xs, ys, None, cache="cache", on_host=True)
+    Xy: DMatrix = ExtMemQuantileDMatrix(it, cache_host_ratio=1.0)
+
+    evals_result_0: Dict[str, Dict] = {}
+    booster_0 = train(
+        {
+            "device": device,
+            "multi_strategy": "multi_output_tree",
+            "learning_rate": 1.0,
+            "base_score": intercept,
+        },
+        Xy,
+        num_boost_round=n_rounds,
+        evals=[(Xy, "Train")],
+        evals_result=evals_result_0,
+    )
+
+    it = IteratorForTest(Xs, ys, None, cache=None)
+    Xy = QuantileDMatrix(it)
+    evals_result_1: Dict[str, Dict] = {}
+    booster_1 = train(
+        {
+            "device": device,
+            "multi_strategy": "multi_output_tree",
+            "learning_rate": 1.0,
+            "base_score": intercept,
+        },
+        Xy,
+        num_boost_round=n_rounds,
+        evals=[(Xy, "Train")],
+        evals_result=evals_result_1,
+    )
+    np.testing.assert_allclose(
+        evals_result_0["Train"]["rmse"], evals_result_1["Train"]["rmse"]
+    )
+    assert tm.non_increasing(evals_result_0["Train"]["rmse"])
+    X, _, _ = it.as_arrays()
+    assert_allclose(device, booster_0.inplace_predict(X), booster_1.inplace_predict(X))
+
+    it = IteratorForTest(Xs, ys, None, cache="cache", on_host=True)
+    Xy = ExtMemQuantileDMatrix(it, cache_host_ratio=1.0)
+
+    evals_result_2: Dict[str, Dict] = {}
+    booster_2 = train(
+        {
+            "device": device,
+            "multi_strategy": "multi_output_tree",
+            "learning_rate": 1.0,
+            "base_score": intercept,
+            "debug_synchronize": True,
+        },
+        Xy,
+        evals=[(Xy, "Train")],
+        obj=LsObj0(),
+        num_boost_round=n_rounds,
+        evals_result=evals_result_2,
+    )
+    np.testing.assert_allclose(
+        evals_result_0["Train"]["rmse"], evals_result_2["Train"]["rmse"]
+    )
+    assert_allclose(device, booster_0.inplace_predict(X), booster_2.inplace_predict(X))

src/common/cuda_rt_utils.cc

Lines changed: 8 additions & 0 deletions
@@ -3,6 +3,8 @@
  */
 #include "cuda_rt_utils.h"

+#include "cuda_stream.h"  // for StreamRef
+
 #if defined(XGBOOST_USE_CUDA)
 #include <cuda_runtime_api.h>

@@ -99,6 +101,10 @@ void GetDrVersionGlobal(std::int32_t* major, std::int32_t* minor) {
   return numa_id;
 }

+void MemcpyAsync(void* dst, const void* src, std::size_t count, StreamRef stream) {
+  dh::safe_cuda(cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream));
+}
+
 #else
 std::int32_t AllVisibleGPUs() { return 0; }

@@ -128,5 +134,7 @@ void SetDevice(std::int32_t device) {
   return 0;
 }

+void MemcpyAsync(void*, const void*, std::size_t, StreamRef) { common::AssertGPUSupport(); }
+
 #endif  // !defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost::curt

src/common/cuda_rt_utils.h

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,8 @@
 #include <cstddef>  // for size_t
 #include <cstdint>  // for int32_t

+#include "cuda_stream.h"  // for StreamRef
+
 namespace xgboost::curt {
 std::int32_t AllVisibleGPUs();

@@ -35,4 +37,7 @@ void GetDrVersionGlobal(std::int32_t* major, std::int32_t* minor);

 // Get the current device's numa ID.
 [[nodiscard]] std::int32_t GetNumaId();
+
+// cudaMemcpyAsync
+void MemcpyAsync(void* dst, const void* src, std::size_t count, StreamRef stream);
 }  // namespace xgboost::curt
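
The new wrapper forwards to cudaMemcpyAsync with cudaMemcpyDefault, so the copy direction is inferred from the pointer kinds. Below is a minimal usage sketch, assuming a CUDA build, that d_out is a valid device allocation of at least h_in.size() floats, and that DefaultStream() is the stream helper declared in cuda_stream.h; the function and variable names are made up for illustration.

#include <cstddef>  // for size_t
#include <vector>

#include "cuda_rt_utils.h"  // for MemcpyAsync
#include "cuda_stream.h"    // for DefaultStream

namespace curt = xgboost::curt;

// Enqueue a host-to-device copy on the default stream; cudaMemcpyDefault lets the
// runtime work out the direction from the pointers themselves.
void CopyToDevice(std::vector<float> const& h_in, float* d_out) {
  curt::MemcpyAsync(d_out, h_in.data(), h_in.size() * sizeof(float), curt::DefaultStream());
}

On a CPU-only build the same call still compiles thanks to the StreamRef stub in cuda_stream.h, but MemcpyAsync asserts at runtime through common::AssertGPUSupport().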

src/common/cuda_stream.h

Lines changed: 12 additions & 0 deletions
@@ -2,14 +2,18 @@
  * Copyright 2022-2025, XGBoost contributors
  */
 #pragma once
+
+#if defined(XGBOOST_USE_CUDA)
 #include <cuda_runtime.h>
+#endif  // defined(XGBOOST_USE_CUDA)

 #include <memory>   // for unique_ptr
 #include <utility>  // for swap

 #include "common.h"

 namespace xgboost::curt {
+#if defined(XGBOOST_USE_CUDA)
 class StreamRef;

 class Event {
@@ -94,4 +98,12 @@ class Stream {
   void Sync() { this->View().Sync(); }
   void Wait(Event const &e) { this->View().Wait(e); }
 };
+#else
+class StreamRef {};
+
+inline StreamRef DefaultStream() {
+  common::AssertGPUSupport();
+  return StreamRef{};
+}
+#endif
 }  // namespace xgboost::curt
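
The #else branch above is what lets CPU-only builds compile against stream-taking signatures such as MemcpyAsync: StreamRef becomes an empty type and DefaultStream() only fails at runtime via common::AssertGPUSupport(). A hypothetical, self-contained illustration of the same stub pattern follows; none of these names are from XGBoost.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

#if defined(USE_GPU)
// In a real GPU build this would wrap an actual stream handle.
struct StreamHandle {
  void* raw{nullptr};
};
inline StreamHandle DefaultStreamHandle() { return {}; }
inline void CopyAsync(void* dst, void const* src, std::size_t n, StreamHandle) {
  // ... enqueue the asynchronous copy on the stream ...
  (void)dst; (void)src; (void)n;
}
#else
// CPU-only build: the empty stub keeps every call site compiling; misuse is reported
// at runtime instead of turning each caller into an #ifdef.
struct StreamHandle {};
inline StreamHandle DefaultStreamHandle() {
  std::fprintf(stderr, "not built with GPU support\n");
  std::abort();
}
inline void CopyAsync(void*, void const*, std::size_t, StreamHandle) {
  std::fprintf(stderr, "not built with GPU support\n");
  std::abort();
}
#endif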
