Commit 64c8eef

Merge branch 'master' into fix-r-checks
2 parents: 88d18b9 + 36d2b42

8 files changed: +148 additions, -72 deletions


.github/workflows/jvm_tests.yml

Lines changed: 2 additions & 2 deletions
@@ -149,8 +149,8 @@ jobs:
       uses: actions/cache@v4
       with:
         path: ~/.m2
-        key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
-        restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }}
+        key: ${{ runner.os }}-m2-${{ hashFiles('/jvm-packages/pom.xml') }}
+        restore-keys: ${{ runner.os }}-m2-${{ hashFiles('/jvm-packages/pom.xml') }}
     - name: Test XGBoost4J (Core) on macos
       if: matrix.os == 'macos-15-intel'
       run: |

python-package/xgboost/testing/multi_target.py

Lines changed: 36 additions & 3 deletions
@@ -1,5 +1,6 @@
 """Tests for multi-target training."""
 
+# pylint: disable=unbalanced-tuple-unpacking
 from typing import Dict, Optional, Tuple
 
 import numpy as np
@@ -29,6 +30,7 @@ def run_multiclass(device: Device, learning_rate: Optional[float]) -> None:
         128, n_features=12, n_informative=10, n_classes=4, random_state=2025
     )
     clf = XGBClassifier(
+        debug_synchronize=True,
         multi_strategy="multi_output_tree",
         callbacks=[ResetStrategy()],
         n_estimators=10,
@@ -47,9 +49,9 @@ def run_multiclass(device: Device, learning_rate: Optional[float]) -> None:
 
 def run_multilabel(device: Device, learning_rate: Optional[float]) -> None:
     """Use vector leaf for multi-label classification models."""
-    # pylint: disable=unbalanced-tuple-unpacking
     X, y = make_multilabel_classification(128, random_state=2025)
     clf = XGBClassifier(
+        debug_synchronize=True,
         multi_strategy="multi_output_tree",
         callbacks=[ResetStrategy()],
         n_estimators=10,
@@ -103,7 +105,7 @@ def run_reduced_grad(device: Device) -> None:
     """Basic test for using reduced gradient for tree splits."""
     import cupy as cp
 
-    X, y = make_regression(  # pylint: disable=unbalanced-tuple-unpacking
+    X, y = make_regression(
         n_samples=1024, n_features=16, random_state=1994, n_targets=5
     )
     Xy = QuantileDMatrix(X, y)
@@ -114,6 +116,7 @@ def run_test(
     evals_result: Dict[str, Dict] = {}
     booster = train(
         {
+            "debug_synchronize": True,
             "device": device,
             "multi_strategy": "multi_output_tree",
             "learning_rate": 1,
@@ -184,7 +187,7 @@ def run_with_iter(device: Device) -> None:  # pylint: disable=too-many-locals
     Xs = []
     ys = []
     for i in range(n_batches):
-        X_i, y_i = make_regression(  # pylint: disable=unbalanced-tuple-unpacking
+        X_i, y_i = make_regression(
             n_samples=4096, n_features=8, random_state=(i + 1), n_targets=n_targets
         )
         Xs.append(asarray(X_i))
@@ -245,3 +248,33 @@ def run_with_iter(device: Device) -> None:  # pylint: disable=too-many-locals
         evals_result_0["Train"]["rmse"], evals_result_2["Train"]["rmse"]
     )
     assert_allclose(device, booster_0.inplace_predict(X), booster_2.inplace_predict(X))
+
+
+def run_eta(device: Device) -> None:
+    """Test for learning rate."""
+    X, y = make_regression(512, 16, random_state=2025, n_targets=3)
+
+    def run(obj: Optional[Objective]) -> None:
+        params = {
+            "device": device,
+            "multi_strategy": "multi_output_tree",
+            "learning_rate": 1.0,
+            "debug_synchronize": True,
+            "base_score": 0.0,
+        }
+        Xy = QuantileDMatrix(X, y)
+        booster_0 = train(params, Xy, num_boost_round=1, obj=obj)
+        params["learning_rate"] = 0.1
+        booster_1 = train(params, Xy, num_boost_round=1, obj=obj)
+        params["learning_rate"] = 2.0
+        booster_2 = train(params, Xy, num_boost_round=1, obj=obj)
+
+        predt_0 = booster_0.predict(Xy)
+        predt_1 = booster_1.predict(Xy)
+        predt_2 = booster_2.predict(Xy)
+
+        np.testing.assert_allclose(predt_0, predt_1 * 10, rtol=1e-6)
+        np.testing.assert_allclose(predt_0 * 2, predt_2, rtol=1e-6)
+
+    run(None)
+    run(LsObj0())
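
The new run_eta helper pins down the learning-rate semantics that the C++ changes below implement: with a single boosting round and base_score=0, a prediction is just the leaf value reached by the row, and that leaf value is shrunk by the learning rate, so predictions scale linearly with eta. A minimal numpy sketch of that invariant (the leaf values here are made up, not produced by XGBoost):

import numpy as np

# Hypothetical unshrunk per-target leaf value for one row; with one boosting
# round and base_score = 0, the prediction is eta times this value.
unshrunk_leaf = np.array([0.7, -1.2, 0.3])

predt_0 = unshrunk_leaf * 1.0  # learning_rate = 1.0
predt_1 = unshrunk_leaf * 0.1  # learning_rate = 0.1
predt_2 = unshrunk_leaf * 2.0  # learning_rate = 2.0

# The same relations run_eta asserts on real boosters.
np.testing.assert_allclose(predt_0, predt_1 * 10, rtol=1e-6)
np.testing.assert_allclose(predt_0 * 2, predt_2, rtol=1e-6)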

src/tree/gpu_hist/expand_entry.cuh

Lines changed: 1 addition & 1 deletion
@@ -132,7 +132,7 @@ struct MultiExpandEntry {
   bst_node_t depth{0};
   MultiSplitCandidate split;
 
-  common::Span<float const> base_weight;
+  common::Span<float> base_weight;
   common::Span<float const> left_weight;
   common::Span<float const> right_weight;
 

src/tree/gpu_hist/leaf_sum.cu

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ void LeafWeight(Context const* ctx, GPUTrainingParam const& param,
   dh::LaunchN(grad_sum.Size(), ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t i) mutable {
     auto [nidx_in_set, t] = linalg::UnravelIndex(i, grad_sum.Shape());
     auto g = roundings[t].ToFloatingPoint(grad_sum(nidx_in_set, t));
-    out_weights(nidx_in_set, t) = CalcWeight(param, g.GetGrad(), g.GetHess());
+    out_weights(nidx_in_set, t) = CalcWeight(param, g.GetGrad(), g.GetHess()) * param.learning_rate;
   });
 }
 }  // namespace xgboost::tree::cuda_impl
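
This hunk folds the learning rate into the vector-leaf weights themselves. A plain-numpy sketch of what the kernel computes per (leaf, target) pair, under the assumption that CalcWeight is the usual Newton step -G / (H + lambda) (the real CalcWeight also handles regularization details such as L1, omitted here):

import numpy as np

def leaf_weights(grad_sum, hess_sum, lam, learning_rate):
    # grad_sum and hess_sum have shape [n_leaves, n_targets]; the CUDA kernel
    # walks the same grid with one thread per flattened (leaf, target) index.
    return -grad_sum / (hess_sum + lam) * learning_rate

G = np.array([[1.0, -2.0], [0.5, 0.0]])
H = np.array([[3.0, 4.0], [2.0, 1.0]])
w = leaf_weights(G, H, lam=1.0, learning_rate=0.3)
# Doubling the learning rate doubles every leaf weight.
np.testing.assert_allclose(leaf_weights(G, H, 1.0, 0.6), 2 * w)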

src/tree/gpu_hist/multi_evaluate_splits.cu

Lines changed: 61 additions & 33 deletions
@@ -1,14 +1,15 @@
 /**
  * Copyright 2025, XGBoost contributors
  */
-#include <thrust/reduce.h>  // for reduce_by_key
+#include <thrust/reduce.h>  // for reduce_by_key, reduce
 
 #include <cub/block/block_scan.cuh>  // for BlockScan
 #include <cub/util_type.cuh>         // for KeyValuePair
 #include <cub/warp/warp_reduce.cuh>  // for WarpReduce
 #include <vector>                    // for vector
 
 #include "../../common/cuda_context.cuh"
+#include "../tree_view.h"             // for MultiTargetTreeView
 #include "../updater_gpu_common.cuh"  // for SumCallbackOp
 #include "multi_evaluate_splits.cuh"  // for MultiEvalauteSplitInputs, MultiEvaluateSplitSharedInputs
 #include "quantiser.cuh"              // for GradientQuantiser
@@ -221,7 +222,18 @@ __global__ __launch_bounds__(kBlockThreads) void EvaluateSplitsKernel(
   dh::device_vector<MultiEvaluateSplitInputs> inputs{input};
   dh::device_vector<MultiExpandEntry> outputs(1);
 
-  this->EvaluateSplits(ctx, dh::ToSpan(inputs), shared_inputs, dh::ToSpan(outputs));
+  auto d_outputs = dh::ToSpan(outputs);
+  this->EvaluateSplits(ctx, dh::ToSpan(inputs), shared_inputs, d_outputs);
+
+  auto n_targets = shared_inputs.Targets();
+  dh::LaunchN(n_targets, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t t) {
+    auto weight = d_outputs[0].base_weight;
+    if (weight.empty()) {
+      return;
+    }
+    weight[t] *= shared_inputs.param.learning_rate;
+  });
+
   return outputs[0];
 }
 
@@ -326,6 +338,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
 
   bool l = true, r = true;
   GradientPairPrecise lg_fst, rg_fst;
+  auto eta = shared_inputs.param.learning_rate;
   for (bst_target_t t = 0; t < n_targets; ++t) {
     auto quantizer = d_roundings[t];
     auto sibling_sum = input.parent_sum[t] - node_sum[t];
@@ -337,15 +350,15 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
     if (best_split.dir == kRightDir) {
       // forward pass, node_sum is the left sum
       lg = quantizer.ToFloatingPoint(node_sum[t]);
-      left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess());
+      left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess()) * eta;
       rg = quantizer.ToFloatingPoint(sibling_sum);
-      right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess());
+      right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess()) * eta;
     } else {
       // backward pass, node_sum is the right sum
       rg = quantizer.ToFloatingPoint(node_sum[t]);
-      right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess());
+      right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess()) * eta;
       lg = quantizer.ToFloatingPoint(sibling_sum);
-      left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess());
+      left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess()) * eta;
     }
 
     if (t == 0) {
@@ -367,35 +380,50 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
 }
 
 void MultiHistEvaluator::ApplyTreeSplit(Context const *ctx, RegTree const *p_tree,
-                                        MultiExpandEntry const &candidate) {
-  auto left_child = p_tree->LeftChild(candidate.nidx);
-  auto right_child = p_tree->RightChild(candidate.nidx);
-  bst_node_t max_node = std::max(left_child, right_child);
-  auto n_targets = candidate.base_weight.size();
-
+                                        common::Span<MultiExpandEntry const> d_candidates,
+                                        bst_target_t n_targets) {
+  // Assign the node sums here, for the next evaluate split call.
+  auto mt_tree = MultiTargetTreeView{ctx->Device(), p_tree};
+  auto max_in_it = dh::MakeIndexTransformIter([=] __device__(std::size_t i) -> bst_node_t {
+    return std::max(mt_tree.LeftChild(d_candidates[i].nidx),
+                    mt_tree.RightChild(d_candidates[i].nidx));
+  });
+  auto max_node = thrust::reduce(
+      ctx->CUDACtx()->CTP(), max_in_it, max_in_it + d_candidates.size(), 0,
+      [=] XGBOOST_DEVICE(bst_node_t l, bst_node_t r) { return cuda::std::max(l, r); });
   this->AllocNodeSum(max_node, n_targets);
 
-  auto parent_sum = this->GetNodeSum(candidate.nidx, n_targets);
-  auto left_sum = this->GetNodeSum(left_child, n_targets);
-  auto right_sum = this->GetNodeSum(right_child, n_targets);
-
-  // Calculate node sums
-  // TODO(jiamingy): We need to batch the nodes
-  auto best_split = candidate.split;
-
-  auto node_sum = best_split.child_sum;
-  dh::LaunchN(n_targets, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t t) {
-    auto sibling_sum = parent_sum[t] - node_sum[t];
-    if (best_split.dir == kRightDir) {
-      // forward pass, node_sum is the left sum
-      left_sum[t] = node_sum[t];
-      right_sum[t] = sibling_sum;
-    } else {
-      // backward pass, node_sum is the right sum
-      right_sum[t] = node_sum[t];
-      left_sum[t] = sibling_sum;
-    }
-  });
+  auto node_sums = dh::ToSpan(this->node_sums_);
+
+  dh::LaunchN(n_targets * d_candidates.size(), ctx->CUDACtx()->Stream(),
+              [=] XGBOOST_DEVICE(std::size_t i) {
+                auto get_node_sum = [&](bst_node_t nidx) {
+                  return GetNodeSumImpl(node_sums, nidx, n_targets);
+                };
+                auto nidx_in_set = i / n_targets;
+                auto t = i % n_targets;
+
+                auto const &candidate = d_candidates[nidx_in_set];
+                auto const &best_split = candidate.split;
+
+                auto parent_sum = get_node_sum(candidate.nidx);
+                // The child sum is a pointer to the scan buffer in this evaluator. Copy
+                // the data into the node sum buffer before the next evaluation call.
+                auto node_sum = best_split.child_sum;
+                auto left_sum = get_node_sum(mt_tree.LeftChild(candidate.nidx));
+                auto right_sum = get_node_sum(mt_tree.RightChild(candidate.nidx));
+
+                auto sibling_sum = parent_sum[t] - node_sum[t];
+                if (best_split.dir == kRightDir) {
+                  // forward pass, node_sum is the left sum
+                  left_sum[t] = node_sum[t];
+                  right_sum[t] = sibling_sum;
+                } else {
+                  // backward pass, node_sum is the right sum
+                  right_sum[t] = node_sum[t];
+                  left_sum[t] = sibling_sum;
+                }
+              });
 }
 
 std::ostream &DebugPrintHistogram(std::ostream &os, common::Span<GradientPairInt64 const> node_hist,
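
Besides scaling the candidate weights by eta, ApplyTreeSplit now processes a whole batch of candidates in one kernel launch, mapping the flat thread index with nidx_in_set = i / n_targets and t = i % n_targets. A host-side numpy sketch of the sibling-sum bookkeeping (names are illustrative; the real code works on quantised GradientPairInt64 sums in a flat device buffer):

import numpy as np

K_RIGHT_DIR = "right"  # stands in for kRightDir (forward scan)

def apply_tree_split(node_sums, candidates):
    # node_sums: [n_nodes, n_targets]; each candidate carries the scanned child sum.
    for cand in candidates:
        parent = node_sums[cand["nidx"]]
        child = cand["child_sum"]
        sibling = parent - child
        if cand["dir"] == K_RIGHT_DIR:  # forward pass: child sum is the left sum
            node_sums[cand["left"]], node_sums[cand["right"]] = child, sibling
        else:                           # backward pass: child sum is the right sum
            node_sums[cand["right"]], node_sums[cand["left"]] = child, sibling

n_targets = 2
sums = np.zeros((3, n_targets))
sums[0] = [4.0, -1.0]  # root gradient sums
apply_tree_split(sums, [{"nidx": 0, "left": 1, "right": 2, "dir": K_RIGHT_DIR,
                         "child_sum": np.array([1.5, -0.25])}])
np.testing.assert_allclose(sums[1] + sums[2], sums[0])  # children partition the parent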

src/tree/gpu_hist/multi_evaluate_splits.cuh

Lines changed: 11 additions & 5 deletions
@@ -20,6 +20,13 @@ class MultiHistEvaluator {
   dh::device_vector<GradientPairInt64> node_sums_;
 
  public:
+  template <typename GradT>
+  static XGBOOST_DEVICE common::Span<GradT> GetNodeSumImpl(common::Span<GradT> node_sums,
+                                                           bst_node_t nidx,
+                                                           bst_target_t n_targets) {
+    auto offset = nidx * n_targets;
+    return node_sums.subspan(offset, n_targets);
+  }
   /**
    * @brief Run evaluation for the root node.
    */
@@ -41,17 +48,16 @@ class MultiHistEvaluator {
   }
   [[nodiscard]] common::Span<GradientPairInt64> GetNodeSum(bst_node_t nidx,
                                                            bst_target_t n_targets) {
-    auto offset = nidx * n_targets;
-    return dh::ToSpan(this->node_sums_).subspan(offset, n_targets);
+    return GetNodeSumImpl(dh::ToSpan(this->node_sums_), nidx, n_targets);
   }
   [[nodiscard]] common::Span<GradientPairInt64 const> GetNodeSum(bst_node_t nidx,
                                                                  bst_target_t n_targets) const {
-    auto offset = nidx * n_targets;
-    return dh::ToSpan(this->node_sums_).subspan(offset, n_targets);
+    return GetNodeSumImpl(dh::ToSpan(this->node_sums_), nidx, n_targets);
   }
 
   // Track the child gradient sum.
-  void ApplyTreeSplit(Context const *ctx, RegTree const *p_tree, MultiExpandEntry const &candidate);
+  void ApplyTreeSplit(Context const *ctx, RegTree const *p_tree,
+                      common::Span<MultiExpandEntry const> d_candidates, bst_target_t n_targets);
 };
 
 std::ostream &DebugPrintHistogram(std::ostream &os, common::Span<GradientPairInt64 const> node_hist,
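
GetNodeSumImpl makes the flat node-sum layout explicit so that the host accessors and the device kernel in ApplyTreeSplit can share it: one buffer of n_nodes * n_targets gradient sums, with node nidx owning the contiguous slice starting at nidx * n_targets. A small Python sketch of the same slicing:

import numpy as np

def get_node_sum(node_sums, nidx, n_targets):
    # Mirrors GetNodeSumImpl: a view over the node's n_targets entries,
    # analogous to common::Span::subspan(offset, n_targets).
    offset = nidx * n_targets
    return node_sums[offset:offset + n_targets]

n_nodes, n_targets = 5, 3
buf = np.arange(n_nodes * n_targets, dtype=float)
assert np.array_equal(get_node_sum(buf, 2, n_targets), buf[6:9])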

src/tree/updater_gpu_hist.cuh

Lines changed: 21 additions & 21 deletions
@@ -187,28 +187,31 @@ class MultiTargetHistMaker {
                                                  this->param_.max_bin,
                                                  param};
     auto entry = this->evaluator_.EvaluateSingleSplit(ctx_, input, shared_inputs);
-
-    // TODO(jiamingy): Support learning rate.
     p_tree->SetRoot(linalg::MakeVec(this->ctx_->Device(), entry.base_weight));
 
     return entry;
   }
 
-  void ApplySplit(MultiExpandEntry const& candidate, RegTree* p_tree) {
-    // TODO(jiamingy): Support learning rate.
+  void ApplySplit(std::vector<MultiExpandEntry> const& h_candidates, RegTree* p_tree) {
+    CHECK(!h_candidates.empty());
+    auto n_targets = h_candidates.front().base_weight.size();
+
     // TODO(jiamingy): Avoid device to host copies.
-    std::vector<float> h_base_weight(candidate.base_weight.size());
-    std::vector<float> h_left_weight(candidate.left_weight.size());
-    std::vector<float> h_right_weight(candidate.right_weight.size());
-    dh::CopyDeviceSpanToVector(&h_base_weight, candidate.base_weight);
-    dh::CopyDeviceSpanToVector(&h_left_weight, candidate.left_weight);
-    dh::CopyDeviceSpanToVector(&h_right_weight, candidate.right_weight);
-
-    p_tree->ExpandNode(candidate.nidx, candidate.split.findex, candidate.split.fvalue,
-                       candidate.split.dir == kLeftDir, linalg::MakeVec(h_base_weight),
-                       linalg::MakeVec(h_left_weight), linalg::MakeVec(h_right_weight));
-
-    this->evaluator_.ApplyTreeSplit(this->ctx_, p_tree, candidate);
+    for (auto const& candidate : h_candidates) {
+      std::vector<float> h_base_weight(candidate.base_weight.size());
+      std::vector<float> h_left_weight(candidate.left_weight.size());
+      std::vector<float> h_right_weight(candidate.right_weight.size());
+      dh::CopyDeviceSpanToVector(&h_base_weight, candidate.base_weight);
+      dh::CopyDeviceSpanToVector(&h_left_weight, candidate.left_weight);
+      dh::CopyDeviceSpanToVector(&h_right_weight, candidate.right_weight);
+
+      p_tree->ExpandNode(candidate.nidx, candidate.split.findex, candidate.split.fvalue,
+                         candidate.split.dir == kLeftDir, linalg::MakeVec(h_base_weight),
+                         linalg::MakeVec(h_left_weight), linalg::MakeVec(h_right_weight));
+    }
+
+    dh::device_vector<MultiExpandEntry> candidates{h_candidates};
+    this->evaluator_.ApplyTreeSplit(this->ctx_, p_tree, dh::ToSpan(candidates), n_targets);
   }
   /**
    * @brief Calculate the leaf weight based on the node sum for each leaf.
@@ -464,7 +467,7 @@ class MultiTargetHistMaker {
 
   void GrowTree(linalg::Matrix<GradientPair>* split_gpair, DMatrix* p_fmat, ObjInfo const*,
                 RegTree* p_tree) {
-    if (this->param_.learning_rate - 1.0 != 0.0) {
+    if (!this->hist_param_->debug_synchronize) {
       LOG(FATAL) << "GPU" << MTNotImplemented();
     }
     Driver<MultiExpandEntry> driver{param_, kMaxNodeBatchSize};
@@ -475,10 +478,7 @@ class MultiTargetHistMaker {
     // The set of leaves that can be expanded asynchronously
     auto expand_set = driver.Pop();
     while (!expand_set.empty()) {
-      for (auto& candidate : expand_set) {
-        this->ApplySplit(candidate, p_tree);
-      }
-
+      this->ApplySplit(expand_set, p_tree);
       // Get the candidates we are allowed to expand further
       // e.g. We do not bother further processing nodes whose children are beyond max depth
       std::vector<MultiExpandEntry> valid_candidates;
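
With the learning-rate TODOs removed, the GPU multi_output_tree path now accepts a non-default eta, while the new guard in GrowTree still requires debug_synchronize. A usage sketch based on the parameters used in the updated tests (the "cuda" device string and the synthetic dataset are assumptions, not part of this diff):

import xgboost
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=512, n_features=16, random_state=2025, n_targets=3)
booster = xgboost.train(
    {
        "device": "cuda",
        "multi_strategy": "multi_output_tree",
        "learning_rate": 0.3,       # no longer restricted to 1.0 on GPU
        "debug_synchronize": True,  # still required by the GrowTree guard
    },
    xgboost.QuantileDMatrix(X, y),
    num_boost_round=8,
)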
