Skip to content
46 changes: 25 additions & 21 deletions python-package/xgboost/testing/multi_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from .._typing import ArrayLike
from ..compat import import_cupy
from ..core import Booster, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
from ..core import Booster, DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix, build_info
from ..objective import Objective, TreeObjective
from ..sklearn import XGBClassifier
from ..training import train
Expand Down Expand Up @@ -172,6 +172,15 @@ def run_with_iter(device: Device) -> None: # pylint: disable=too-many-locals
n_rounds = 8
n_targets = 3
intercept = [0.5] * n_targets

params = {
"device": device,
"multi_strategy": "multi_output_tree",
"learning_rate": 1.0,
"base_score": intercept,
"debug_synchronize": True,
}

Xs = []
ys = []
for i in range(n_batches):
Expand All @@ -185,12 +194,7 @@ def run_with_iter(device: Device) -> None: # pylint: disable=too-many-locals

evals_result_0: Dict[str, Dict] = {}
booster_0 = train(
{
"device": device,
"multi_strategy": "multi_output_tree",
"learning_rate": 1.0,
"base_score": intercept,
},
params,
Xy,
num_boost_round=n_rounds,
evals=[(Xy, "Train")],
Expand All @@ -201,12 +205,7 @@ def run_with_iter(device: Device) -> None: # pylint: disable=too-many-locals
Xy = QuantileDMatrix(it)
evals_result_1: Dict[str, Dict] = {}
booster_1 = train(
{
"device": device,
"multi_strategy": "multi_output_tree",
"learning_rate": 1.0,
"base_score": intercept,
},
params,
Xy,
num_boost_round=n_rounds,
evals=[(Xy, "Train")],
Expand All @@ -219,18 +218,23 @@ def run_with_iter(device: Device) -> None: # pylint: disable=too-many-locals
X, _, _ = it.as_arrays()
assert_allclose(device, booster_0.inplace_predict(X), booster_1.inplace_predict(X))

it = IteratorForTest(Xs, ys, None, cache="cache", on_host=True)
v = build_info()["THRUST_VERSION"]
if v[0] < 3:
pytest.xfail("CCCL version too old.")

it = IteratorForTest(
Xs,
ys,
None,
cache="cache",
on_host=True,
min_cache_page_bytes=X.shape[0] // n_batches * X.shape[1],
)
Xy = ExtMemQuantileDMatrix(it, cache_host_ratio=1.0)

evals_result_2: Dict[str, Dict] = {}
booster_2 = train(
{
"device": device,
"multi_strategy": "multi_output_tree",
"learning_rate": 1.0,
"base_score": intercept,
"debug_synchronize": True,
},
params,
Xy,
evals=[(Xy, "Train")],
obj=LsObj0(),
Expand Down
2 changes: 1 addition & 1 deletion src/common/device_vector.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ template <typename T>
using caching_device_vector = thrust::device_vector<T, XGBCachingDeviceAllocator<T>>; // NOLINT

/**
* @brief Container class that doesn't initialize the data when RMM is used.
* @brief Container class that doesn't initialize the data.
*/
template <typename T, bool is_caching>
class DeviceUVectorImpl {
Expand Down
6 changes: 3 additions & 3 deletions src/tree/gpu_hist/expand_entry.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) {
<< "depth: " << e.depth << "\n"
<< "loss: " << e.split.loss_chg << "\n";

std::vector<GradientPairInt64> h_node_sum(e.split.node_sum.size());
dh::CopyDeviceSpanToVector(&h_node_sum, e.split.node_sum);
std::vector<GradientPairInt64> h_node_sum(e.split.child_sum.size());
dh::CopyDeviceSpanToVector(&h_node_sum, e.split.child_sum);

auto print_span = [&](auto const& span) {
using T = typename common::GetValueT<decltype(span)>::value_type;
Expand All @@ -38,7 +38,7 @@ std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& e) {
} else {
os << "right_sum: ";
}
print_span(e.split.node_sum);
print_span(e.split.child_sum);

os << "base_weight: ";
print_span(e.base_weight);
Expand Down
37 changes: 27 additions & 10 deletions src/tree/gpu_hist/leaf_sum.cu
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
/**
* Copyright 2025, XGBoost contributors
*/
#include <cstddef> // for size_t
#include <vector> // for vector
#include <thrust/scan.h> // for inclusive_scan
#include <thrust/version.h> // for THRUST_MAJOR_VERSION

#include "../../common/linalg_op.cuh" // for tbegin
#include "../updater_gpu_common.cuh" // for GPUTrainingParam
#include <cstddef> // for size_t
#include <cstdint> // for int32_t
#include <cub/device/device_segmented_reduce.cuh> // for DeviceSegmentedReduce
#include <vector> // for vector

#include "../updater_gpu_common.cuh" // for GPUTrainingParam
#include "leaf_sum.cuh"
#include "quantiser.cuh" // for GradientQuantiser
#include "row_partitioner.cuh" // for RowIndexT, LeafInfo
Expand All @@ -14,6 +18,12 @@
#include "xgboost/linalg.h" // for MatrixView
#include "xgboost/span.h" // for Span

#if THRUST_MAJOR_VERSION >= 3
#include <thrust/iterator/tabulate_output_iterator.h> // for make_tabulate_output_iterator
#else
#include "../../common/linalg_op.cuh" // for tbegin
#endif

namespace xgboost::tree::cuda_impl {
void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves,
common::Span<GradientQuantiser const> roundings,
Expand Down Expand Up @@ -50,14 +60,22 @@ void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves,
auto g = grad(sorted_ridx[j], t);
return roundings[t].ToFixedPoint(g);
});
// Use an output iterator to implement running sum.
#if THRUST_MAJOR_VERSION >= 3
auto out_it = thrust::make_tabulate_output_iterator(
[=] XGBOOST_DEVICE(std::int32_t idx, GradientPairInt64 v) mutable { out_t(idx) += v; });
#else
auto out_it = linalg::tbegin(out_t);
#endif

std::size_t n_bytes = 0;
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, it, linalg::tbegin(out_t),
h_leaves.size(), indptr.data(), indptr.data() + 1,
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, it, out_it, h_leaves.size(),
indptr.data(), indptr.data() + 1,
ctx->CUDACtx()->Stream()));
dh::TemporaryArray<char> alloc(n_bytes);
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(
alloc.data().get(), n_bytes, it, linalg::tbegin(out_t), h_leaves.size(), indptr.data(),
indptr.data() + 1, ctx->CUDACtx()->Stream()));
dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(alloc.data().get(), n_bytes, it, out_it,
h_leaves.size(), indptr.data(), indptr.data() + 1,
ctx->CUDACtx()->Stream()));
}
}

Expand All @@ -66,7 +84,6 @@ void LeafWeight(Context const* ctx, GPUTrainingParam const& param,
linalg::MatrixView<GradientPairInt64 const> grad_sum,
linalg::MatrixView<float> out_weights) {
CHECK(grad_sum.Contiguous());
auto s_grad_sum = grad_sum.Values();
dh::LaunchN(grad_sum.Size(), ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t i) mutable {
auto [nidx_in_set, t] = linalg::UnravelIndex(i, grad_sum.Shape());
auto g = roundings[t].ToFloatingPoint(grad_sum(nidx_in_set, t));
Expand Down
12 changes: 3 additions & 9 deletions src/tree/gpu_hist/multi_evaluate_splits.cu
Original file line number Diff line number Diff line change
Expand Up @@ -280,10 +280,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
auto d_weights = dh::ToSpan(this->weights_);

dh::CachingDeviceUVector<float> d_parent_gains(n_nodes);
dh::CachingDeviceUVector<std::int32_t> sum_zeros(n_nodes * 2);

auto s_parent_gains = dh::ToSpan(d_parent_gains);
auto s_sum_zeros = dh::ToSpan(sum_zeros);
auto s_d_splits = dh::ToSpan(d_splits);

// Process results for each node
Expand All @@ -304,7 +301,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
dh::LaunchN(n_nodes, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t nidx_in_set) {
auto input = d_inputs[nidx_in_set];
MultiSplitCandidate best_split = d_best_splits[nidx_in_set];
if (best_split.node_sum.empty()) {
if (best_split.child_sum.empty()) {
// Invalid split
out_splits[nidx_in_set] = {};
return;
Expand All @@ -316,7 +313,7 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
auto right_weight = d_weights.subspan(nidx_in_set * n_targets * 3 + n_targets * 2, n_targets);

auto d_roundings = shared_inputs.roundings;
auto node_sum = best_split.node_sum;
auto node_sum = best_split.child_sum;

float parent_gain = 0;
for (bst_target_t t = 0; t < n_targets; ++t) {
Expand Down Expand Up @@ -353,9 +350,6 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
}
}

s_sum_zeros[nidx_in_set * 2] = l;
s_sum_zeros[nidx_in_set * 2 + 1] = r;

// Set up the output entry
out_splits[nidx_in_set] = {input.nidx, input.depth, best_split,
base_weight, left_weight, right_weight};
Expand Down Expand Up @@ -384,7 +378,7 @@ void MultiHistEvaluator::ApplyTreeSplit(Context const *ctx, RegTree const *p_tre
// TODO(jiamingy): We need to batch the nodes
auto best_split = candidate.split;

auto node_sum = best_split.node_sum;
auto node_sum = best_split.child_sum;
dh::LaunchN(n_targets, ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t t) {
auto sibling_sum = parent_sum[t] - node_sum[t];
if (best_split.dir == kRightDir) {
Expand Down
3 changes: 1 addition & 2 deletions src/tree/gpu_hist/row_partitioner.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright 2017-2024, XGBoost contributors
* Copyright 2017-2025, XGBoost contributors
*/
#include <thrust/sequence.h> // for sequence

Expand All @@ -13,7 +13,6 @@ namespace xgboost::tree {
void RowPartitioner::Reset(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid) {
ridx_segments_.clear();
ridx_.resize(n_samples);
ridx_tmp_.resize(n_samples);
tmp_.clear();
n_nodes_ = 1; // Root

Expand Down
92 changes: 75 additions & 17 deletions src/tree/gpu_hist/row_partitioner.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -174,21 +174,24 @@ void SortPositionBatch(Context const* ctx, common::Span<const PerNodeData<OpData
auto ret =
cub::DispatchScan<decltype(input_iterator), decltype(discard_write_iterator), IndexFlagOp,
cub::NullType, std::uint64_t>::Dispatch(nullptr, n_bytes, input_iterator,
discard_write_iterator,
IndexFlagOp{}, cub::NullType{},
static_cast<std::uint64_t>(total_rows),
ctx->CUDACtx()->Stream());
discard_write_iterator,
IndexFlagOp{}, cub::NullType{},
static_cast<std::uint64_t>(
total_rows),
ctx->CUDACtx()->Stream());
dh::safe_cuda(ret);
tmp->resize(n_bytes);
}
n_bytes = tmp->size();
auto ret =
cub::DispatchScan<decltype(input_iterator), decltype(discard_write_iterator), IndexFlagOp,
cub::NullType, std::uint64_t>::Dispatch(tmp->data(), n_bytes, input_iterator,
discard_write_iterator,
IndexFlagOp{}, cub::NullType{},
static_cast<std::uint64_t>(total_rows),
ctx->CUDACtx()->Stream());
cub::NullType, std::uint64_t>::Dispatch(tmp->data(), n_bytes,
input_iterator,
discard_write_iterator,
IndexFlagOp{}, cub::NullType{},
static_cast<std::uint64_t>(
total_rows),
ctx->CUDACtx()->Stream());
dh::safe_cuda(ret);

constexpr int kBlockSize = 256;
Expand Down Expand Up @@ -272,8 +275,6 @@ class RowPartitioner {
* rows idx | 3, 5, 1 | 13, 31 |
*/
dh::DeviceUVector<RowIndexT> ridx_;
// Staging area for sorting ridx
dh::DeviceUVector<RowIndexT> ridx_tmp_;
dh::DeviceUVector<int8_t> tmp_;
dh::PinnedMemory pinned_;
dh::PinnedMemory pinned2_;
Expand Down Expand Up @@ -343,7 +344,8 @@ class RowPartitioner {
void UpdatePositionBatch(Context const* ctx, std::vector<bst_node_t> const& nidx,
std::vector<bst_node_t> const& left_nidx,
std::vector<bst_node_t> const& right_nidx,
std::vector<OpDataT> const& op_data, UpdatePositionOpT op) {
std::vector<OpDataT> const& op_data, common::Span<RowIndexT> ridx_tmp,
UpdatePositionOpT op) {
if (nidx.empty()) {
return;
}
Expand All @@ -366,20 +368,21 @@ class RowPartitioner {
auto h_counts = pinned_.GetSpan<RowIndexT>(nidx.size());
// Must initialize with 0 as 0 count is not written in the kernel.
dh::TemporaryArray<RowIndexT> d_counts(nidx.size(), 0);
CHECK_EQ(ridx_tmp.size(), this->Size());

// Process a sub-batch
auto sub_batch_impl = [ctx, op, this](common::Span<bst_node_t const> nidx,
common::Span<PerNodeData<OpDataT>> d_batch_info,
common::Span<RowIndexT> d_counts) {
auto sub_batch_impl = [&](common::Span<bst_node_t const> nidx,
common::Span<PerNodeData<OpDataT>> d_batch_info,
common::Span<RowIndexT> d_counts) {
std::size_t total_rows = 0;
for (bst_node_t i : nidx) {
total_rows += this->ridx_segments_[i].segment.Size();
}

// Partition the rows according to the operator
SortPositionBatch<UpdatePositionOpT, OpDataT>(ctx, d_batch_info, dh::ToSpan(this->ridx_),
dh::ToSpan(this->ridx_tmp_), d_counts,
total_rows, op, &this->tmp_);
ridx_tmp, d_counts, total_rows, op,
&this->tmp_);
};

// Divide inputs into sub-batches.
Expand Down Expand Up @@ -441,4 +444,59 @@ class RowPartitioner {
base_ridx, d_ridx, d_out_position, op);
}
};

// Partitioner for all batches, used for external memory training.
class RowPartitionerBatches {
private:
// Temporary buffer for sorting the samples.
dh::DeviceUVector<cuda_impl::RowIndexT> ridx_tmp_;
// Partitioners for each batch.
std::vector<std::unique_ptr<RowPartitioner>> partitioners_;

public:
void Reset(Context const* ctx, std::vector<bst_idx_t> const& batch_ptr) {
CHECK_GE(batch_ptr.size(), 2);
std::size_t n_batches = batch_ptr.size() - 1;
if (partitioners_.size() != n_batches) {
partitioners_.clear();
}

bst_idx_t n_max_samples = 0;
for (std::size_t k = 0; k < n_batches; ++k) {
if (partitioners_.size() != n_batches) {
// First run.
partitioners_.emplace_back(std::make_unique<RowPartitioner>());
}
auto base_ridx = batch_ptr[k];
auto n_samples = batch_ptr.at(k + 1) - base_ridx;
partitioners_[k]->Reset(ctx, n_samples, base_ridx);
CHECK_LE(n_samples, std::numeric_limits<cuda_impl::RowIndexT>::max());
n_max_samples = std::max(n_samples, n_max_samples);
}
this->ridx_tmp_.resize(n_max_samples);
}

// Accessors
[[nodiscard]] decltype(auto) operator[](std::size_t i) { return partitioners_[i]; }
decltype(auto) At(std::size_t i) { return partitioners_.at(i); }
[[nodiscard]] std::size_t Size() const { return this->partitioners_.size(); }
decltype(auto) cbegin() const { return this->partitioners_.cbegin(); } // NOLINT
decltype(auto) cend() const { return this->partitioners_.cend(); } // NOLINT
decltype(auto) begin() const { return this->partitioners_.cbegin(); } // NOLINT
decltype(auto) end() const { return this->partitioners_.cend(); } // NOLINT

[[nodiscard]] decltype(auto) Front() { return this->partitioners_.front(); }
[[nodiscard]] bool Empty() const { return this->partitioners_.empty(); }

template <typename UpdatePositionOpT, typename OpDataT>
void UpdatePositionBatch(Context const* ctx, std::int32_t batch_idx,
std::vector<bst_node_t> const& nidx,
std::vector<bst_node_t> const& left_nidx,
std::vector<bst_node_t> const& right_nidx,
std::vector<OpDataT> const& op_data, UpdatePositionOpT op) {
auto& part = this->At(batch_idx);
auto ridx_tmp = dh::ToSpan(this->ridx_tmp_).subspan(0, part->Size());
part->UpdatePositionBatch(ctx, nidx, left_nidx, right_nidx, op_data, ridx_tmp, op);
}
};
}; // namespace xgboost::tree
Loading
Loading