Commit 4bda887

[mt] Implement histogram subtraction. (#11825)
1 parent e460617

File tree: 8 files changed (+149, -95 lines)

8 files changed

+149
-95
lines changed

src/data/batch_utils.cc

Lines changed: 9 additions & 0 deletions

@@ -25,6 +25,15 @@ void CheckParam(BatchParam const& init, BatchParam const& param) {
       << "Only the `hist` tree method can use the `QuantileDMatrix`.";
 }
 
+/**
+ * @brief Check whether we should configure `min_cache_page_bytes`.
+ *
+ * Defined by @ref AutoCachePageBytes .
+ */
+[[nodiscard]] bool CachePageBytesIsAuto(std::int64_t min_cache_page_bytes) {
+  return min_cache_page_bytes == cuda_impl::AutoCachePageBytes();
+}
+
 [[nodiscard]] std::pair<double, std::int64_t> DftPageSizeHostRatio(
     std::size_t n_cache_bytes, bool is_validation, double cache_host_ratio,
     std::int64_t min_cache_page_bytes) {
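The predicate now compares against cuda_impl::AutoCachePageBytes() instead of the hardcoded -1 that the removed header version used (see the next file), so the sentinel is defined in exactly one place. A minimal self-contained sketch of the pattern, taking the -1 value from the old header code; ResolveCachePageBytes is a hypothetical caller added for illustration:

#include <cstdint>

namespace cuda_impl {
// Sentinel meaning "pick the cache page size automatically". The -1 mirrors
// the literal the removed header version compared against.
constexpr std::int64_t AutoCachePageBytes() { return -1; }
}  // namespace cuda_impl

[[nodiscard]] bool CachePageBytesIsAuto(std::int64_t min_cache_page_bytes) {
  return min_cache_page_bytes == cuda_impl::AutoCachePageBytes();
}

// Hypothetical caller: substitute a derived default only when the user left
// the knob on "auto".
std::int64_t ResolveCachePageBytes(std::int64_t requested, std::int64_t derived_dft) {
  return CachePageBytesIsAuto(requested) ? derived_dft : requested;
}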

src/data/batch_utils.h

Lines changed: 0 additions & 8 deletions

@@ -56,14 +56,6 @@ void CheckParam(BatchParam const& init, BatchParam const& param);
 [[nodiscard]] inline bool HostRatioIsAuto(float cache_host_ratio) {
   return std::isnan(cache_host_ratio);
 }
-/**
- * @brief Check whether we should configure `min_cache_page_bytes`.
- *
- * Defined by @ref AutoCachePageBytes .
- */
-[[nodiscard]] inline bool CachePageBytesIsAuto(std::int64_t min_cache_page_bytes) {
-  return min_cache_page_bytes == -1;
-}
 }  // namespace xgboost::data::detail
 
 namespace xgboost::cuda_impl {

src/tree/gpu_hist/expand_entry.cuh

Lines changed: 9 additions & 0 deletions

@@ -136,6 +136,10 @@ struct MultiExpandEntry {
   common::Span<float const> left_weight;
   common::Span<float const> right_weight;
 
+  // Sum Hessian of the first target. Used as a surrogate for node size.
+  double left_fst_hess{0};
+  double right_fst_hess{0};
+
   MultiExpandEntry() = default;
 
   [[nodiscard]] float GetLossChange() const { return split.loss_chg; }
@@ -165,6 +169,11 @@
     return true;
   }
 
+  __device__ void UpdateFirstHessian(GradientPairPrecise const& lg, GradientPairPrecise const& rg) {
+    this->left_fst_hess = lg.GetHess();
+    this->right_fst_hess = rg.GetHess();
+  }
+
   friend std::ostream& operator<<(std::ostream& os, MultiExpandEntry const& entry);
 };
 }  // namespace cuda_impl
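For many objectives the Hessian is close to one per instance, so the first target's Hessian sum is a cheap stand-in for the number of rows in a child; that is what makes it usable as a node-size surrogate when deciding which child's histogram to build. A small illustrative sketch of a consumer of the new fields (BuildPlan and PickBuildChild are hypothetical; only the left_fst_hess/right_fst_hess members come from the diff):

// Hypothetical consumer: build the histogram of the smaller child directly
// and derive the sibling's histogram by subtraction from the parent.
struct BuildPlan {
  bool build_left;  // true: build the left child, subtract to get the right
};

template <typename Entry>  // e.g. MultiExpandEntry
BuildPlan PickBuildChild(Entry const& e) {
  // Smaller first-target Hessian ~ fewer instances ~ cheaper direct build.
  return BuildPlan{e.left_fst_hess <= e.right_fst_hess};
}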

src/tree/gpu_hist/leaf_sum.cu

Lines changed: 2 additions & 1 deletion

@@ -60,7 +60,8 @@ void LeafGradSum(Context const* ctx, std::vector<LeafInfo> const& h_leaves,
     auto g = grad(sorted_ridx[j], t);
     return roundings[t].ToFixedPoint(g);
   });
-  // Use an output iterator to implement running sum.
+  // Use an output iterator to implement the running sum. Old Thrust versions either
+  // don't have this iterator, or it's unusable with a segmented sum.
 #if THRUST_MAJOR_VERSION >= 3
   auto out_it = thrust::make_tabulate_output_iterator(
       [=] XGBOOST_DEVICE(std::int32_t idx, GradientPairInt64 v) mutable { out_t(idx) += v; });
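Writing through a tabulate output iterator invokes the supplied callable with (position, assigned value), which is what turns the reduction's overwrite into an in-place accumulation here. A runnable sketch of the same trick with plain ints, assuming a Thrust recent enough to ship the iterator (hence the THRUST_MAJOR_VERSION guard above) and nvcc with extended lambdas:

#include <thrust/device_vector.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/tabulate_output_iterator.h>
#include <thrust/reduce.h>
#include <vector>

int main() {
  std::vector<int> h_keys{0, 0, 1, 1, 1};  // two segments
  std::vector<int> h_vals{1, 2, 3, 4, 5};
  thrust::device_vector<int> keys(h_keys.begin(), h_keys.end());
  thrust::device_vector<int> vals(h_vals.begin(), h_vals.end());
  thrust::device_vector<int> out(2, 100);  // pre-existing running sums

  int* d_out = thrust::raw_pointer_cast(out.data());
  // Assigning through out_it calls the lambda with (segment index, reduced
  // value), so each segment sum is added onto what `out` already holds.
  auto out_it = thrust::make_tabulate_output_iterator(
      [=] __device__(int idx, int v) { d_out[idx] += v; });

  thrust::reduce_by_key(keys.begin(), keys.end(), vals.begin(),
                        thrust::make_discard_iterator(), out_it);
  // out is now {103, 112}.
  return 0;
}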

src/tree/gpu_hist/multi_evaluate_splits.cu

Lines changed: 16 additions & 11 deletions

@@ -202,10 +202,6 @@ __global__ __launch_bounds__(kBlockThreads) void EvaluateSplitsKernel(
   AgentT agent{&temp_storage, fidx};
 
   auto n_targets = shared.Targets();
-  // The number of bins in a feature
-  auto f_hist_size =
-      (shared.feature_segments[fidx + 1] - shared.feature_segments[fidx]) * n_targets;
-
   auto candidate_idx = nidx * shared.Features() + fidx;
 
   if (shared.one_pass != MultiEvaluateSplitSharedInputs::kBackward) {
@@ -256,11 +252,12 @@ void MultiHistEvaluator::EvaluateSplits(Context const *ctx,
                GradientPairInt64{});
 
   // Create spans for each node's scan results
-  dh::device_vector<common::Span<GradientPairInt64>> scans(n_nodes);
+  std::vector<common::Span<GradientPairInt64>> h_scans(n_nodes);
   for (std::size_t nidx_in_set = 0; nidx_in_set < n_nodes; ++nidx_in_set) {
-    scans[nidx_in_set] = dh::ToSpan(this->scan_buffer_)
-                             .subspan(nidx_in_set * node_hist_size * 2, node_hist_size * 2);
+    h_scans[nidx_in_set] = dh::ToSpan(this->scan_buffer_)
+                               .subspan(nidx_in_set * node_hist_size * 2, node_hist_size * 2);
   }
+  dh::device_vector<common::Span<GradientPairInt64>> scans(h_scans);
 
   // Launch histogram scan kernel
   dim3 grid{n_nodes, n_features, n_targets};
@@ -328,32 +325,40 @@
     s_parent_gains[nidx_in_set] = parent_gain;
 
     bool l = true, r = true;
+    GradientPairPrecise lg_fst, rg_fst;
     for (bst_target_t t = 0; t < n_targets; ++t) {
      auto quantizer = d_roundings[t];
      auto sibling_sum = input.parent_sum[t] - node_sum[t];
 
      l = l && (node_sum[t].GetQuantisedHess() == 0);
      r = r && (sibling_sum.GetQuantisedHess() == 0);
 
+      GradientPairPrecise lg, rg;
      if (best_split.dir == kRightDir) {
        // forward pass, node_sum is the left sum
-        auto lg = quantizer.ToFloatingPoint(node_sum[t]);
+        lg = quantizer.ToFloatingPoint(node_sum[t]);
        left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess());
-        auto rg = quantizer.ToFloatingPoint(sibling_sum);
+        rg = quantizer.ToFloatingPoint(sibling_sum);
        right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess());
      } else {
        // backward pass, node_sum is the right sum
-        auto rg = quantizer.ToFloatingPoint(node_sum[t]);
+        rg = quantizer.ToFloatingPoint(node_sum[t]);
        right_weight[t] = CalcWeight(shared_inputs.param, rg.GetGrad(), rg.GetHess());
-        auto lg = quantizer.ToFloatingPoint(sibling_sum);
+        lg = quantizer.ToFloatingPoint(sibling_sum);
        left_weight[t] = CalcWeight(shared_inputs.param, lg.GetGrad(), lg.GetHess());
      }
+
+      if (t == 0) {
+        lg_fst = lg;
+        rg_fst = rg;
+      }
    }
 
    // Set up the output entry
    out_splits[nidx_in_set] = {input.nidx, input.depth, best_split,
                               base_weight, left_weight, right_weight};
    out_splits[nidx_in_set].split.loss_chg -= parent_gain;
+    out_splits[nidx_in_set].UpdateFirstHessian(lg_fst, rg_fst);
 
    if (l || r) {
      out_splits[nidx_in_set] = {};
src/tree/gpu_hist/multi_evaluate_splits.cuh

Lines changed: 3 additions & 1 deletion

@@ -12,9 +12,11 @@
 namespace xgboost::tree::cuda_impl {
 /** @brief Evaluator for vector leaf. */
 class MultiHistEvaluator {
+  // Buffer for node weights
   dh::device_vector<float> weights_;
-
+  // Buffer for histogram scans.
   dh::device_vector<GradientPairInt64> scan_buffer_;
+  // Buffer for node gradient sums.
   dh::device_vector<GradientPairInt64> node_sums_;
 
  public:

src/tree/updater_gpu_hist.cu

Lines changed: 8 additions & 28 deletions

@@ -76,33 +76,6 @@ struct NodeSplitData {
 };
 static_assert(std::is_trivially_copyable_v<NodeSplitData>);
 
-// Some nodes we will manually compute histograms for, others we will do by subtraction.
-void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
-                 std::vector<GPUExpandEntry> const& candidates,
-                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
-  auto const& tree = p_tree->HostScView();
-  std::size_t nidx_in_set{0};
-  auto p_build_nidx = nodes_to_build.data();
-  auto p_sub_nidx = nodes_to_sub.data();
-  for (auto& e : candidates) {
-    // Decide whether to build the left histogram or the right histogram. Use the sum of
-    // Hessians as a heuristic to select the node with the fewest training instances. This
-    // optimization is for distributed training, to avoid an allreduce call for
-    // synchronizing the number of instances for each node.
-    auto left_sum = quantizer->ToFloatingPoint(e.split.left_sum);
-    auto right_sum = quantizer->ToFloatingPoint(e.split.right_sum);
-    bool fewer_right = right_sum.GetHess() < left_sum.GetHess();
-    if (fewer_right) {
-      p_build_nidx[nidx_in_set] = tree.RightChild(e.nidx);
-      p_sub_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
-    } else {
-      p_build_nidx[nidx_in_set] = tree.LeftChild(e.nidx);
-      p_sub_nidx[nidx_in_set] = tree.RightChild(e.nidx);
-    }
-    ++nidx_in_set;
-  }
-}
-
 // GPU tree updater implementation.
 struct GPUHistMakerDevice {
  private:
@@ -501,9 +474,16 @@
     auto nodes = this->CreatePartitionNodes(p_tree, is_single_block ? candidates : expand_set);
 
     // Prepare for build hist
+    auto const& tree = p_tree->HostScView();
     std::vector<bst_node_t> build_nidx(candidates.size());
     std::vector<bst_node_t> subtraction_nidx(candidates.size());
-    AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx);
+    cuda_impl::AssignNodes(tree, candidates, build_nidx, subtraction_nidx,
+                           [&](GPUExpandEntry const& e) {
+                             auto left_sum = this->quantiser->ToFloatingPoint(e.split.left_sum);
+                             auto right_sum = this->quantiser->ToFloatingPoint(e.split.right_sum);
+                             bool fewer_right = right_sum.GetHess() < left_sum.GetHess();
+                             return fewer_right;
+                           });
     auto prefetch_copy = !build_nidx.empty() && this->NeedCopy(p_fmat, candidates);
 
     this->histogram_.AllocateHistograms(ctx_, build_nidx, subtraction_nidx);
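AssignNodes is now shared from cuda_impl and parameterized by a predicate, so the single- and multi-target updaters can reuse the build-versus-subtract bookkeeping while supplying their own notion of which child is smaller (the multi-target path can use the first-target Hessians recorded above). Only the call site is visible in this diff, so the following is a hypothetical sketch of such a helper, not the committed definition:

#include <cstddef>
#include <vector>

// Sketch of a predicate-driven AssignNodes. `TreeView` and `Entry` stand in
// for the tree accessor and expand-entry types; `fewer_right` returns true
// when the right child has fewer (weighted) instances.
template <typename TreeView, typename Entry, typename FewerRight>
void AssignNodes(TreeView const& tree, std::vector<Entry> const& candidates,
                 std::vector<int>& nodes_to_build, std::vector<int>& nodes_to_sub,
                 FewerRight&& fewer_right) {
  for (std::size_t i = 0; i < candidates.size(); ++i) {
    auto const& e = candidates[i];
    // Build the smaller child's histogram directly; its sibling follows from
    // the parent by bin-wise histogram subtraction:
    //   hist[sibling][bin] = hist[parent][bin] - hist[built][bin]
    if (fewer_right(e)) {
      nodes_to_build[i] = tree.RightChild(e.nidx);
      nodes_to_sub[i] = tree.LeftChild(e.nidx);
    } else {
      nodes_to_build[i] = tree.LeftChild(e.nidx);
      nodes_to_sub[i] = tree.RightChild(e.nidx);
    }
  }
}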
