Skip to content

Commit 5598949

Browse files
author
Dmitry Razdoburdin
committed
make row/col wise hist dispatching more clear
1 parent 465cd68 commit 5598949

File tree

1 file changed

+17
-7
lines changed

1 file changed

+17
-7
lines changed

src/tree/hist/histogram.h

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -245,11 +245,11 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
245245
// Estimate the size of each data block based on model parameters and L1 capacity
246246
// The general idea is to keep as much working-set data in L1 as possible.
247247
/* Each processed row occupies ~32 bytes in L1:
248-
* - gradient pair (p_gpair): 2 * sizeof(float)
248+
* - gradient pair (p_gpair): sizeof(GradientPair)
249249
* - row index (rid[i]): sizeof(size_t)
250250
* - icol_start and icol_end: 2 * sizeof(size_t)
251251
*/
252-
std::size_t l1_row_foot_print = (2 * sizeof(float) + 3 * sizeof(size_t));
252+
std::size_t l1_row_foot_print = (sizeof(GradientPair) + 3 * sizeof(size_t));
253253
double usable_l1_size = 0.8 * l1_size;
254254

255255
std::size_t space_in_l1_for_rows;
@@ -262,9 +262,10 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
262262
*/
263263

264264
/* First step: determine whether one histogram column fits into L1.
265-
* The maximum number of elements in a column is 2^8, 2^16, or 2^32,
266-
* depending on the bin index size.
267-
*/
265+
* The maximum number of bins in a column is 2^8, 2^16, or 2^32,
266+
* depending on the bin index size.
267+
* Note: column-wise kernel is used for dense data only.
268+
*/
268269
std::size_t max_elem_in_hist_col = 1u << (8 * gidx.index.GetBinTypeSize());
269270
std::size_t hist_col_size = 2 * sizeof(double) * max_elem_in_hist_col;
270271
bool hist_col_fit_to_l1 = hist_col_size < usable_l1_size;
@@ -289,7 +290,7 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
289290
/* Second step: estimate the extra L1 footprint caused by prefetching.
290291
* Prefetching is not always active, so the estimate is intentionally conservative.
291292
*/
292-
l1_row_foot_print += 2 * sizeof(float);
293+
l1_row_foot_print += sizeof(GradientPair);
293294
std::size_t idx_bin_size = n_columns * sizeof(uint32_t);
294295

295296
bool hist_fit_to_l1 = (hist_size + offsets_size + idx_bin_size) < usable_l1_size;
@@ -304,7 +305,7 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
304305
* This ensures that a full cache line is utilized when loading gradient pairs.
305306
*/
306307
constexpr std::size_t kCacheLineSize = 64;
307-
constexpr std::size_t kMinBlockSize = kCacheLineSize / (2 * sizeof(float));
308+
constexpr std::size_t kMinBlockSize = kCacheLineSize / sizeof(GradientPair);
308309
block_size = std::max<std::size_t>(kMinBlockSize, block_size);
309310

310311
common::BlockedSpace2d space{
@@ -329,6 +330,15 @@ class MultiHistogramBuilder {
329330
size_t hist_size = 2 * sizeof(double) * nbins;
330331
const bool hist_fit_to_l2 = 0.8 * cache_manager_.L2Size() > hist_size;
331332

333+
/* In row-wise histogram construction, each iteration of the outer (row-wise) loop
334+
* accesses bins across the entire histogram; the bins are not localized.
335+
* If the histogram is too large to fit in L2 cache, random access becomes a major performance bottleneck.
336+
*
337+
* For dense data, using column-wise histogram construction,
338+
* each iteration of the outer (column-wise) loop accesses only a localized portion of the histogram:
339+
* idx_bin = gradient_index(row_id, col_id) + offset[col_id].
340+
* This improves cache locality, so the column-wise kernel outperforms the row-wise kernel in this case.
341+
*/
332342
bool read_by_column = !hist_fit_to_l2 && gidx.IsDense();
333343
return read_by_column;
334344
}

0 commit comments

Comments
 (0)