@@ -245,11 +245,11 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
245245 // Estimate the size of each data block based on model parameters and L1 capacity
246246 // The general idea is to keep as much working-set data in L1 as possible.
247247 /* Each processed row occupies ~32 bytes in L1:
248- * - gradient pair (p_gpair): 2 * sizeof(float )
248+ * - gradient pair (p_gpair): sizeof(GradientPair )
249249 * - row index (rid[i]): sizeof(size_t)
250250 * - icol_start and icol_end: 2 * sizeof(size_t)
251251 */
252- std::size_t l1_row_foot_print = (2 * sizeof (float ) + 3 * sizeof (size_t ));
252+ std::size_t l1_row_foot_print = (sizeof (GradientPair ) + 3 * sizeof (size_t ));
253253 double usable_l1_size = 0.8 * l1_size;
254254
255255 std::size_t space_in_l1_for_rows;
@@ -262,9 +262,10 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
262262 */
263263
264264 /* First step: determine whether one histogram column fits into L1.
265- * The maximum number of elements in a column is 2^8, 2^16, or 2^32,
266- * depending on the bin index size.
267- */
265+ * The maximum number of bins in a column is 2^8, 2^16, or 2^32,
266+ * depending on the bin index size.
267+ * Note: column-wise kernel is used for dense data only.
268+ */
268269 std::size_t max_elem_in_hist_col = 1u << (8 * gidx.index .GetBinTypeSize ());
269270 std::size_t hist_col_size = 2 * sizeof (double ) * max_elem_in_hist_col;
270271 bool hist_col_fit_to_l1 = hist_col_size < usable_l1_size;
@@ -289,7 +290,7 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
289290 /* Second step: estimate the extra L1 footprint caused by prefetching.
290291 * Prefetching is not always active, so the estimate is intentionally conservative.
291292 */
292- l1_row_foot_print += 2 * sizeof (float );
293+ l1_row_foot_print += sizeof (GradientPair );
293294 std::size_t idx_bin_size = n_columns * sizeof (uint32_t );
294295
295296 bool hist_fit_to_l1 = (hist_size + offsets_size + idx_bin_size) < usable_l1_size;
@@ -304,7 +305,7 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners,
304305 * This ensures that a full cache line is utilized when loading gradient pairs.
305306 */
306307 constexpr std::size_t kCacheLineSize = 64 ;
307- constexpr std::size_t kMinBlockSize = kCacheLineSize / ( 2 * sizeof (float ) );
308+ constexpr std::size_t kMinBlockSize = kCacheLineSize / sizeof (GradientPair );
308309 block_size = std::max<std::size_t >(kMinBlockSize , block_size);
309310
310311 common::BlockedSpace2d space{
@@ -329,6 +330,15 @@ class MultiHistogramBuilder {
329330 size_t hist_size = 2 * sizeof (double ) * nbins;
330331 const bool hist_fit_to_l2 = 0.8 * cache_manager_.L2Size () > hist_size;
331332
333+ /* In row-wise histogram construction, each iteration of the outer (row-wise) loop
334+ * accesses bins across the entire histogram; the bins are not localized.
335+ * If the histogram is too large to fit in L2 cache, random access becomes a major performance bottleneck.
336+ *
337+ * or dense data, using column-wise histogram construction,
338+ * each iteration of the outer (column-wise) loop accesses only a localized portion of the histogram:
339+ * idx_bin = gradient_index(row_id, col_id) + offset[col_id].
340+ * This improves cache locality, so the column-wise kernel outperforms the row-wise kernel in this case.
341+ */
332342 bool read_by_column = !hist_fit_to_l2 && gidx.IsDense ();
333343 return read_by_column;
334344 }
0 commit comments