dmlc · trivialfis · Jul 18, 2025 · Jul 9, 2025 · Jul 10, 2025 · Jul 10, 2025
diff --git a/ops/conda_env/linux_sycl_test.yml b/ops/conda_env/linux_sycl_test.yml
@@ -20,6 +20,6 @@ dependencies:
 - pytest-cov
 - dask=2024.11
 - ninja
-- dpcpp_linux-64
+- dpcpp_linux-64>=2024.2.1
 - onedpl-devel
 - intel-openmp
diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc
@@ -7,6 +7,7 @@
 #include <algorithm>
 
 #include "../data/gradient_index.h"
+#include "../tree/hist_dispatcher.h"
 #include "hist_util.h"
 
 #include <sycl/sycl.hpp>
@@ -91,28 +92,27 @@ template ::sycl::event SubtractionHist(::sycl::queue* qu,
                               const GHistRow<double, MemoryType::on_device>& src2,
                               size_t size, ::sycl::event event_priv);
 
-inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) {
-  struct _ {
-    size_t block_size, nblocks;
-  };
+template <typename GradientPairT>
+::sycl::event ReduceHist(::sycl::queue* qu, GradientPairT* hist_data,
+                         GradientPairT* hist_buffer_data,
+                         size_t  nblocks, size_t nbins,
+                         const ::sycl::event& event_main) {
+  auto event_save = qu->submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event_main);
+    cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
+      size_t idx_bin = pid.get_id(0);
 
-  const size_t min_block_size = 32;
-  const size_t max_compute_units =
-    qu->get_device().get_info<::sycl::info::device::max_compute_units>();
+      GradientPairT gpair = {0, 0};
 
-  size_t nblocks = max_compute_units;
+      for (size_t j = 0; j < nblocks; ++j) {
+        gpair += hist_buffer_data[j * nbins + idx_bin];
+      }
 
-  size_t block_size = size / nblocks + !!(size % nblocks);
-  if (block_size > (1u << 12)) {
-    nblocks = max_nblocks;
-    block_size = size / nblocks + !!(size % nblocks);
-  }
-  if (block_size < min_block_size) {
-    block_size = min_block_size;
-    nblocks = size / block_size + !!(size % block_size);
-  }
+      hist_data[idx_bin] = gpair;
+    });
+  });
 
-  return _{block_size, nblocks};
+  return event_save;
 }
 
 // Kernel with buffer using
@@ -123,6 +123,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
                             const GHistIndexMatrix& gmat,
                             GHistRow<FPType, MemoryType::on_device>* hist,
                             GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                            const tree::HistDispatcher<FPType>& dispatcher,
                             ::sycl::event event_priv) {
   using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
   const size_t size = row_indices.Size();
@@ -133,18 +134,13 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
   const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
   const size_t nbins = gmat.nbins;
 
-  const size_t max_work_group_size =
-    qu->get_device().get_info<::sycl::info::device::max_work_group_size>();
-  const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;
-
-  // Captured structured bindings are a C++20 extension
-  const auto block_params = GetBlocksParameters(qu, size, hist_buffer->Size() / (nbins * 2));
-  const size_t block_size = block_params.block_size;
-  const size_t nblocks = block_params.nblocks;
+  const size_t work_group_size = dispatcher.work_group_size;
+  const size_t block_size = dispatcher.block.size;
+  const size_t nblocks = dispatcher.block.nblocks;
 
   GradientPairT* hist_buffer_data = hist_buffer->Data();
   auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0),
-                             nblocks * nbins * 2, event_priv);
+                             nblocks * nbins, event_priv);
   auto event_main = qu->submit([&](::sycl::handler& cgh) {
     cgh.depends_on(event_fill);
     cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
@@ -179,20 +175,84 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
   });
 
   GradientPairT* hist_data = hist->Data();
-  auto event_save = qu->submit([&](::sycl::handler& cgh) {
-    cgh.depends_on(event_main);
-    cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
-      size_t idx_bin = pid.get_id(0);
+  auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
+                               nbins, event_main);
 
-      GradientPairT gpair = {0, 0};
+  return event_save;
+}
 
-      for (size_t j = 0; j < nblocks; ++j) {
-        gpair += hist_buffer_data[j * nbins + idx_bin];
-      }
+// Kernel with buffer and local hist using
+template<typename FPType, typename BinIdxType>
+::sycl::event BuildHistKernelLocal(::sycl::queue* qu,
+                            const HostDeviceVector<GradientPair>& gpair,
+                            const RowSetCollection::Elem& row_indices,
+                            const GHistIndexMatrix& gmat,
+                            GHistRow<FPType, MemoryType::on_device>* hist,
+                            GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                            const tree::HistDispatcher<FPType>& dispatcher,
+                            ::sycl::event event_priv) {
+  constexpr int kMaxNumBins = tree::HistDispatcher<FPType>::KMaxNumBins;
+  using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
+  const size_t size = row_indices.Size();
+  const size_t* rid = row_indices.begin;
+  const size_t n_columns = gmat.nfeatures;
+  const auto* pgh = gpair.ConstDevicePointer();
+  const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
+  const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
+  const size_t nbins = gmat.nbins;
 
-      hist_data[idx_bin] = gpair;
+  const size_t work_group_size = dispatcher.work_group_size;
+  const size_t block_size = dispatcher.block.size;
+  const size_t nblocks = dispatcher.block.nblocks;
+
+  GradientPairT* hist_buffer_data = hist_buffer->Data();
+
+  auto event_main = qu->submit([&](::sycl::handler& cgh) {
+    cgh.depends_on(event_priv);
+    cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
+                                           ::sycl::range<2>(1, work_group_size)),
+                       [=](::sycl::nd_item<2> pid) {
+      size_t block = pid.get_global_id(0);
+      size_t feat = pid.get_global_id(1);
+
+      // This buffer will be keeped in L1/registers
+      GradientPairT hist_fast[kMaxNumBins];
+
+      GradientPairT* hist_local = hist_buffer_data + block * nbins;
+      for (size_t fid = feat; fid < n_columns; fid += work_group_size) {
+        size_t n_bins_feature = offsets[fid+1] - offsets[fid];
+
+        // Not all elements of hist_fast are actually used: n_bins_feature <= kMaxNumBins
+        // We initililize only the requared elements to prevent the unused go to cache.
+        for (int bin = 0; bin < n_bins_feature; ++bin) {
+          hist_fast[bin] = {0, 0};
+        }
+
+        for (size_t idx = 0; idx < block_size; ++idx) {
+          size_t i = block * block_size + idx;
+          if (i < size) {
+            size_t row_id = rid[i];
+
+            const size_t icol_start = n_columns * row_id;
+            const GradientPairT pgh_row(pgh[row_id].GetGrad(),
+                                        pgh[row_id].GetHess());
+
+            const BinIdxType* gr_index_local = gradient_index + icol_start;
+            uint32_t idx_bin = gr_index_local[fid];
+
+            hist_fast[idx_bin] += pgh_row;
+          }
+        }
+        for (int bin = 0 ; bin < n_bins_feature; ++bin) {
+          hist_local[bin + offsets[fid]] = hist_fast[bin];
+        }
+      }
     });
   });
+
+  GradientPairT* hist_data = hist->Data();
+  auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
+                               nbins, event_main);
   return event_save;
 }
 
@@ -203,6 +263,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
                             const RowSetCollection::Elem& row_indices,
                             const GHistIndexMatrix& gmat,
                             GHistRow<FPType, MemoryType::on_device>* hist,
+                            const tree::HistDispatcher<FPType>& dispatcher,
                             ::sycl::event event_priv) {
   const size_t size = row_indices.Size();
   const size_t* rid = row_indices.begin;
@@ -214,7 +275,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
   FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
   const size_t nbins = gmat.nbins;
 
-  constexpr size_t work_group_size = 32;
+  size_t work_group_size = dispatcher.work_group_size;
   const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);
 
   auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv);
@@ -260,34 +321,47 @@ ::sycl::event BuildHistDispatchKernel(
                 GHistRow<FPType, MemoryType::on_device>* hist,
                 bool isDense,
                 GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                const tree::DeviceProperties& device_prop,
                 ::sycl::event events_priv,
                 bool force_atomic_use) {
   const size_t size = row_indices.Size();
   const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
   const size_t nbins = gmat.nbins;
+  const size_t max_num_bins = gmat.max_num_bins;
+  const size_t min_num_bins = gmat.min_num_bins;
 
-  // TODO(razdoburdin): replace the add-hock dispatching criteria by more sutable one
-  bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns);
+  size_t max_n_blocks = hist_buffer->Size() / nbins;
+  auto dispatcher = tree::HistDispatcher<FPType>
+                       (device_prop, isDense, size, max_n_blocks, nbins,
+                        n_columns, max_num_bins, min_num_bins);
 
   // force_atomic_use flag is used only for testing
-  use_atomic = use_atomic || force_atomic_use;
+  bool use_atomic = dispatcher.use_atomics || force_atomic_use;
   if (!use_atomic) {
     if (isDense) {
-      return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
-                                                       gmat, hist, hist_buffer,
-                                                       events_priv);
+      if (dispatcher.use_local_hist) {
+        return BuildHistKernelLocal<FPType, BinIdxType>(qu, gpair, row_indices,
+                                                        gmat, hist, hist_buffer,
+                                                        dispatcher, events_priv);
+      } else {
+        return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
+                                                         gmat, hist, hist_buffer,
+                                                         dispatcher, events_priv);
+      }
     } else {
       return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
                                                       gmat, hist, hist_buffer,
-                                                      events_priv);
+                                                      dispatcher, events_priv);
     }
   } else {
     if (isDense) {
       return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
-                                                       gmat, hist, events_priv);
+                                                       gmat, hist,
+                                                       dispatcher, events_priv);
     } else {
       return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
-                                                      gmat, hist, events_priv);
+                                                      gmat, hist,
+                                                      dispatcher, events_priv);
     }
   }
 }
@@ -299,23 +373,27 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
                             const GHistIndexMatrix& gmat, const bool isDense,
                             GHistRow<FPType, MemoryType::on_device>* hist,
                             GHistRow<FPType, MemoryType::on_device>* hist_buffer,
+                            const tree::DeviceProperties& device_prop,
                             ::sycl::event event_priv,
                             bool force_atomic_use) {
   const bool is_dense = isDense;
   switch (gmat.index.GetBinTypeSize()) {
     case BinTypeSize::kUint8BinsTypeSize:
       return BuildHistDispatchKernel<FPType, uint8_t>(qu, gpair, row_indices,
                                                       gmat, hist, is_dense, hist_buffer,
+                                                      device_prop,
                                                       event_priv, force_atomic_use);
       break;
     case BinTypeSize::kUint16BinsTypeSize:
       return BuildHistDispatchKernel<FPType, uint16_t>(qu, gpair, row_indices,
                                                        gmat, hist, is_dense, hist_buffer,
+                                                       device_prop,
                                                        event_priv, force_atomic_use);
       break;
     case BinTypeSize::kUint32BinsTypeSize:
       return BuildHistDispatchKernel<FPType, uint32_t>(qu, gpair, row_indices,
                                                        gmat, hist, is_dense, hist_buffer,
+                                                       device_prop,
                                                        event_priv, force_atomic_use);
       break;
     default:
@@ -331,10 +409,12 @@ ::sycl::event GHistBuilder<GradientSumT>::BuildHist(
               GHistRowT<MemoryType::on_device>* hist,
               bool isDense,
               GHistRowT<MemoryType::on_device>* hist_buffer,
+              const tree::DeviceProperties& device_prop,
               ::sycl::event event_priv,
               bool force_atomic_use) {
   return BuildHistKernel<GradientSumT>(qu_, gpair, row_indices, gmat,
-                                       isDense, hist, hist_buffer, event_priv,
+                                       isDense, hist, hist_buffer,
+                                       device_prop, event_priv,
                                        force_atomic_use);
 }
 
@@ -346,6 +426,7 @@ ::sycl::event GHistBuilder<float>::BuildHist(
               GHistRow<float, MemoryType::on_device>* hist,
               bool isDense,
               GHistRow<float, MemoryType::on_device>* hist_buffer,
+              const tree::DeviceProperties& device_prop,
               ::sycl::event event_priv,
               bool force_atomic_use);
 template
@@ -356,6 +437,7 @@ ::sycl::event GHistBuilder<double>::BuildHist(
               GHistRow<double, MemoryType::on_device>* hist,
               bool isDense,
               GHistRow<double, MemoryType::on_device>* hist_buffer,
+              const tree::DeviceProperties& device_prop,
               ::sycl::event event_priv,
               bool force_atomic_use);
 

diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h
@@ -14,6 +14,7 @@
 
 #include "../../src/common/hist_util.h"
 #include "../data/gradient_index.h"
+#include "../tree/hist_dispatcher.h"
 
 #include <sycl/sycl.hpp>
 
@@ -123,7 +124,7 @@ class ParallelGHistBuilder {
   }
 
   void Reset(size_t nblocks) {
-    hist_device_buffer_.Resize(qu_, nblocks * nbins_ * 2);
+    hist_device_buffer_.Resize(qu_, nblocks * nbins_);
   }
 
   GHistRowT& GetDeviceBuffer() {
@@ -161,6 +162,7 @@ class GHistBuilder {
                           GHistRowT<MemoryType::on_device>* HistCollection,
                           bool isDense,
                           GHistRowT<MemoryType::on_device>* hist_buffer,
+                          const tree::DeviceProperties& device_prop,
                           ::sycl::event event,
                           bool force_atomic_use = false);
 

diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc
@@ -121,6 +121,14 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
   max_num_bins = max_bins;
   nbins = cut.Ptrs().back();
 
+  min_num_bins = nbins;
+  const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
+  for (unsigned fid = 0; fid < n_offsets; ++fid) {
+    auto ibegin = cut.cut_ptrs_.ConstHostVector()[fid];
+    auto iend = cut.cut_ptrs_.ConstHostVector()[fid + 1];
+    min_num_bins = std::min<size_t>(min_num_bins, iend - ibegin);
+  }
+
   hit_count.SetDevice(ctx->Device());
   hit_count.Resize(nbins, 0);
 
@@ -141,7 +149,7 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
     row_stride = nfeatures;
     n_rows = dmat->Info().num_row_;
   }
-  const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
+
   const size_t n_index = n_rows * row_stride;
   ResizeIndex(qu, n_index);
 

diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h
@@ -86,6 +86,7 @@ struct GHistIndexMatrix {
   /*! \brief The corresponding cuts */
   xgboost::common::HistogramCuts cut;
   size_t max_num_bins;
+  size_t min_num_bins;
   size_t nbins;
   size_t nfeatures;
   size_t row_stride;