Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ops/conda_env/linux_sycl_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,6 @@ dependencies:
- pytest-cov
- dask=2024.11
- ninja
- dpcpp_linux-64
- dpcpp_linux-64>=2024.2.1
- onedpl-devel
- intel-openmp
176 changes: 129 additions & 47 deletions plugin/sycl/common/hist_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <algorithm>

#include "../data/gradient_index.h"
#include "../tree/hist_dispatcher.h"
#include "hist_util.h"

#include <sycl/sycl.hpp>
Expand Down Expand Up @@ -91,28 +92,27 @@ template ::sycl::event SubtractionHist(::sycl::queue* qu,
const GHistRow<double, MemoryType::on_device>& src2,
size_t size, ::sycl::event event_priv);

inline auto GetBlocksParameters(::sycl::queue* qu, size_t size, size_t max_nblocks) {
struct _ {
size_t block_size, nblocks;
};
template <typename GradientPairT>
::sycl::event ReduceHist(::sycl::queue* qu, GradientPairT* hist_data,
GradientPairT* hist_buffer_data,
size_t nblocks, size_t nbins,
const ::sycl::event& event_main) {
auto event_save = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_main);
cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
size_t idx_bin = pid.get_id(0);

const size_t min_block_size = 32;
const size_t max_compute_units =
qu->get_device().get_info<::sycl::info::device::max_compute_units>();
GradientPairT gpair = {0, 0};

size_t nblocks = max_compute_units;
for (size_t j = 0; j < nblocks; ++j) {
gpair += hist_buffer_data[j * nbins + idx_bin];
}

size_t block_size = size / nblocks + !!(size % nblocks);
if (block_size > (1u << 12)) {
nblocks = max_nblocks;
block_size = size / nblocks + !!(size % nblocks);
}
if (block_size < min_block_size) {
block_size = min_block_size;
nblocks = size / block_size + !!(size % block_size);
}
hist_data[idx_bin] = gpair;
});
});

return _{block_size, nblocks};
return event_save;
}

// Kernel with buffer using
Expand All @@ -123,6 +123,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
const GHistIndexMatrix& gmat,
GHistRow<FPType, MemoryType::on_device>* hist,
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
const tree::HistDispatcher<FPType>& dispatcher,
::sycl::event event_priv) {
using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
const size_t size = row_indices.Size();
Expand All @@ -133,18 +134,13 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
const size_t nbins = gmat.nbins;

const size_t max_work_group_size =
qu->get_device().get_info<::sycl::info::device::max_work_group_size>();
const size_t work_group_size = n_columns < max_work_group_size ? n_columns : max_work_group_size;

// Captured structured bindings are a C++20 extension
const auto block_params = GetBlocksParameters(qu, size, hist_buffer->Size() / (nbins * 2));
const size_t block_size = block_params.block_size;
const size_t nblocks = block_params.nblocks;
const size_t work_group_size = dispatcher.work_group_size;
const size_t block_size = dispatcher.block.size;
const size_t nblocks = dispatcher.block.nblocks;

GradientPairT* hist_buffer_data = hist_buffer->Data();
auto event_fill = qu->fill(hist_buffer_data, GradientPairT(0, 0),
nblocks * nbins * 2, event_priv);
nblocks * nbins, event_priv);
auto event_main = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_fill);
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
Expand Down Expand Up @@ -179,20 +175,84 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
});

GradientPairT* hist_data = hist->Data();
auto event_save = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_main);
cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) {
size_t idx_bin = pid.get_id(0);
auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
nbins, event_main);

GradientPairT gpair = {0, 0};
return event_save;
}

for (size_t j = 0; j < nblocks; ++j) {
gpair += hist_buffer_data[j * nbins + idx_bin];
}
// Kernel with buffer and local hist using
template<typename FPType, typename BinIdxType>
::sycl::event BuildHistKernelLocal(::sycl::queue* qu,
const HostDeviceVector<GradientPair>& gpair,
const RowSetCollection::Elem& row_indices,
const GHistIndexMatrix& gmat,
GHistRow<FPType, MemoryType::on_device>* hist,
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
const tree::HistDispatcher<FPType>& dispatcher,
::sycl::event event_priv) {
constexpr int kMaxNumBins = tree::HistDispatcher<FPType>::KMaxNumBins;
using GradientPairT = xgboost::detail::GradientPairInternal<FPType>;
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const size_t n_columns = gmat.nfeatures;
const auto* pgh = gpair.ConstDevicePointer();
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
const uint32_t* offsets = gmat.cut.cut_ptrs_.ConstDevicePointer();
const size_t nbins = gmat.nbins;

hist_data[idx_bin] = gpair;
const size_t work_group_size = dispatcher.work_group_size;
const size_t block_size = dispatcher.block.size;
const size_t nblocks = dispatcher.block.nblocks;

GradientPairT* hist_buffer_data = hist_buffer->Data();

auto event_main = qu->submit([&](::sycl::handler& cgh) {
cgh.depends_on(event_priv);
cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size),
::sycl::range<2>(1, work_group_size)),
[=](::sycl::nd_item<2> pid) {
size_t block = pid.get_global_id(0);
size_t feat = pid.get_global_id(1);

// This buffer will be keeped in L1/registers
GradientPairT hist_fast[kMaxNumBins];

GradientPairT* hist_local = hist_buffer_data + block * nbins;
for (size_t fid = feat; fid < n_columns; fid += work_group_size) {
size_t n_bins_feature = offsets[fid+1] - offsets[fid];

// Not all elements of hist_fast are actually used: n_bins_feature <= kMaxNumBins
// We initililize only the requared elements to prevent the unused go to cache.
for (int bin = 0; bin < n_bins_feature; ++bin) {
hist_fast[bin] = {0, 0};
}

for (size_t idx = 0; idx < block_size; ++idx) {
size_t i = block * block_size + idx;
if (i < size) {
size_t row_id = rid[i];

const size_t icol_start = n_columns * row_id;
const GradientPairT pgh_row(pgh[row_id].GetGrad(),
pgh[row_id].GetHess());

const BinIdxType* gr_index_local = gradient_index + icol_start;
uint32_t idx_bin = gr_index_local[fid];

hist_fast[idx_bin] += pgh_row;
}
}
for (int bin = 0 ; bin < n_bins_feature; ++bin) {
hist_local[bin + offsets[fid]] = hist_fast[bin];
}
}
});
});

GradientPairT* hist_data = hist->Data();
auto event_save = ReduceHist(qu, hist_data, hist_buffer_data, nblocks,
nbins, event_main);
return event_save;
}

Expand All @@ -203,6 +263,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
const RowSetCollection::Elem& row_indices,
const GHistIndexMatrix& gmat,
GHistRow<FPType, MemoryType::on_device>* hist,
const tree::HistDispatcher<FPType>& dispatcher,
::sycl::event event_priv) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
Expand All @@ -214,7 +275,7 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
FPType* hist_data = reinterpret_cast<FPType*>(hist->Data());
const size_t nbins = gmat.nbins;

constexpr size_t work_group_size = 32;
size_t work_group_size = dispatcher.work_group_size;
const size_t n_work_groups = n_columns / work_group_size + (n_columns % work_group_size > 0);

auto event_fill = qu->fill(hist_data, FPType(0), nbins * 2, event_priv);
Expand Down Expand Up @@ -260,34 +321,47 @@ ::sycl::event BuildHistDispatchKernel(
GHistRow<FPType, MemoryType::on_device>* hist,
bool isDense,
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
const tree::DeviceProperties& device_prop,
::sycl::event events_priv,
bool force_atomic_use) {
const size_t size = row_indices.Size();
const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride;
const size_t nbins = gmat.nbins;
const size_t max_num_bins = gmat.max_num_bins;
const size_t min_num_bins = gmat.min_num_bins;

// TODO(razdoburdin): replace the add-hock dispatching criteria by more sutable one
bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns);
size_t max_n_blocks = hist_buffer->Size() / nbins;
auto dispatcher = tree::HistDispatcher<FPType>
(device_prop, isDense, size, max_n_blocks, nbins,
n_columns, max_num_bins, min_num_bins);

// force_atomic_use flag is used only for testing
use_atomic = use_atomic || force_atomic_use;
bool use_atomic = dispatcher.use_atomics || force_atomic_use;
if (!use_atomic) {
if (isDense) {
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
gmat, hist, hist_buffer,
events_priv);
if (dispatcher.use_local_hist) {
return BuildHistKernelLocal<FPType, BinIdxType>(qu, gpair, row_indices,
gmat, hist, hist_buffer,
dispatcher, events_priv);
} else {
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
gmat, hist, hist_buffer,
dispatcher, events_priv);
}
} else {
return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
gmat, hist, hist_buffer,
events_priv);
dispatcher, events_priv);
}
} else {
if (isDense) {
return BuildHistKernel<FPType, BinIdxType, true>(qu, gpair, row_indices,
gmat, hist, events_priv);
gmat, hist,
dispatcher, events_priv);
} else {
return BuildHistKernel<FPType, uint32_t, false>(qu, gpair, row_indices,
gmat, hist, events_priv);
gmat, hist,
dispatcher, events_priv);
}
}
}
Expand All @@ -299,23 +373,27 @@ ::sycl::event BuildHistKernel(::sycl::queue* qu,
const GHistIndexMatrix& gmat, const bool isDense,
GHistRow<FPType, MemoryType::on_device>* hist,
GHistRow<FPType, MemoryType::on_device>* hist_buffer,
const tree::DeviceProperties& device_prop,
::sycl::event event_priv,
bool force_atomic_use) {
const bool is_dense = isDense;
switch (gmat.index.GetBinTypeSize()) {
case BinTypeSize::kUint8BinsTypeSize:
return BuildHistDispatchKernel<FPType, uint8_t>(qu, gpair, row_indices,
gmat, hist, is_dense, hist_buffer,
device_prop,
event_priv, force_atomic_use);
break;
case BinTypeSize::kUint16BinsTypeSize:
return BuildHistDispatchKernel<FPType, uint16_t>(qu, gpair, row_indices,
gmat, hist, is_dense, hist_buffer,
device_prop,
event_priv, force_atomic_use);
break;
case BinTypeSize::kUint32BinsTypeSize:
return BuildHistDispatchKernel<FPType, uint32_t>(qu, gpair, row_indices,
gmat, hist, is_dense, hist_buffer,
device_prop,
event_priv, force_atomic_use);
break;
default:
Expand All @@ -331,10 +409,12 @@ ::sycl::event GHistBuilder<GradientSumT>::BuildHist(
GHistRowT<MemoryType::on_device>* hist,
bool isDense,
GHistRowT<MemoryType::on_device>* hist_buffer,
const tree::DeviceProperties& device_prop,
::sycl::event event_priv,
bool force_atomic_use) {
return BuildHistKernel<GradientSumT>(qu_, gpair, row_indices, gmat,
isDense, hist, hist_buffer, event_priv,
isDense, hist, hist_buffer,
device_prop, event_priv,
force_atomic_use);
}

Expand All @@ -346,6 +426,7 @@ ::sycl::event GHistBuilder<float>::BuildHist(
GHistRow<float, MemoryType::on_device>* hist,
bool isDense,
GHistRow<float, MemoryType::on_device>* hist_buffer,
const tree::DeviceProperties& device_prop,
::sycl::event event_priv,
bool force_atomic_use);
template
Expand All @@ -356,6 +437,7 @@ ::sycl::event GHistBuilder<double>::BuildHist(
GHistRow<double, MemoryType::on_device>* hist,
bool isDense,
GHistRow<double, MemoryType::on_device>* hist_buffer,
const tree::DeviceProperties& device_prop,
::sycl::event event_priv,
bool force_atomic_use);

Expand Down
4 changes: 3 additions & 1 deletion plugin/sycl/common/hist_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

#include "../../src/common/hist_util.h"
#include "../data/gradient_index.h"
#include "../tree/hist_dispatcher.h"

#include <sycl/sycl.hpp>

Expand Down Expand Up @@ -123,7 +124,7 @@ class ParallelGHistBuilder {
}

void Reset(size_t nblocks) {
hist_device_buffer_.Resize(qu_, nblocks * nbins_ * 2);
hist_device_buffer_.Resize(qu_, nblocks * nbins_);
}

GHistRowT& GetDeviceBuffer() {
Expand Down Expand Up @@ -161,6 +162,7 @@ class GHistBuilder {
GHistRowT<MemoryType::on_device>* HistCollection,
bool isDense,
GHistRowT<MemoryType::on_device>* hist_buffer,
const tree::DeviceProperties& device_prop,
::sycl::event event,
bool force_atomic_use = false);

Expand Down
10 changes: 9 additions & 1 deletion plugin/sycl/data/gradient_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,14 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
max_num_bins = max_bins;
nbins = cut.Ptrs().back();

min_num_bins = nbins;
const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
for (unsigned fid = 0; fid < n_offsets; ++fid) {
auto ibegin = cut.cut_ptrs_.ConstHostVector()[fid];
auto iend = cut.cut_ptrs_.ConstHostVector()[fid + 1];
min_num_bins = std::min<size_t>(min_num_bins, iend - ibegin);
}

hit_count.SetDevice(ctx->Device());
hit_count.Resize(nbins, 0);

Expand All @@ -141,7 +149,7 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
row_stride = nfeatures;
n_rows = dmat->Info().num_row_;
}
const size_t n_offsets = cut.cut_ptrs_.Size() - 1;

const size_t n_index = n_rows * row_stride;
ResizeIndex(qu, n_index);

Expand Down
1 change: 1 addition & 0 deletions plugin/sycl/data/gradient_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ struct GHistIndexMatrix {
/*! \brief The corresponding cuts */
xgboost::common::HistogramCuts cut;
size_t max_num_bins;
size_t min_num_bins;
size_t nbins;
size_t nfeatures;
size_t row_stride;
Expand Down
Loading
Loading