Skip to content

Commit cd22e8f

Browse files
author
Dmitry Razdoburdin
committed
fix training continuation for iGPUs
1 parent ed470ad commit cd22e8f

File tree

4 files changed

+50
-87
lines changed

4 files changed

+50
-87
lines changed

plugin/sycl/data/gradient_index.cc

Lines changed: 37 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -50,28 +50,29 @@ void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) {
5050

5151
template <typename BinIdxType, bool isDense>
5252
void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
53+
Context const * ctx,
5354
BinIdxType* index_data,
54-
DMatrix *dmat,
55-
size_t nbins,
56-
size_t row_stride) {
55+
DMatrix *dmat) {
5756
if (nbins == 0) return;
5857
const bst_float* cut_values = cut.cut_values_.ConstDevicePointer();
5958
const uint32_t* cut_ptrs = cut.cut_ptrs_.ConstDevicePointer();
6059
size_t* hit_count_ptr = hit_count.DevicePointer();
6160

6261
BinIdxType* sort_data = reinterpret_cast<BinIdxType*>(sort_buff.Data());
6362

64-
::sycl::event event;
6563
for (auto &batch : dmat->GetBatches<SparsePage>()) {
66-
for (auto &batch : dmat->GetBatches<SparsePage>()) {
67-
const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer();
68-
const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer();
69-
size_t batch_size = batch.Size();
70-
if (batch_size > 0) {
71-
const auto base_rowid = batch.base_rowid;
72-
event = qu->submit([&](::sycl::handler& cgh) {
73-
cgh.depends_on(event);
74-
cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) {
64+
batch.data.SetDevice(ctx->Device());
65+
batch.offset.SetDevice(ctx->Device());
66+
67+
const xgboost::Entry *data_ptr = batch.data.ConstDevicePointer();
68+
const bst_idx_t *offset_vec = batch.offset.ConstDevicePointer();
69+
size_t batch_size = batch.Size();
70+
if (batch_size > 0) {
71+
const auto base_rowid = batch.base_rowid;
72+
size_t row_stride = this->row_stride;
73+
size_t nbins = this->nbins;
74+
qu->submit([&](::sycl::handler& cgh) {
75+
cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::item<1> pid) {
7576
const size_t i = pid.get_id(0);
7677
const size_t ibegin = offset_vec[i];
7778
const size_t iend = offset_vec[i + 1];
@@ -92,23 +93,22 @@ void GHistIndexMatrix::SetIndexData(::sycl::queue* qu,
9293
}
9394
});
9495
});
95-
}
96+
qu->wait();
9697
}
9798
}
98-
qu->wait();
9999
}
100100

101-
void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) {
102-
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense) {
101+
void GHistIndexMatrix::ResizeIndex(::sycl::queue* qu, size_t n_index) {
102+
if ((max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint8_t>::max())) && isDense_) {
103103
index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize);
104-
index.Resize((sizeof(uint8_t)) * n_index);
104+
index.Resize(qu, (sizeof(uint8_t)) * n_index);
105105
} else if ((max_num_bins - 1 > static_cast<int>(std::numeric_limits<uint8_t>::max()) &&
106-
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense) {
106+
max_num_bins - 1 <= static_cast<int>(std::numeric_limits<uint16_t>::max())) && isDense_) {
107107
index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize);
108-
index.Resize((sizeof(uint16_t)) * n_index);
108+
index.Resize(qu, (sizeof(uint16_t)) * n_index);
109109
} else {
110110
index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize);
111-
index.Resize((sizeof(uint32_t)) * n_index);
111+
index.Resize(qu, (sizeof(uint32_t)) * n_index);
112112
}
113113
}
114114

@@ -122,52 +122,50 @@ void GHistIndexMatrix::Init(::sycl::queue* qu,
122122
cut.SetDevice(ctx->Device());
123123

124124
max_num_bins = max_bins;
125-
const uint32_t nbins = cut.Ptrs().back();
126-
this->nbins = nbins;
125+
nbins = cut.Ptrs().back();
127126

128127
hit_count.SetDevice(ctx->Device());
129128
hit_count.Resize(nbins, 0);
130129

131-
this->p_fmat = dmat;
132130
const bool isDense = dmat->IsDense();
133131
this->isDense_ = isDense;
134132

135-
index.setQueue(qu);
136-
137133
row_stride = 0;
138134
size_t n_rows = 0;
139-
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
140-
const auto& row_offset = batch.offset.ConstHostVector();
141-
batch.data.SetDevice(ctx->Device());
142-
batch.offset.SetDevice(ctx->Device());
143-
n_rows += batch.Size();
144-
for (auto i = 1ull; i < row_offset.size(); i++) {
145-
row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
135+
if (!isDense) {
136+
for (const auto& batch : dmat->GetBatches<SparsePage>()) {
137+
const auto& row_offset = batch.offset.ConstHostVector();
138+
n_rows += batch.Size();
139+
for (auto i = 1ull; i < row_offset.size(); i++) {
140+
row_stride = std::max(row_stride, static_cast<size_t>(row_offset[i] - row_offset[i - 1]));
141+
}
146142
}
143+
} else {
144+
row_stride = nfeatures;
145+
n_rows = dmat->Info().num_row_;
147146
}
148147

149148
const size_t n_offsets = cut.cut_ptrs_.Size() - 1;
150149
const size_t n_index = n_rows * row_stride;
151-
ResizeIndex(n_index, isDense);
150+
ResizeIndex(qu, n_index);
152151

153152
CHECK_GT(cut.cut_values_.Size(), 0U);
154153

155154
if (isDense) {
156155
BinTypeSize curent_bin_size = index.GetBinTypeSize();
157156
if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) {
158-
SetIndexData<uint8_t, true>(qu, index.data<uint8_t>(), dmat, nbins, row_stride);
159-
157+
SetIndexData<uint8_t, true>(qu, ctx, index.data<uint8_t>(), dmat);
160158
} else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) {
161-
SetIndexData<uint16_t, true>(qu, index.data<uint16_t>(), dmat, nbins, row_stride);
159+
SetIndexData<uint16_t, true>(qu, ctx, index.data<uint16_t>(), dmat);
162160
} else {
163161
CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize);
164-
SetIndexData<uint32_t, true>(qu, index.data<uint32_t>(), dmat, nbins, row_stride);
162+
SetIndexData<uint32_t, true>(qu, ctx, index.data<uint32_t>(), dmat);
165163
}
166164
/* For sparse DMatrix we have to store index of feature for each bin
167165
in index field to chose right offset. So offset is nullptr and index is not reduced */
168166
} else {
169167
sort_buff.Resize(qu, n_rows * row_stride * sizeof(uint32_t));
170-
SetIndexData<uint32_t, false>(qu, index.data<uint32_t>(), dmat, nbins, row_stride);
168+
SetIndexData<uint32_t, false>(qu, ctx, index.data<uint32_t>(), dmat);
171169
}
172170
}
173171

plugin/sycl/data/gradient_index.h

Lines changed: 9 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -31,21 +31,9 @@ struct Index {
3131
Index& operator=(Index&& i) = delete;
3232
void SetBinTypeSize(BinTypeSize binTypeSize) {
3333
binTypeSize_ = binTypeSize;
34-
switch (binTypeSize) {
35-
case BinTypeSize::kUint8BinsTypeSize:
36-
func_ = &GetValueFromUint8;
37-
break;
38-
case BinTypeSize::kUint16BinsTypeSize:
39-
func_ = &GetValueFromUint16;
40-
break;
41-
case BinTypeSize::kUint32BinsTypeSize:
42-
func_ = &GetValueFromUint32;
43-
break;
44-
default:
45-
CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize ||
46-
binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
47-
binTypeSize == BinTypeSize::kUint32BinsTypeSize);
48-
}
34+
CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize ||
35+
binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
36+
binTypeSize == BinTypeSize::kUint32BinsTypeSize);
4937
}
5038
BinTypeSize GetBinTypeSize() const {
5139
return binTypeSize_;
@@ -65,8 +53,8 @@ struct Index {
6553
return data_.Size() / (binTypeSize_);
6654
}
6755

68-
void Resize(const size_t nBytesData) {
69-
data_.Resize(qu_, nBytesData);
56+
void Resize(::sycl::queue* qu, const size_t nBytesData) {
57+
data_.Resize(qu, nBytesData);
7058
}
7159

7260
uint8_t* begin() const {
@@ -77,28 +65,9 @@ struct Index {
7765
return data_.End();
7866
}
7967

80-
void setQueue(::sycl::queue* qu) {
81-
qu_ = qu;
82-
}
83-
8468
private:
85-
static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) {
86-
return reinterpret_cast<const uint8_t*>(t)[i];
87-
}
88-
static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) {
89-
return reinterpret_cast<const uint16_t*>(t)[i];
90-
}
91-
static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) {
92-
return reinterpret_cast<const uint32_t*>(t)[i];
93-
}
94-
95-
using Func = uint32_t (*)(const uint8_t*, size_t);
96-
9769
USMVector<uint8_t, MemoryType::on_device> data_;
9870
BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize};
99-
Func func_;
100-
101-
::sycl::queue* qu_;
10271
};
10372

10473
/*!
@@ -116,22 +85,19 @@ struct GHistIndexMatrix {
11685
USMVector<uint8_t, MemoryType::on_device> sort_buff;
11786
/*! \brief The corresponding cuts */
11887
xgboost::common::HistogramCuts cut;
119-
DMatrix* p_fmat;
12088
size_t max_num_bins;
12189
size_t nbins;
12290
size_t nfeatures;
12391
size_t row_stride;
12492

12593
// Create a global histogram matrix based on a given DMatrix device wrapper
126-
void Init(::sycl::queue* qu, Context const * ctx,
127-
DMatrix *dmat, int max_num_bins);
94+
void Init(::sycl::queue* qu, Context const * ctx, DMatrix *dmat, int max_num_bins);
12895

12996
template <typename BinIdxType, bool isDense>
130-
void SetIndexData(::sycl::queue* qu, BinIdxType* index_data,
131-
DMatrix *dmat,
132-
size_t nbins, size_t row_stride);
97+
void SetIndexData(::sycl::queue* qu, Context const * ctx, BinIdxType* index_data,
98+
DMatrix *dmat);
13399

134-
void ResizeIndex(size_t n_index, bool isDense);
100+
void ResizeIndex(::sycl::queue* qu, size_t n_index);
135101

136102
inline void GetFeatureCounts(size_t* counts) const {
137103
auto nfeature = cut.cut_ptrs_.Size() - 1;

plugin/sycl/predictor/predictor.cc

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ class Predictor : public xgboost::Predictor {
291291
}
292292

293293
if (num_group == 1) {
294-
float sum = 0.0;
294+
float& sum = out_predictions[row_idx];
295295
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
296296
const Node* first_node = nodes + first_node_position[tree_idx - tree_begin];
297297
if constexpr (any_missing) {
@@ -300,7 +300,6 @@ class Predictor : public xgboost::Predictor {
300300
sum += GetLeafWeight(first_node, fval_buff_row_ptr);
301301
}
302302
}
303-
out_predictions[row_idx] += sum;
304303
} else {
305304
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
306305
const Node* first_node = nodes + first_node_position[tree_idx - tree_begin];
@@ -333,7 +332,6 @@ class Predictor : public xgboost::Predictor {
333332
int num_features = dmat->Info().num_col_;
334333

335334
float* out_predictions = out_preds->DevicePointer();
336-
::sycl::event event;
337335
for (auto &batch : dmat->GetBatches<SparsePage>()) {
338336
batch.data.SetDevice(ctx_->Device());
339337
batch.offset.SetDevice(ctx_->Device());
@@ -343,6 +341,7 @@ class Predictor : public xgboost::Predictor {
343341
if (batch_size > 0) {
344342
const auto base_rowid = batch.base_rowid;
345343

344+
::sycl::event event;
346345
if (needs_buffer_update) {
347346
fval_buff.ResizeNoCopy(qu_, num_features * batch_size);
348347
if constexpr (any_missing) {
@@ -354,9 +353,9 @@ class Predictor : public xgboost::Predictor {
354353
row_ptr, batch_size, num_features,
355354
num_group, tree_begin, tree_end);
356355
needs_buffer_update = (batch_size != out_preds->Size());
356+
qu_->wait();
357357
}
358358
}
359-
qu_->wait();
360359
}
361360

362361
mutable USMVector<float, MemoryType::on_device> fval_buff;

tests/cpp/plugin/test_sycl_partition_builder.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ void TestPartitioning(float sparsity, int max_bins) {
6767

6868
std::vector<uint8_t> ridx_left(num_rows, 0);
6969
std::vector<uint8_t> ridx_right(num_rows, 0);
70-
for (auto &batch : gmat.p_fmat->GetBatches<SparsePage>()) {
70+
for (auto &batch : p_fmat->GetBatches<SparsePage>()) {
7171
const auto& data_vec = batch.data.HostVector();
7272
const auto& offset_vec = batch.offset.HostVector();
7373

0 commit comments

Comments
 (0)