Commit 470573a

Extract the stream, event, and stream view. (#11706)
1 parent a3a5786 commit 470573a

31 files changed with 276 additions and 240 deletions
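
Across the files below, this commit moves the CUDA stream and event wrappers out of device_helpers.cuh into a dedicated header, src/common/cuda_stream.h, and renames them: dh::CUDAStream becomes curt::Stream, dh::CUDAStreamView becomes curt::StreamRef, dh::CUDAEvent becomes curt::Event, and dh::DefaultStream() becomes curt::DefaultStream(). The sketch below is an assumed, minimal shape for that header, inferred only from the call sites visible in the hunks (Record, Wait, Sync, View, DefaultStream); the actual XGBoost implementation differs in details such as error checking and extra utilities.

// Sketch only: an assumed shape for the extracted cuda_stream.h, reconstructed from
// usage in this commit; not the actual XGBoost implementation.
#pragma once
#include <cuda_runtime_api.h>

namespace curt {
// Non-owning stream handle (replaces dh::CUDAStreamView).
class StreamRef {
  cudaStream_t stream_{nullptr};

 public:
  StreamRef() = default;
  explicit StreamRef(cudaStream_t s) : stream_{s} {}
  // Queue a wait so that later work on this stream starts only after `e` has fired.
  void Wait(cudaEvent_t e) const { cudaStreamWaitEvent(stream_, e, 0); }
  // Sync(false) returns the status instead of aborting, as used by NCCLComm::Block().
  cudaError_t Sync(bool abort_on_error = true) const {
    (void)abort_on_error;  // the real implementation asserts when this is true
    return cudaStreamSynchronize(stream_);
  }
  operator cudaStream_t() const { return stream_; }  // pass directly to CUB/NCCL calls
};

// Owning RAII event (replaces dh::CUDAEvent).
class Event {
  cudaEvent_t event_{nullptr};

 public:
  Event() { cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); }
  ~Event() { if (event_) { cudaEventDestroy(event_); } }
  Event(Event const&) = delete;
  Event& operator=(Event const&) = delete;
  void Record(StreamRef stream) { cudaEventRecord(event_, stream); }
  operator cudaEvent_t() const { return event_; }
};

// Owning RAII stream (replaces dh::CUDAStream); hands out non-owning views.
class Stream {
  cudaStream_t stream_{nullptr};

 public:
  Stream() { cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking); }
  ~Stream() { if (stream_) { cudaStreamDestroy(stream_); } }
  Stream(Stream const&) = delete;
  Stream& operator=(Stream const&) = delete;
  StreamRef View() const { return StreamRef{stream_}; }
};

// View of the default stream (legacy or per-thread, depending on build flags).
inline StreamRef DefaultStream() { return StreamRef{cudaStreamLegacy}; }
}  // namespace curt

In the diffs that follow, NCCLColl owns a curt::Stream and hands stream_.View() to helpers such as AsyncLaunch, which only need the non-owning curt::StreamRef.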

plugin/federated/federated_comm.cuh

Lines changed: 3 additions & 3 deletions
@@ -1,18 +1,18 @@
 /**
- * Copyright 2023-2024, XGBoost Contributors
+ * Copyright 2023-2025, XGBoost Contributors
  */
 #pragma once

 #include <memory>  // for shared_ptr

 #include "../../src/collective/coll.h"           // for Coll
-#include "../../src/common/device_helpers.cuh"   // for CUDAStreamView
+#include "../../src/common/cuda_stream.h"        // for StreamRef
 #include "federated_comm.h"                      // for FederatedComm
 #include "xgboost/context.h"                     // for Context

 namespace xgboost::collective {
 class CUDAFederatedComm : public FederatedComm {
-  dh::CUDAStreamView stream_;
+  curt::StreamRef stream_;

  public:
   explicit CUDAFederatedComm(Context const* ctx, std::shared_ptr<FederatedComm const> impl);

src/collective/coll.cu

Lines changed: 14 additions & 13 deletions
@@ -13,7 +13,8 @@
 #include <type_traits>  // for invoke_result_t, is_same_v, enable_if_t
 #include <utility>      // for move

-#include "../common/device_helpers.cuh"  // for CUDAStreamView, CUDAEvent, device_vector
+#include "../common/cuda_stream.h"       // for StreamRef, Event
+#include "../common/device_helpers.cuh"  // for device_vector
 #include "../common/threadpool.h"        // for ThreadPool
 #include "../common/utils.h"             // for MakeCleanup
 #include "../data/array_interface.h"     // for ArrayInterfaceHandler
@@ -87,16 +88,16 @@ struct Chan {
 };
 }  // namespace

-template <typename Fn, typename R = std::invoke_result_t<Fn, dh::CUDAStreamView>>
+template <typename Fn, typename R = std::invoke_result_t<Fn, curt::StreamRef>>
 [[nodiscard]] std::enable_if_t<std::is_same_v<R, Result>, Result> AsyncLaunch(
     common::ThreadPool* pool, NCCLComm const* nccl, std::shared_ptr<NcclStub> stub,
-    dh::CUDAStreamView stream, Fn&& fn) {
-  dh::CUDAEvent e0;
+    curt::StreamRef stream, Fn&& fn) {
+  curt::Event e0;
   e0.Record(nccl->Stream());
   stream.Wait(e0);

   auto cleanup = common::MakeCleanup([&] {
-    dh::CUDAEvent e1;
+    curt::Event e1;
     e1.Record(stream);
     nccl->Stream().Wait(e1);
   });
@@ -180,7 +181,7 @@ bool IsBitwiseOp(Op const& op) {
 }

 template <typename Func>
-void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span<std::int8_t> out_buffer,
+void RunBitwiseAllreduce(curt::StreamRef stream, common::Span<std::int8_t> out_buffer,
                          std::int8_t const* device_buffer, Func func, std::int32_t world_size,
                          std::size_t size) {
   dh::LaunchN(size, stream, [=] __device__(std::size_t idx) {
@@ -194,13 +195,13 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span<std::int8_t> ou

 [[nodiscard]] Result BitwiseAllReduce(common::ThreadPool* pool, NCCLComm const* pcomm,
                                       common::Span<std::int8_t> data, Op op,
-                                      dh::CUDAStreamView stream) {
+                                      curt::StreamRef stream) {
   dh::device_vector<std::int8_t> buffer(data.size() * pcomm->World());
   auto* device_buffer = buffer.data().get();
   auto stub = pcomm->Stub();

   // First gather data from all the workers.
-  auto rc = AsyncLaunch(pool, pcomm, stub, stream, [&](dh::CUDAStreamView s) {
+  auto rc = AsyncLaunch(pool, pcomm, stub, stream, [&](curt::StreamRef s) {
     return stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8, pcomm->Handle(), s);
   });
   if (!rc.OK()) {
@@ -263,7 +264,7 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
         using T = decltype(t);
         auto rdata = common::RestoreType<T>(data);
         return AsyncLaunch(
-            &this->pool_, nccl, stub, this->stream_.View(), [&](dh::CUDAStreamView s) {
+            &this->pool_, nccl, stub, this->stream_.View(), [&](curt::StreamRef s) {
              return stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
                                     GetNCCLRedOp(op), nccl->Handle(), s);
            });
@@ -285,7 +286,7 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {

   return Success() << [&] {
     return AsyncLaunch(&this->pool_, nccl, stub, this->stream_.View(),
-                       [data, nccl, root, stub](dh::CUDAStreamView s) {
+                       [data, nccl, root, stub](curt::StreamRef s) {
                          return stub->Broadcast(data.data(), data.data(), data.size_bytes(),
                                                 ncclInt8, root, nccl->Handle(), s);
                        });
@@ -306,7 +307,7 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
   auto send = data.subspan(comm.Rank() * size, size);
   return Success() << [&] {
     return AsyncLaunch(&this->pool_, nccl, stub, this->stream_.View(),
-                       [send, data, size, nccl, stub](dh::CUDAStreamView s) {
+                       [send, data, size, nccl, stub](curt::StreamRef s) {
                          return stub->Allgather(send.data(), data.data(), size, ncclInt8,
                                                 nccl->Handle(), s);
                        });
@@ -321,7 +322,7 @@ namespace cuda_impl {
  *
  * https://arxiv.org/abs/1812.05964
  */
-Result BroadcastAllgatherV(NCCLComm const* comm, dh::CUDAStreamView s,
+Result BroadcastAllgatherV(NCCLComm const* comm, curt::StreamRef s,
                            common::Span<std::int8_t const> data,
                            common::Span<std::int64_t const> sizes, common::Span<std::int8_t> recv) {
   auto stub = comm->Stub();
@@ -379,7 +380,7 @@ Result BroadcastAllgatherV(NCCLComm const* comm, dh::CUDAStreamView s,
       };
     }
     case AllgatherVAlgo::kBcast: {
-      return AsyncLaunch(&this->pool_, nccl, stub, this->stream_.View(), [&](dh::CUDAStreamView s) {
+      return AsyncLaunch(&this->pool_, nccl, stub, this->stream_.View(), [&](curt::StreamRef s) {
         return cuda_impl::BroadcastAllgatherV(nccl, s, data, sizes, recv);
       });
     }
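
The AsyncLaunch change above keeps the same cross-stream ordering pattern with the renamed types: record an event on the communicator's stream, make the worker stream wait on it before the NCCL call, then mirror the handshake during cleanup so the communicator's stream waits for the worker stream. A self-contained CUDA sketch of that handshake (illustrative only, not XGBoost code; the stream and event names are made up):

#include <cuda_runtime_api.h>

int main() {
  cudaStream_t comm_stream, work_stream;
  cudaStreamCreate(&comm_stream);
  cudaStreamCreate(&work_stream);

  cudaEvent_t e0, e1;
  cudaEventCreateWithFlags(&e0, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&e1, cudaEventDisableTiming);

  // e0: work_stream must not start until everything already queued on comm_stream is done.
  cudaEventRecord(e0, comm_stream);
  cudaStreamWaitEvent(work_stream, e0, 0);

  // ... enqueue collective work on work_stream here ...

  // e1: comm_stream must not continue until the work on work_stream is done.
  cudaEventRecord(e1, work_stream);
  cudaStreamWaitEvent(comm_stream, e1, 0);

  cudaStreamSynchronize(comm_stream);
  cudaEventDestroy(e0);
  cudaEventDestroy(e1);
  cudaStreamDestroy(comm_stream);
  cudaStreamDestroy(work_stream);
  return 0;
}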

src/collective/coll.cuh

Lines changed: 8 additions & 8 deletions
@@ -1,21 +1,21 @@
 /**
- * Copyright 2023-2024, XGBoost Contributors
+ * Copyright 2023-2025, XGBoost Contributors
  */
 #pragma once

 #include <cstdint>  // for int8_t, int64_t

-#include "../common/device_helpers.cuh"  // for CUDAStream
-#include "../common/threadpool.h"        // for ThreadPool
-#include "../data/array_interface.h"     // for ArrayInterfaceHandler
-#include "coll.h"                        // for Coll
-#include "comm.h"                        // for Comm
-#include "xgboost/span.h"                // for Span
+#include "../common/cuda_stream.h"    // for Stream
+#include "../common/threadpool.h"     // for ThreadPool
+#include "../data/array_interface.h"  // for ArrayInterfaceHandler
+#include "coll.h"                     // for Coll
+#include "comm.h"                     // for Comm
+#include "xgboost/span.h"             // for Span

 namespace xgboost::collective {
 class NCCLColl : public Coll {
   common::ThreadPool pool_;
-  dh::CUDAStream stream_;
+  curt::Stream stream_;

  public:
   NCCLColl();

src/collective/comm.cu

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023-2024, XGBoost Contributors
+ * Copyright 2023-2025, XGBoost Contributors
  */
 #if defined(XGBOOST_USE_NCCL)
 #include <algorithm>  // for sort
@@ -113,7 +113,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p

   for (std::int32_t r = 0; r < root.World(); ++r) {
     this->channels_.emplace_back(
-        std::make_shared<NCCLChannel>(root, r, nccl_comm_, stub_, dh::DefaultStream()));
+        std::make_shared<NCCLChannel>(root, r, nccl_comm_, stub_, curt::DefaultStream()));
   }
 }

src/collective/comm.cuh

Lines changed: 7 additions & 7 deletions
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023, XGBoost Contributors
+ * Copyright 2023-2025, XGBoost Contributors
  */
 #pragma once

@@ -9,7 +9,7 @@

 #include <utility>  // for move

-#include "../common/device_helpers.cuh"
+#include "../common/cuda_stream.h"  // for StreamRef
 #include "coll.h"
 #include "comm.h"
 #include "nccl_stub.h"  // for NcclStub
@@ -30,7 +30,7 @@ class NCCLComm : public Comm {
   ncclComm_t nccl_comm_{nullptr};
   std::shared_ptr<NcclStub> stub_;
   ncclUniqueId nccl_unique_id_{};
-  dh::CUDAStreamView stream_;
+  curt::StreamRef stream_;
   std::string nccl_path_;

  public:
@@ -45,7 +45,7 @@ class NCCLComm : public Comm {
   }
   ~NCCLComm() override;
   [[nodiscard]] bool IsFederated() const override { return false; }
-  [[nodiscard]] dh::CUDAStreamView Stream() const { return stream_; }
+  [[nodiscard]] curt::StreamRef Stream() const { return stream_; }
   [[nodiscard]] Result Block() const override {
     auto rc = this->Stream().Sync(false);
     return GetCUDAResult(rc);
@@ -60,16 +60,16 @@ class NCCLChannel : public Channel {
   std::int32_t rank_{-1};
   ncclComm_t nccl_comm_{};
   std::shared_ptr<NcclStub> stub_;
-  dh::CUDAStreamView stream_;
+  curt::StreamRef stream_;

  public:
   explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm,
-                       std::shared_ptr<NcclStub> stub, dh::CUDAStreamView stream)
+                       std::shared_ptr<NcclStub> stub, curt::StreamRef stream)
       : rank_{rank},
         nccl_comm_{nccl_comm},
         stub_{std::move(stub)},
         Channel{comm, nullptr},
-        stream_{stream} {}
+        stream_{std::move(stream)} {}

   [[nodiscard]] Result SendAll(std::int8_t const* ptr, std::size_t n) override {
     return stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_);

src/common/algorithm.cuh

Lines changed: 45 additions & 37 deletions
@@ -4,10 +4,10 @@
 #ifndef XGBOOST_COMMON_ALGORITHM_CUH_
 #define XGBOOST_COMMON_ALGORITHM_CUH_

-#include <thrust/copy.h>                        // for copy
-#include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
-#include <thrust/sort.h>                        // for stable_sort_by_key
-#include <thrust/tuple.h>                       // for tuple, get
+#include <thrust/copy.h>                         // for copy
+#include <thrust/iterator/counting_iterator.h>   // for make_counting_iterator
+#include <thrust/sort.h>                         // for stable_sort_by_key
+#include <thrust/tuple.h>                        // for tuple, get

 #include <cstddef>  // size_t
 #include <cstdint>  // int32_t
@@ -18,23 +18,23 @@

 #include "common.h"            // safe_cuda
 #include "cuda_context.cuh"    // CUDAContext
+#include "cuda_stream.h"       // for StreamRef
 #include "device_helpers.cuh"  // TemporaryArray,SegmentId,LaunchN,Iota
 #include "device_vector.cuh"   // for device_vector
 #include "xgboost/base.h"      // XGBOOST_DEVICE
 #include "xgboost/context.h"   // Context
-#include "xgboost/linalg.h"    // for VectorView
 #include "xgboost/logging.h"   // CHECK
 #include "xgboost/span.h"      // Span,byte

 namespace xgboost::common {
 namespace detail {

 #if CUB_VERSION >= 300000
-constexpr auto kCubSortOrderAscending = cub::SortOrder::Ascending;
-constexpr auto kCubSortOrderDescending = cub::SortOrder::Descending;
+constexpr auto kCubSortOrderAscending = cub::SortOrder::Ascending;
+constexpr auto kCubSortOrderDescending = cub::SortOrder::Descending;
 #else
-constexpr bool kCubSortOrderAscending = false;
-constexpr bool kCubSortOrderDescending = true;
+constexpr bool kCubSortOrderAscending = false;
+constexpr bool kCubSortOrderDescending = true;
 #endif

 // Wrapper around cub sort to define is_decending
@@ -70,7 +70,7 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage,
                                   const ValueT *d_values_in, ValueT *d_values_out,
                                   std::size_t num_items, std::size_t num_segments,
                                   BeginOffsetIteratorT d_begin_offsets,
-                                  EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream,
+                                  EndOffsetIteratorT d_end_offsets, curt::StreamRef stream,
                                   int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) {
   cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(d_keys_in), d_keys_out);
   cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(d_values_in), d_values_out);
@@ -198,7 +198,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
     if (thrust::get<0>(l) != thrust::get<0>(r)) {
       return thrust::get<0>(l) < thrust::get<0>(r);  // segment index
     }
-    return thrust::get<1>(l) < thrust::get<1>(r); // residue
+    return thrust::get<1>(l) < thrust::get<1>(r);  // residue
   });
 }

@@ -224,46 +224,54 @@ void ArgSort(Context const *ctx, Span<U> keys, Span<IdxT> sorted_idx) {
   if (accending) {
     void *d_temp_storage = nullptr;
 #if THRUST_MAJOR_VERSION >= 2
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        cuctx->Stream())));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            cuctx->Stream())));
 #else
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        nullptr, false)));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            nullptr, false)));
 #endif
     dh::TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
 #if THRUST_MAJOR_VERSION >= 2
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        cuctx->Stream())));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            cuctx->Stream())));
 #else
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        nullptr, false)));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderAscending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            nullptr, false)));
 #endif
   } else {
     void *d_temp_storage = nullptr;
 #if THRUST_MAJOR_VERSION >= 2
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        cuctx->Stream())));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            cuctx->Stream())));
 #else
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        nullptr, false)));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            nullptr, false)));
 #endif
     dh::TemporaryArray<char> storage(bytes);
     d_temp_storage = storage.data().get();
 #if THRUST_MAJOR_VERSION >= 2
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        cuctx->Stream())));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            cuctx->Stream())));
 #else
-    dh::safe_cuda((cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
-        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
-        nullptr, false)));
+    dh::safe_cuda(
+        (cub::DispatchRadixSort<detail::kCubSortOrderDescending, KeyT, ValueT, OffsetT>::Dispatch(
+            d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
+            nullptr, false)));
 #endif
   }

@@ -330,15 +338,15 @@ void InclusiveSum(Context const *ctx, InputIteratorT d_in, OutputIteratorT d_out
 }

 template <typename... Args>
-void RunLengthEncode(dh::CUDAStreamView stream, Args &&...args) {
+void RunLengthEncode(curt::StreamRef stream, Args &&...args) {
   std::size_t n_bytes = 0;
   dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(nullptr, n_bytes, args..., stream));
   dh::CachingDeviceUVector<char> tmp(n_bytes);
   dh::safe_cuda(cub::DeviceRunLengthEncode::Encode(tmp.data(), n_bytes, args..., stream));
 }

 template <typename... Args>
-void SegmentedSum(dh::CUDAStreamView stream, Args &&...args) {
+void SegmentedSum(curt::StreamRef stream, Args &&...args) {
   std::size_t n_bytes = 0;
   dh::safe_cuda(cub::DeviceSegmentedReduce::Sum(nullptr, n_bytes, args..., stream));
   dh::CachingDeviceUVector<char> tmp(n_bytes);
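
RunLengthEncode and SegmentedSum above wrap CUB's two-pass calling convention: a first call with a null temporary buffer only reports the required scratch size, and a second call with the allocated buffer does the work. A standalone sketch of that pattern (illustrative names, not XGBoost code):

#include <cub/device/device_run_length_encode.cuh>
#include <thrust/device_vector.h>

#include <cstddef>  // for size_t
#include <vector>   // for vector

void EncodeExample(cudaStream_t stream) {
  std::vector<int> h_in{1, 1, 2, 2, 2, 3};
  thrust::device_vector<int> in(h_in.begin(), h_in.end());
  thrust::device_vector<int> unique_out(in.size());
  thrust::device_vector<int> counts_out(in.size());
  thrust::device_vector<int> num_runs(1);

  // Pass 1: a null buffer asks CUB only for the required temporary storage size.
  std::size_t n_bytes = 0;
  cub::DeviceRunLengthEncode::Encode(nullptr, n_bytes, in.data().get(), unique_out.data().get(),
                                     counts_out.data().get(), num_runs.data().get(),
                                     static_cast<int>(in.size()), stream);
  thrust::device_vector<char> tmp(n_bytes);
  // Pass 2: the same call with real scratch space performs the encode.
  cub::DeviceRunLengthEncode::Encode(tmp.data().get(), n_bytes, in.data().get(),
                                     unique_out.data().get(), counts_out.data().get(),
                                     num_runs.data().get(), static_cast<int>(in.size()), stream);
}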
