From 3a1b93e35c7e2f9969bef1da389601048327a0c2 Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Wed, 19 Feb 2025 11:37:19 +0000 Subject: [PATCH 01/13] Fix kernel argument passing. (#16995) We were reading the kernel arguments at kernel execution time, but kernel arguments are allowed to change between enqueuing and executing. Make sure to create a copy of kernel arguments ahead of time. This was previously approved as a unified-runtime PR: https://github.com/oneapi-src/unified-runtime/pull/2700 --- source/adapters/native_cpu/enqueue.cpp | 43 ++++++------ source/adapters/native_cpu/event.cpp | 2 +- source/adapters/native_cpu/event.hpp | 6 +- source/adapters/native_cpu/kernel.hpp | 96 ++++++++++++++++---------- 4 files changed, 87 insertions(+), 60 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 1130385cfa..0ffe9415f1 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -106,9 +106,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize); auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); - hKernel->updateMemPool(numParallelThreads); std::vector> futures; - std::vector> groups; + std::vector> groups; auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; @@ -119,8 +118,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto event = new ur_event_handle_t_(hQueue, UR_COMMAND_KERNEL_LAUNCH); event->tick_start(); + // Create a copy of the kernel and its arguments. + auto kernel = std::make_unique(*hKernel); + kernel->updateMemPool(numParallelThreads); + #ifndef NATIVECPU_USE_OCK - hKernel->handleLocalArgs(1, 0); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { @@ -128,7 +130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) { for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) { state.update(g0, g1, g2, local0, local1, local2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs(1, 0).data(), &state); } } } @@ -139,7 +141,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads && - !hKernel->hasLocalArgs()) { + !kernel->hasLocalArgs()) { // If the local size is one, we make the assumption that we are running a // parallel_for over a sycl::range. 
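The crux of the fix is already visible above: instead of letting the enqueued worker tasks dereference `hKernel` when they eventually run, the adapter snapshots the kernel, and with it the argument storage, at enqueue time via `std::make_unique` and hands the copy to the tasks. A minimal sketch of the before/after shape, using illustrative stand-in types rather than the adapter's real ones:

```
#include <functional>
#include <vector>

// Stand-in for the adapter's kernel handle; the real type also
// deep-copies the argument buffers it owns (see kernel.hpp below).
struct Kernel {
  std::vector<const void *> args; // may be mutated until execution
};

inline void launch(const std::vector<const void *> &) { /* run kernel */ }

// Buggy shape: the task reads the handle when it finally runs, so an
// argument set after enqueue leaks into an already-enqueued launch.
inline std::function<void()> enqueueByReference(Kernel *hKernel) {
  return [hKernel] { launch(hKernel->args); };
}

// Fixed shape: copy the kernel at enqueue time; the task owns an
// immutable snapshot of the arguments.
inline std::function<void()> enqueueBySnapshot(const Kernel &hKernel) {
  return [kernel = hKernel] { launch(kernel.args); };
}
```

A knock-on effect shows up in the event.cpp/event.hpp hunks further down: the completion callback now captures the snapshot with `kernel = std::move(kernel)`, which makes the lambda move-only, so the copyable `std::function` callback is replaced with a `std::packaged_task` (and the emptiness check becomes `callback.valid()`).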
// Todo: we could add more compiler checks and @@ -160,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { futures.emplace_back(tp.schedule_task( - [ndr, itemsPerThread, kernel = *hKernel, g0, g1, g2](size_t) { + [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) { native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); resized_state.update(g0, g1, g2); @@ -172,7 +174,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; g0++) { state.update(g0, g1, g2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs().data(), &state); } } } @@ -185,12 +187,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { futures.emplace_back( - tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2, + tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2, numParallelThreads](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { - kernel.handleLocalArgs(numParallelThreads, threadId); state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), + &state); } })); } @@ -202,13 +205,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { - groups.push_back( - [state, g0, g1, g2, numParallelThreads]( - size_t threadId, ur_kernel_handle_t_ kernel) mutable { - kernel.handleLocalArgs(numParallelThreads, threadId); - state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); - }); + groups.push_back([state, g0, g1, g2, numParallelThreads]( + size_t threadId, + ur_kernel_handle_t_ &kernel) mutable { + state.update(g0, g1, g2); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), &state); + }); } } } @@ -218,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned thread = 0; thread < numParallelThreads; thread++) { futures.emplace_back( tp.schedule_task([groups, thread, groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * groupsPerThread + i; groups[index](threadId, kernel); @@ -231,7 +234,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( futures.emplace_back( tp.schedule_task([groups, remainder, scheduled = numParallelThreads * groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < remainder; i++) { auto index = scheduled + i; groups[index](threadId, kernel); @@ -247,7 +250,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (phEvent) { *phEvent = event; } - event->set_callback([hKernel, event]() { + event->set_callback([kernel = std::move(kernel), hKernel, event]() { event->tick_end(); // TODO: avoid calling clear() here. 
hKernel->_localArgInfo.clear(); diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 37eaf1f6d1..87865905c1 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -146,7 +146,7 @@ void ur_event_handle_t_::wait() { // The callback may need to acquire the lock, so we unlock it here lock.unlock(); - if (callback) + if (callback.valid()) callback(); } diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp index 60176a33a6..479c671b38 100644 --- a/source/adapters/native_cpu/event.hpp +++ b/source/adapters/native_cpu/event.hpp @@ -21,7 +21,9 @@ struct ur_event_handle_t_ : RefCounted { ~ur_event_handle_t_(); - void set_callback(const std::function &cb) { callback = cb; } + template auto set_callback(T &&cb) { + callback = std::packaged_task(std::forward(cb)); + } void wait(); @@ -60,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted { bool done; std::mutex mutex; std::vector> futures; - std::function callback; + std::packaged_task callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; }; diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 4d2dec85cb..9e13286f3e 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -35,18 +35,9 @@ struct ur_kernel_handle_t_ : RefCounted { ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : Args(other.Args), hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _localArgInfo(other._localArgInfo), - _localMemPool(other._localMemPool), - _localMemPoolSize(other._localMemPoolSize), - ReqdWGSize(other.ReqdWGSize) { - incrementReferenceCount(); - } + ReqdWGSize(other.ReqdWGSize) {} - ~ur_kernel_handle_t_() { - if (decrementReferenceCount() == 0) { - free(_localMemPool); - Args.deallocate(); - } - } + ~ur_kernel_handle_t_() { free(_localMemPool); } ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler, @@ -64,27 +55,62 @@ struct ur_kernel_handle_t_ : RefCounted { std::vector OwnsMem; static constexpr size_t MaxAlign = 16 * sizeof(double); + arguments() = default; + + arguments(const arguments &Other) + : Indices(Other.Indices), ParamSizes(Other.ParamSizes), + OwnsMem(Other.OwnsMem.size(), false) { + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!Other.OwnsMem[Index]) { + continue; + } + addArg(Index, ParamSizes[Index], Indices[Index]); + } + } + + arguments(arguments &&Other) : arguments() { + std::swap(Indices, Other.Indices); + std::swap(ParamSizes, Other.ParamSizes); + std::swap(OwnsMem, Other.OwnsMem); + } + + ~arguments() { + assert(OwnsMem.size() == Indices.size() && "Size mismatch"); + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!OwnsMem[Index]) { + continue; + } + native_cpu::aligned_free(Indices[Index]); + } + } + /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. /// Implicit offset argument is kept at the back of the indices collection. 
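The copy constructor shown above deep-copies only the slots the source `arguments` actually owns, by re-running `addArg` on them; borrowed pointers added via `addPtrArg` are carried over verbatim, and the move constructor simply swaps the vectors. A stripped-down model of that ownership rule, with plain `malloc`/`free` standing in for the adapter's aligned allocator:

```
#include <cstdlib>
#include <cstring>
#include <vector>

// Model of the ownership rule in `arguments`: a slot is deep-copied on
// copy only when the source owns its storage; borrowed pointers are
// shared. (The adapter uses aligned_malloc/aligned_free instead.)
struct ArgStore {
  std::vector<void *> Ptrs;
  std::vector<size_t> Sizes;
  std::vector<bool> Owns;

  ArgStore() = default;

  ArgStore(const ArgStore &O) : Ptrs(O.Ptrs), Sizes(O.Sizes), Owns(O.Owns) {
    for (size_t I = 0; I < Ptrs.size(); I++) {
      if (!Owns[I])
        continue; // borrowed pointer: copy the pointer, not the bytes
      Ptrs[I] = std::malloc(Sizes[I]);
      std::memcpy(Ptrs[I], O.Ptrs[I], Sizes[I]);
    }
  }

  ArgStore(ArgStore &&O) noexcept { // steal storage, leave O empty but valid
    Ptrs.swap(O.Ptrs);
    Sizes.swap(O.Sizes);
    Owns.swap(O.Owns);
  }

  ~ArgStore() {
    for (size_t I = 0; I < Ptrs.size(); I++)
      if (Owns[I])
        std::free(Ptrs[I]);
  }
};
```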
void addArg(size_t Index, size_t Size, const void *Arg) { + bool NeedAlloc = true; if (Index + 1 > Indices.size()) { Indices.resize(Index + 1); OwnsMem.resize(Index + 1); ParamSizes.resize(Index + 1); - - // Update the stored value for the argument - Indices[Index] = native_cpu::aligned_malloc(MaxAlign, Size); - OwnsMem[Index] = true; - ParamSizes[Index] = Size; - } else { - if (ParamSizes[Index] != Size) { - Indices[Index] = realloc(Indices[Index], Size); - ParamSizes[Index] = Size; + } else if (OwnsMem[Index]) { + if (ParamSizes[Index] == Size) { + NeedAlloc = false; + } else { + native_cpu::aligned_free(Indices[Index]); } } + if (NeedAlloc) { + size_t Align = MaxAlign; + while (Align > Size) { + Align >>= 1; + } + Indices[Index] = native_cpu::aligned_malloc(Align, Size); + ParamSizes[Index] = Size; + OwnsMem[Index] = true; + } std::memcpy(Indices[Index], Arg, Size); } @@ -100,17 +126,6 @@ struct ur_kernel_handle_t_ : RefCounted { Indices[Index] = Arg; } - // This is called by the destructor of ur_kernel_handle_t_, since - // ur_kernel_handle_t_ implements reference counting and we want - // to deallocate only when the reference count is 0. - void deallocate() { - assert(OwnsMem.size() == Indices.size() && "Size mismatch"); - for (size_t Index = 0; Index < Indices.size(); Index++) { - if (OwnsMem[Index]) - native_cpu::aligned_free(Indices[Index]); - } - } - const args_index_t &getIndices() const noexcept { return Indices; } } Args; @@ -144,19 +159,26 @@ struct ur_kernel_handle_t_ : RefCounted { bool hasLocalArgs() const { return !_localArgInfo.empty(); } - // To be called before executing a work group if local args are present - void handleLocalArgs(size_t numParallelThread, size_t threadId) { + const std::vector &getArgs() const { + assert(!hasLocalArgs() && "For kernels with local arguments, thread " + "information must be supplied."); + return Args.getIndices(); + } + + std::vector getArgs(size_t numThreads, size_t threadId) const { + auto Result = Args.getIndices(); + // For each local argument we have size*numthreads size_t offset = 0; for (auto &entry : _localArgInfo) { - Args.Indices[entry.argIndex] = + Result[entry.argIndex] = _localMemPool + offset + (entry.argSize * threadId); // update offset in the memory pool - offset += entry.argSize * numParallelThread; + offset += entry.argSize * numThreads; } - } - const std::vector &getArgs() const { return Args.getIndices(); } + return Result; + } void addArg(const void *Ptr, size_t Index, size_t Size) { Args.addArg(Index, Size, Ptr); From dcb022fddc4f1d3d2b278f3d57b3b7879011ec63 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Thu, 20 Feb 2025 12:36:11 +0800 Subject: [PATCH 02/13] Support device memory sanitizer for DG2 GPU device (#16619) --- source/loader/layers/sanitizer/msan/msan_shadow.cpp | 2 +- source/loader/layers/sanitizer/msan/msan_shadow.hpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.cpp b/source/loader/layers/sanitizer/msan/msan_shadow.cpp index 8b210a2717..d4446d4dcf 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.cpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.cpp @@ -141,7 +141,7 @@ ur_result_t MsanShadowMemoryGPU::Setup() { // To reserve very large amount of GPU virtual memroy, the pStart param // should be beyond the SVM range, so that GFX driver will automatically // switch to reservation on the GPU heap. 
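The hunk below is the entire mechanism of this patch: `Setup()` now asks a virtual hook where to place the shadow reservation. The base class returns 0 (let the driver pick), while the PVC specialization pins the reservation beyond the SVM range so the GFX driver switches to reserving on the GPU heap. A self-contained sketch of that shape, with the actual virtual-memory reservation call elided:

```
#include <cstdint>

using uptr = std::uintptr_t;

// Shape of the hook added below: Setup() reserves shadow memory starting
// at GetStartAddress(); returning 0 lets the driver choose the address.
struct ShadowMemoryGPU {
  virtual ~ShadowMemoryGPU() = default;
  virtual uptr GetStartAddress() { return 0; }

  void Setup() {
    const void *Start = reinterpret_cast<const void *>(GetStartAddress());
    // ... pass Start to the reservation, e.g.
    // VirtualMem.pfnReserve(Context, Start, ShadowSize, &ShadowBegin);
    (void)Start;
  }
};

// PVC pins the reservation beyond the SVM range.
struct ShadowMemoryPVC final : ShadowMemoryGPU {
  uptr GetStartAddress() override { return 0x100'0000'0000'0000ULL; }
};
```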
-  const void *StartAddress = (void *)(0x100'0000'0000'0000ULL);
+  const void *StartAddress = (void *)GetStartAddress();
   // TODO: Protect Bad Zone
   auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve(
       Context, StartAddress, ShadowSize, (void **)&ShadowBegin);
diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.hpp b/source/loader/layers/sanitizer/msan/msan_shadow.hpp
index 51847e4907..29b4a6d96a 100644
--- a/source/loader/layers/sanitizer/msan/msan_shadow.hpp
+++ b/source/loader/layers/sanitizer/msan/msan_shadow.hpp
@@ -103,6 +103,8 @@ struct MsanShadowMemoryGPU : public MsanShadowMemory {

   virtual size_t GetShadowSize() = 0;

+  virtual uptr GetStartAddress() { return 0; }
+
 private:
   ur_result_t EnqueueMapShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size,
                                std::vector &EventWaitList,
@@ -134,6 +136,8 @@ struct MsanShadowMemoryPVC final : public MsanShadowMemoryGPU {
   uptr MemToShadow(uptr Ptr) override;

   size_t GetShadowSize() override { return 0x8000'0000'0000ULL; }
+
+  uptr GetStartAddress() override { return 0x100'0000'0000'0000ULL; }
 };

 /// Shadow Memory layout of GPU DG2 device

From 298b026e1eea11a837ab8a3e551ba0af8c659c70 Mon Sep 17 00:00:00 2001
From: Lukasz Dorau
Date: Thu, 20 Feb 2025 10:01:49 +0100
Subject: [PATCH 03/13] Update UMF in UR to fix issue in LLVM (#17034)

Update UMF to the commit:
```
commit 5a515c56c92be75944c8246535c408cee7711114
Author: Lukasz Dorau
Date:   Mon Feb 17 10:56:05 2025 +0100
    Merge pull request #1086 from vinser52/svinogra_l0_linking
```
to fix the issue in LLVM (SYCL/CUDA):
https://github.com/intel/llvm/issues/16944
[SYCL][CUDA] Nsys profiling broken after memory providers change

Moved from: https://github.com/oneapi-src/unified-runtime/pull/2708

Fixes: https://github.com/intel/llvm/issues/16944

Signed-off-by: Lukasz Dorau
---
 source/common/CMakeLists.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 417016d630..5443e9de79 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -40,11 +40,11 @@ if (NOT DEFINED UMF_REPO)
 endif()

 if (NOT DEFINED UMF_TAG)
-    # commit ace9f4a60b686463fdad15cd016c548237cb79e0
-    # Author: Rafał Rudnicki
-    # Date:   Mon Feb 10 11:39:15 2025 +0100
-    # Merge pull request #1088 from ldorau/Fix_remove_CUDA_ERROR_INVALID_RESOURCE_TYPE
-    set(UMF_TAG ace9f4a60b686463fdad15cd016c548237cb79e0)
+    # commit 5a515c56c92be75944c8246535c408cee7711114
+    # Author: Lukasz Dorau
+    # Date:   Mon Feb 17 10:56:05 2025 +0100
+    # Merge pull request #1086 from vinser52/svinogra_l0_linking
+    set(UMF_TAG 5a515c56c92be75944c8246535c408cee7711114)
 endif()

 message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}")

From 441720f0f719613b43e3311bfd125a999010d753 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20Komar?= <69756491+Xewar313@users.noreply.github.com>
Date: Thu, 20 Feb 2025 10:02:46 +0100
Subject: [PATCH 04/13] Add remaining calls shared with queue in level-zero v2
 adapter (#17061)

Implements the calls shared between command-buffer and queue in the
unified-runtime level-zero v2 adapter and moves the shared code to
`command_list_manager.cpp`.
---
 source/adapters/level_zero/v2/api.cpp         |  47 ------
 .../adapters/level_zero/v2/command_buffer.cpp | 109 ++++++++++++++
 .../level_zero/v2/command_list_manager.cpp    | 134 ++++++++++++++++++
 .../level_zero/v2/command_list_manager.hpp    |  27 ++++
 .../v2/queue_immediate_in_order.cpp           | 119 ++--------------
 5 files changed, 281 insertions(+), 155 deletions(-)

diff --git
a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 8a2153e0a5..129db02594 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -170,53 +170,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferAppendUSMFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, - const void *pPattern, size_t patternSize, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendMemBufferFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - const void *pPattern, size_t patternSize, size_t offset, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMPrefetchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_migration_flags_t flags, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMAdviseExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 8253527efe..8511d818b2 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -346,6 +346,115 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( return exceptionToResult(std::current_exception()); } +ur_result_t urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, + const void *pPattern, size_t patternSize, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const 
ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMFill( + pMemory, patternSize, pPattern, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, size_t offset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMPrefetchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMPrefetch( + pMemory, size, flags, 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMAdviseExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMAdvise(pMemory, size, + advice, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + ur_result_t urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index b1510d0a97..5c6d2330d7 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -31,6 +31,50 @@ ur_command_list_manager::~ur_command_list_manager() { ur::level_zero::urDeviceRelease(device); } +ur_result_t ur_command_list_manager::appendGenericFillUnlocked( + ur_mem_buffer_t *dst, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType) { + + auto zeSignalEvent = getSignalEvent(phEvent, commandType); + + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto pDst = ur_cast(dst->getDevicePtr( + device, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, size, nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + })); + + // PatternSize must be a power of two for zeCommandListAppendMemoryFill. + // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. + if (isPowerOf2(patternSize)) { + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (zeCommandList.get(), pDst, pPattern, patternSize, size, + zeSignalEvent, waitListView.num, waitListView.handles)); + } else { + // Copy pattern into every entry in memory array pointed by Ptr. + uint32_t numOfCopySteps = size / patternSize; + const void *src = pPattern; + + for (uint32_t step = 0; step < numOfCopySteps; ++step) { + void *dst = reinterpret_cast(reinterpret_cast(pDst) + + step * patternSize); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, patternSize, + step == numOfCopySteps - 1 ? 
zeSignalEvent : nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + } + } + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, @@ -209,6 +253,96 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( return UR_RESULT_SUCCESS; } +ur_result_t ur_command_list_manager::appendMemBufferFill( + ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); + + auto hBuffer = hMem->getBuffer(); + UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock lock(this->Mutex, + hBuffer->getMutex()); + + return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_MEM_BUFFER_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMFill( + void *pMem, size_t patternSize, const void *pPattern, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); + + std::scoped_lock lock(this->Mutex); + + ur_usm_handle_t dstHandle(context, size, pMem); + return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_USM_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMPrefetch( + const void *pMem, size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); + + std::ignore = flags; + + std::scoped_lock lock(this->Mutex); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (zeCommandList.get(), pMem, size)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); + + std::scoped_lock lock(this->Mutex); + + auto zeAdvice = ur_cast(advice); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); + + auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (zeCommandList.get(), device->ZeDevice, pMem, size, zeAdvice)); + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, 
void *pDst, uint32_t numEventsInWaitList, diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index e85d9b9049..3d3481cb22 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -99,6 +99,27 @@ struct ur_command_list_manager : public _ur_object { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent); ze_command_list_handle_t getZeCommandList(); wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents, @@ -107,6 +128,12 @@ struct ur_command_list_manager : public _ur_object { ur_command_t commandType); private: + ur_result_t appendGenericFillUnlocked( + ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType); + ur_result_t appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index d33ac12f7e..3f88161b90 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -372,16 +372,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferFill( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferFill"); - auto hBuffer = hMem->getBuffer(); - - UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - - std::scoped_lock lock(this->Mutex, - hBuffer->getMutex()); + UR_CALL(commandListManager.appendMemBufferFill( + hMem, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, phEvent)); - return enqueueGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, - size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_FILL); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageRead( @@ -550,63 +545,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType) { - auto zeSignalEvent = getSignalEvent(phEvent, commandType); - - auto waitListView = getWaitListView(phEventWaitList, 
numEventsInWaitList); - - auto pDst = ur_cast(hBuffer->getDevicePtr( - hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, - [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, - size, nullptr, waitListView.num, - waitListView.handles)); - waitListView.clear(); - })); - - // PatternSize must be a power of two for zeCommandListAppendMemoryFill. - // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. - if (isPowerOf2(patternSize)) { - ZE2UR_CALL(zeCommandListAppendMemoryFill, - (commandListManager.getZeCommandList(), pDst, pPattern, - patternSize, size, zeSignalEvent, waitListView.num, - waitListView.handles)); - } else { - // Copy pattern into every entry in memory array pointed by Ptr. - uint32_t numOfCopySteps = size / patternSize; - const void *src = pPattern; - - for (uint32_t step = 0; step < numOfCopySteps; ++step) { - void *dst = reinterpret_cast(reinterpret_cast(pDst) + - step * patternSize); - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, patternSize, - step == numOfCopySteps - 1 ? zeSignalEvent : nullptr, - waitListView.num, waitListView.handles)); - waitListView.clear(); - } - } - - return UR_RESULT_SUCCESS; -} - ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFill"); - std::scoped_lock lock(this->Mutex); - - ur_usm_handle_t dstHandle(hContext, size, pMem); - return enqueueGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_USM_FILL); + UR_CALL(commandListManager.appendUSMFill(pMem, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent)); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( @@ -628,29 +576,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMPrefetch"); - - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); - - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (commandListManager.getZeCommandList(), pMem, size)); - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } - + UR_CALL(commandListManager.appendUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent)); return UR_RESULT_SUCCESS; } @@ -660,31 +587,7 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeAdvice = ur_cast(advice); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - - auto 
[pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemAdvise, - (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, - pMem, size, zeAdvice)); - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } + UR_CALL(commandListManager.appendUSMAdvise(pMem, size, advice, phEvent)); return UR_RESULT_SUCCESS; } From b77fc2bbd49dde749f7a9b286d972a92d5780a3b Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 20 Feb 2025 09:03:14 +0000 Subject: [PATCH 05/13] Make command-buffer creation descriptor mandatory (#17058) As discussed in https://github.com/oneapi-src/unified-runtime/pull/2670#discussion_r1946398932 the `pCommandBufferDesc` parameter to `urCommandBufferCreateExp` is optional. However, the UR spec doesn't state what the configuration of the created command-buffer is when this isn't passed, and being optional is also inconsistent with the description parameters to urSamplerCreate & urMemImageCreate which are not optional. This PR updates the descriptor parameter to command-buffer creation to be mandatory to address these concerns. Closes https://github.com/oneapi-src/unified-runtime/issues/2673 **Note**: This UR patch was previously approved and ready-to-merge in https://github.com/oneapi-src/unified-runtime/pull/2676 prior to the repo move --- include/ur_api.h | 3 +- scripts/core/exp-command-buffer.yml | 2 +- source/adapters/cuda/command_buffer.cpp | 5 +--- source/adapters/hip/command_buffer.cpp | 4 +-- source/adapters/level_zero/command_buffer.cpp | 9 ++---- source/adapters/mock/ur_mockddi.cpp | 2 +- source/adapters/opencl/command_buffer.cpp | 3 +- source/loader/layers/tracing/ur_trcddi.cpp | 2 +- source/loader/layers/validation/ur_valddi.cpp | 5 +++- source/loader/ur_ldrddi.cpp | 2 +- source/loader/ur_libapi.cpp | 3 +- source/ur_api.cpp | 3 +- .../conformance/exp_command_buffer/fixtures.h | 29 ++++++++++--------- .../exp_command_buffer/kernel_event_sync.cpp | 5 +++- .../update/invalid_update.cpp | 5 +++- .../urMultiDeviceProgramCreateWithBinary.cpp | 7 +++-- 16 files changed, 49 insertions(+), 40 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index d7621bda32..6c15e39e09 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -10111,6 +10111,7 @@ typedef struct ur_exp_command_buffer_command_handle_t_ /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -10125,7 +10126,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer); diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 218c626423..680bc60f8d 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -282,7 +282,7 @@ params: desc: "[in] Handle of the device object." 
- type: "const $x_exp_command_buffer_desc_t*" name: pCommandBufferDesc - desc: "[in][optional] command-buffer descriptor." + desc: "[in] Command-buffer descriptor." - type: "$x_exp_command_buffer_handle_t*" name: phCommandBuffer desc: "[out][alloc] Pointer to command-Buffer handle." diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 0d274e6c15..42ec8dbafc 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -352,10 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 4c65e61dab..ce07332ce8 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -233,9 +233,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 879ee0f1cc..5d6ad6c68b 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -586,9 +586,7 @@ bool canBeInOrder(ur_context_handle_t Context, bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( 1, 3, L0_DRIVER_INORDER_MIN_VERSION); bool CanUseDriverInOrderLists = CompatibleDriver && DriverInOrderRequested; - return CanUseDriverInOrderLists - ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) - : false; + return CanUseDriverInOrderLists ? CommandBufferDesc->isInOrder : false; } /** @@ -624,9 +622,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, ur_exp_command_buffer_handle_t *CommandBuffer) { bool IsInOrder = canBeInOrder(Context, CommandBufferDesc); - bool EnableProfiling = - CommandBufferDesc && CommandBufferDesc->enableProfiling && !IsInOrder; - bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; + bool EnableProfiling = CommandBufferDesc->enableProfiling && !IsInOrder; + bool IsUpdatable = CommandBufferDesc->isUpdatable; bool ImmediateAppendPath = checkImmediateAppendSupport(Context, Device); const bool WaitEventPath = !ImmediateAppendPath; bool UseCounterBasedEvents = checkCounterBasedEventsSupport(Device) && diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 6d5034d07b..9a4ace593e 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -8403,7 +8403,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. 
ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index d78ef0121b..1c1bb0d18a 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -43,8 +43,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( CLContext, cl_ext::ExtFuncPtrCache->clCreateCommandBufferKHRCache, cl_ext::CreateCommandBufferName, &clCreateCommandBufferKHR)); - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; ur_device_command_buffer_update_capability_flags_t UpdateCapabilities; cl_device_id CLDevice = cl_adapter::cast(hDevice); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index f4a7b7e60a..e716eaaa49 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6978,7 +6978,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index eb2bd4c353..acb1f65c94 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -7615,7 +7615,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { @@ -7632,6 +7632,9 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( if (NULL == hDevice) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + if (NULL == pCommandBufferDesc) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (NULL == phCommandBuffer) return UR_RESULT_ERROR_INVALID_NULL_POINTER; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ec6081509f..c1e21fd58b 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7044,7 +7044,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 5761dab3a4..42d905964f 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -7619,6 +7619,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -7633,7 +7634,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 7023161cb1..80c034cbd0 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -6664,6 +6664,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -6678,7 +6679,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 0cd4d7a6b3..e0ddc0b389 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -60,8 +60,12 @@ struct urCommandBufferExpTest : uur::urContextTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -83,8 +87,11 @@ struct urCommandBufferExpTestWithParam : urQueueTestWithParam { UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(this->device)); - ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, - nullptr, &cmd_buf_handle)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; + ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, &desc, + &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -105,8 +112,11 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -333,15 +343,8 @@ struct urCommandEventSyncTest : urCommandBufferExpTest { ASSERT_NE(buffer, nullptr); } - // Create a command-buffer with update enabled. 
-    ur_exp_command_buffer_desc_t desc{
-        /*.stype=*/UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC,
-        /*.pNext =*/nullptr,
-        /*.isUpdatable =*/false,
-        /*.isInOrder =*/false,
-        /*.enableProfiling =*/false,
-    };
-
+    ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC,
+                                      nullptr, true, false, false};
     ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc,
                                             &second_cmd_buf_handle));
     ASSERT_NE(second_cmd_buf_handle, nullptr);
diff --git a/test/conformance/exp_command_buffer/kernel_event_sync.cpp b/test/conformance/exp_command_buffer/kernel_event_sync.cpp
index 84941ab26d..60844ffdd7 100644
--- a/test/conformance/exp_command_buffer/kernel_event_sync.cpp
+++ b/test/conformance/exp_command_buffer/kernel_event_sync.cpp
@@ -40,7 +40,10 @@ struct KernelCommandEventSyncTest
     ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 3, nullptr, device_ptrs[1]));

     // Create second command-buffer
-    ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr,
+    ur_exp_command_buffer_desc_t desc{
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false,
+    };
+    ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc,
                                             &second_cmd_buf_handle));
     ASSERT_NE(second_cmd_buf_handle, nullptr);
   }
diff --git a/test/conformance/exp_command_buffer/update/invalid_update.cpp b/test/conformance/exp_command_buffer/update/invalid_update.cpp
index 9c3da1ddd5..b93ffc26d4 100644
--- a/test/conformance/exp_command_buffer/update/invalid_update.cpp
+++ b/test/conformance/exp_command_buffer/update/invalid_update.cpp
@@ -109,8 +109,11 @@ TEST_P(InvalidUpdateTest, NotFinalizedCommandBuffer) {
 TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) {
   // Create a command-buffer without isUpdatable
   ur_exp_command_buffer_handle_t test_cmd_buf_handle = nullptr;
+  ur_exp_command_buffer_desc_t desc{
+      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false,
+  };
   ASSERT_SUCCESS(
-      urCommandBufferCreateExp(context, device, nullptr, &test_cmd_buf_handle));
+      urCommandBufferCreateExp(context, device, &desc, &test_cmd_buf_handle));
   EXPECT_NE(test_cmd_buf_handle, nullptr);

   // Append a kernel commands to command-buffer and close command-buffer
diff --git a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp
index 45309c3dee..0094caa274 100644
--- a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp
+++ b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp
@@ -311,9 +311,12 @@ TEST_P(urMultiDeviceCommandBufferExpTest, Enqueue) {
   }

   // Create command-buffer
+  ur_exp_command_buffer_desc_t desc{
+      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false,
+  };
   uur::raii::CommandBuffer cmd_buf_handle;
-  ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr,
-                                          cmd_buf_handle.ptr()));
+  ASSERT_SUCCESS(
+      urCommandBufferCreateExp(context, device, &desc, cmd_buf_handle.ptr()));

   // Append kernel command to command-buffer and close command-buffer
   ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(

From eb471a7127f9f8c5abda89f68e22a1ffe40647ac Mon Sep 17 00:00:00 2001
From: Ewan Crawford
Date: Thu, 20 Feb 2025 10:18:34 +0000
Subject: [PATCH 06/13] In-order path for OpenCL command-buffers (#17056)

After the [spec bump of cl_khr_command_buffer to 0.9.7](https://github.com/KhronosGroup/OpenCL-Docs/),
in the OpenCL adapter we no longer need to worry about the in-order/out-of-order
property of the internal queue used on command-buffer creation matching the queue used to
enqueue the command-buffer. We can therefore take advantage of the in-order flag passed on UR command-buffer creation to use an in-order queue for command-buffer creation, and omit using sync points. **Note:** This UR patch was previously approved and ready-to-merge prior to the UR repo move in https://github.com/oneapi-src/unified-runtime/pull/2681 --- source/adapters/opencl/command_buffer.cpp | 47 ++++++++++++++++++----- source/adapters/opencl/command_buffer.hpp | 6 ++- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 1c1bb0d18a..3707dd4e5d 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -34,7 +34,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_exp_command_buffer_handle_t *phCommandBuffer) { ur_queue_handle_t Queue = nullptr; - UR_RETURN_ON_FAILURE(urQueueCreate(hContext, hDevice, nullptr, &Queue)); + ur_queue_properties_t QueueProperties = {UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + nullptr, 0}; + const bool IsInOrder = + pCommandBufferDesc ? pCommandBufferDesc->isInOrder : false; + if (!IsInOrder) { + QueueProperties.flags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + UR_RETURN_ON_FAILURE( + urQueueCreate(hContext, hDevice, &QueueProperties, &Queue)); cl_context CLContext = cl_adapter::cast(hContext); cl_ext::clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = nullptr; @@ -66,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( try { auto URCommandBuffer = std::make_unique( - Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable); + Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable, IsInOrder); *phCommandBuffer = URCommandBuffer.release(); } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -147,11 +155,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( cl_command_properties_khr *Properties = hCommandBuffer->IsUpdatable ? UpdateProperties : nullptr; + + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR( hCommandBuffer->CLCommandBuffer, nullptr, Properties, cl_adapter::cast(hKernel), workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint, OutCommandHandle)); + pGlobalWorkSize, pLocalWorkSize, WaitListSize, SyncPointWaitList, + RetSyncPoint, OutCommandHandle)); try { auto Handle = std::make_unique( @@ -218,11 +232,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferKHRCache, cl_ext::CommandCopyBufferName, &clCommandCopyBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 
0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), - srcOffset, dstOffset, size, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + srcOffset, dstOffset, size, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -256,12 +275,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferRectKHRCache, cl_ext::CommandCopyBufferRectName, &clCommandCopyBufferRectKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferRectKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), OpenCLOriginRect, OpenCLDstRect, OpenCLRegion, srcRowPitch, srcSlicePitch, - dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + dstRowPitch, dstSlicePitch, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -360,10 +384,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandFillBufferKHRCache, cl_ext::CommandFillBufferName, &clCommandFillBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandFillBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hBuffer), pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, nullptr)); + WaitListSize, SyncPointWaitList, RetSyncPoint, nullptr)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/command_buffer.hpp b/source/adapters/opencl/command_buffer.hpp index 83fedc3c1e..6fbdc5d259 100644 --- a/source/adapters/opencl/command_buffer.hpp +++ b/source/adapters/opencl/command_buffer.hpp @@ -46,6 +46,8 @@ struct ur_exp_command_buffer_handle_t_ { /// Set to true if the kernel commands in the command-buffer can be updated, /// false otherwise bool IsUpdatable; + /// Set to true if the command-buffer was created with an in-order queue. + bool IsInOrder; /// Set to true if the command-buffer has been finalized, false otherwise bool IsFinalized; /// List of commands in the command-buffer. 
@@ -58,10 +60,10 @@ struct ur_exp_command_buffer_handle_t_ { ur_context_handle_t hContext, ur_device_handle_t hDevice, cl_command_buffer_khr CLCommandBuffer, - bool IsUpdatable) + bool IsUpdatable, bool IsInOrder) : hInternalQueue(hQueue), hContext(hContext), hDevice(hDevice), CLCommandBuffer(CLCommandBuffer), IsUpdatable(IsUpdatable), - IsFinalized(false), RefCount(0) {} + IsInOrder(IsInOrder), IsFinalized(false), RefCount(0) {} ~ur_exp_command_buffer_handle_t_(); From a239c8f728d1d7be4814bf660332d4635baefbd4 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Thu, 20 Feb 2025 10:47:46 +0000 Subject: [PATCH 07/13] Don't set -pie on shared objects (#16880) Fixes #16677 by only setting `-pie` linker option in Release builds on executables rather than on any type of target. --- cmake/helpers.cmake | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index bdd4a6d15c..ec945ba2c2 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -157,11 +157,6 @@ function(add_ur_target_link_options name) if (UR_DEVELOPER_MODE) target_link_options(${name} PRIVATE -Werror -Wextra) endif() - if (CMAKE_BUILD_TYPE STREQUAL "Release") - target_link_options(${name} PRIVATE - $<$:-pie> - ) - endif() endif() elseif(MSVC) target_link_options(${name} PRIVATE @@ -176,7 +171,15 @@ function(add_ur_target_link_options name) endfunction() function(add_ur_target_exec_options name) - if(MSVC) + if(NOT MSVC) + if(NOT APPLE) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + target_link_options(${name} PRIVATE + $<$:-pie> + ) + endif() + endif() + elseif(MSVC) target_link_options(${name} PRIVATE LINKER:/ALLOWISOLATION ) From a6e7ea77bcf8c6737fb6712dcdbbd878710303fa Mon Sep 17 00:00:00 2001 From: Yang Zhao Date: Thu, 20 Feb 2025 18:49:49 +0800 Subject: [PATCH 08/13] Fix throw "UR_RESULT_ERROR_INVALID_ARGUMENT" exception when catching free related error (#16706) UR: https://github.com/oneapi-src/unified-runtime/pull/2592 --------- Co-authored-by: Kenneth Benzie (Benie) --- .../sanitizer/asan/asan_interceptor.cpp | 20 +++++++++++++++---- .../layers/sanitizer/asan/asan_options.cpp | 1 + .../layers/sanitizer/asan/asan_options.hpp | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index d5d5c2c729..d99c0545c3 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -175,7 +175,10 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, if (!AllocInfoItOp) { // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } auto AllocInfoIt = *AllocInfoItOp; @@ -190,17 +193,26 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); } - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (Addr != AllocInfo->UserBegin) { ReportBadFree(Addr, GetCurrentBacktrace(), AllocInfo); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (AllocInfo->IsReleased) { ReportDoubleFree(Addr, GetCurrentBacktrace(), AllocInfo); - return 
UR_RESULT_ERROR_INVALID_ARGUMENT;
+    if (getOptions().HaltOnError) {
+      exitWithErrors();
+    }
+    return UR_RESULT_SUCCESS;
   }

   AllocInfo->IsReleased = true;
diff --git a/source/loader/layers/sanitizer/asan/asan_options.cpp b/source/loader/layers/sanitizer/asan/asan_options.cpp
index 7704547d38..799c892ff1 100644
--- a/source/loader/layers/sanitizer/asan/asan_options.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_options.cpp
@@ -90,6 +90,7 @@ AsanOptions::AsanOptions() {
   SetBoolOption("detect_privates", DetectPrivates);
   SetBoolOption("print_stats", PrintStats);
   SetBoolOption("detect_leaks", DetectLeaks);
+  SetBoolOption("halt_on_error", HaltOnError);

   auto KV = OptionsEnvMap->find("quarantine_size_mb");
   if (KV != OptionsEnvMap->end()) {
diff --git a/source/loader/layers/sanitizer/asan/asan_options.hpp b/source/loader/layers/sanitizer/asan/asan_options.hpp
index 4da32ca278..cea30351d3 100644
--- a/source/loader/layers/sanitizer/asan/asan_options.hpp
+++ b/source/loader/layers/sanitizer/asan/asan_options.hpp
@@ -28,6 +28,7 @@ struct AsanOptions {
   bool PrintStats = false;
   bool DetectKernelArguments = true;
   bool DetectLeaks = true;
+  bool HaltOnError = true;

   explicit AsanOptions();
 };

From 7697d3787208a19c0ab01723f4848e2529974dd2 Mon Sep 17 00:00:00 2001
From: Ross Brunton
Date: Thu, 20 Feb 2025 12:21:26 +0000
Subject: [PATCH 09/13] Make each profiling info variant for
 urEventGetProfilingInfo optional and improve its conformance test (#17067)

Migrated from https://github.com/oneapi-src/unified-runtime/pull/2533

This patch makes all of the values returned by urEventGetProfilingInfo
optional and updates adapters to handle this by returning the appropriate
enum when a query is not supported. The tests have also been updated to
ensure that returning a counter of "0", or values equal to those of the
previous profiling event, is no longer considered a failure.
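For illustration, a minimal sketch (not code from this series, and the helper
name is only illustrative) of how a caller can treat these now-optional
counters, assuming the public `ur_api.h` header and a valid event from a
completed command:

```cpp
#include <cstdint>
#include <optional>
#include <ur_api.h>

// Returns the counter value, or std::nullopt when the adapter reports the
// query as unsupported (now a legal outcome rather than a failure).
std::optional<uint64_t> getProfilingCounter(ur_event_handle_t event,
                                            ur_profiling_info_t info) {
  uint64_t value = 0;
  ur_result_t result =
      urEventGetProfilingInfo(event, info, sizeof(value), &value, nullptr);
  if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) {
    return std::nullopt; // Optional query: skip it, don't fail.
  }
  if (result != UR_RESULT_SUCCESS) {
    return std::nullopt; // Real error; production code should report it.
  }
  return value;
}
```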
--- include/ur_api.h | 24 ++++--- scripts/core/event.yml | 12 ++-- source/adapters/cuda/event.cpp | 2 + source/adapters/hip/event.cpp | 2 + source/adapters/level_zero/event.cpp | 12 ++++ source/adapters/native_cpu/event.cpp | 1 + source/loader/ur_libapi.cpp | 2 + source/ur_api.cpp | 2 + .../event/urEventGetProfilingInfo.cpp | 71 +++++++++---------- .../testing/include/uur/optional_queries.h | 12 ++++ 10 files changed, 85 insertions(+), 55 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 6c15e39e09..c390ed4410 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7105,21 +7105,21 @@ typedef enum ur_event_info_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Profiling query information type typedef enum ur_profiling_info_t { - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is enqueued + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is enqueued UR_PROFILING_INFO_COMMAND_QUEUED = 0, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is submitted + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is submitted UR_PROFILING_INFO_COMMAND_SUBMIT = 1, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event starts execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event starts execution UR_PROFILING_INFO_COMMAND_START = 2, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event has finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event has finished execution UR_PROFILING_INFO_COMMAND_END = 3, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event and any child events enqueued by this event on the - /// device have finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event and any child events enqueued by this event + /// on the device have finished execution UR_PROFILING_INFO_COMMAND_COMPLETE = 4, /// @cond UR_PROFILING_INFO_FORCE_UINT32 = 0x7fffffff @@ -7193,6 +7193,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. 
UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, diff --git a/scripts/core/event.yml b/scripts/core/event.yml index da2fa457de..1b0eeca23e 100644 --- a/scripts/core/event.yml +++ b/scripts/core/event.yml @@ -121,15 +121,15 @@ name: $x_profiling_info_t typed_etors: True etors: - name: COMMAND_QUEUED - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is enqueued" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is enqueued" - name: COMMAND_SUBMIT - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is submitted" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is submitted" - name: COMMAND_START - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event starts execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event starts execution" - name: COMMAND_END - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event has finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event has finished execution" - name: COMMAND_COMPLETE - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" --- #-------------------------------------------------------------------------- type: function desc: "Get event object information" @@ -198,6 +198,8 @@ returns: - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_OUT_OF_RESOURCES - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_UNSUPPORTED_ENUMERATION: + - "If `propName` is not supported by the adapter." --- #-------------------------------------------------------------------------- type: function desc: "Wait for a list of events to finish." 
diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index a6c2208e8f..37792bf5b2 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -213,6 +213,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 81c839cf32..d971207a6c 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -234,6 +234,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index e12352b6b1..cfcd1d316d 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -609,6 +609,10 @@ ur_result_t urEventGetProfilingInfo( return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -672,6 +676,10 @@ ur_result_t urEventGetProfilingInfo( ContextEndTime *= ZeTimerResolution; return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -715,6 +723,10 @@ ur_result_t urEventGetProfilingInfo( // enqueue. // return ReturnValue(uint64_t{0}); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: UR_PROFILING_INFO_COMMAND_COMPLETE " + "not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 87865905c1..f981d24f42 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -52,6 +52,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 42d905964f..e579753763 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -4733,6 +4733,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. 
ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 80c034cbd0..c5651c0fc4 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -4138,6 +4138,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp index 16723df5cf..e63b6078de 100644 --- a/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/test/conformance/event/urEventGetProfilingInfo.cpp @@ -11,8 +11,6 @@ using urEventGetProfilingInfoTest = uur::event::urEventTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventGetProfilingInfoTest); TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_QUEUED; size_t property_size = 0; @@ -29,8 +27,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandSubmit) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_SUBMIT; size_t property_size = 0; @@ -79,9 +75,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandEnd) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_COMPLETE; size_t property_size = 0; @@ -98,41 +91,41 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { } TEST_P(urEventGetProfilingInfoTest, Success) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::LevelZeroV2{}, uur::NativeCPU{}); - - uint8_t size = 8; - - uint64_t queued_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_QUEUED, size, &queued_value, nullptr)); - ASSERT_NE(queued_value, 0); - - uint64_t submit_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_SUBMIT, size, &submit_value, nullptr)); - ASSERT_NE(submit_value, 0); - - uint64_t start_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, - size, &start_value, nullptr)); - ASSERT_NE(start_value, 0); + // AMD devices may report a "start" time before the "submit" time + UUR_KNOWN_FAILURE_ON(uur::HIP{}); + + // If a and b are supported, asserts that a <= b + auto test_timing = [=](ur_profiling_info_t a, ur_profiling_info_t b) { + std::stringstream trace{"Profiling Info: "}; + trace << a << " <= " << b; + SCOPED_TRACE(trace.str()); + uint64_t a_time; + auto result = + urEventGetProfilingInfo(event, a, sizeof(a_time), &a_time, nullptr); + if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t end_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, - size, &end_value, nullptr)); - ASSERT_NE(end_value, 0); + uint64_t b_time; + result = + urEventGetProfilingInfo(event, b, sizeof(b_time), &b_time, nullptr); + if 
(result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t complete_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, - UR_PROFILING_INFO_COMMAND_COMPLETE, - size, &complete_value, nullptr)); - ASSERT_NE(complete_value, 0); + // Note: This assumes that the counter doesn't overflow + ASSERT_LE(a_time, b_time); + }; - ASSERT_LE(queued_value, submit_value); - ASSERT_LT(submit_value, start_value); - ASSERT_LT(start_value, end_value); - ASSERT_LE(end_value, complete_value); + test_timing(UR_PROFILING_INFO_COMMAND_QUEUED, + UR_PROFILING_INFO_COMMAND_SUBMIT); + test_timing(UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START); + test_timing(UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END); + test_timing(UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE); } TEST_P(urEventGetProfilingInfoTest, InvalidNullHandle) { diff --git a/test/conformance/testing/include/uur/optional_queries.h b/test/conformance/testing/include/uur/optional_queries.h index d3f670545a..e0a78c18f5 100644 --- a/test/conformance/testing/include/uur/optional_queries.h +++ b/test/conformance/testing/include/uur/optional_queries.h @@ -117,4 +117,16 @@ template <> inline bool isQueryOptional(ur_queue_info_t query) { query) != optional_ur_queue_info_t.end(); } +constexpr std::array optional_ur_profiling_info_t = { + UR_PROFILING_INFO_COMMAND_QUEUED, UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE, +}; + +template <> inline bool isQueryOptional(ur_profiling_info_t query) { + return std::find(optional_ur_profiling_info_t.begin(), + optional_ur_profiling_info_t.end(), + query) != optional_ur_profiling_info_t.end(); +} + } // namespace uur From fd4508a5ef7e9c3fd22d8bfec8c7634b2b045194 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 20 Feb 2025 12:48:49 +0000 Subject: [PATCH 10/13] Fix issue with command-buffer local mem update (#17069) - Fix group count not being recalculated when a user only passes a new local work size and no new global size - Remove CTS test skips for local update on L0 --- source/adapters/level_zero/command_buffer.cpp | 39 ++++++++++++------- source/adapters/level_zero/command_buffer.hpp | 11 ++++++ .../update/local_memory_update.cpp | 6 --- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 5d6ad6c68b..4705964190 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -907,17 +907,18 @@ ur_result_t setKernelPendingArguments( * @param[in] CommandBuffer The CommandBuffer associated with the new command. * @param[in] Kernel The Kernel associated with the new command. * @param[in] WorkDim Dimensions of the kernel associated with the new command. + * @param[in] GlobalWorkSize Global work size of the kernel associated with the + * new command. * @param[in] LocalWorkSize LocalWorkSize of the kernel associated with the new * command. * @param[out] Command The handle to the new command. 
* @return UR_RESULT_SUCCESS or an error code on failure */ -ur_result_t -createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, - ur_kernel_handle_t Kernel, uint32_t WorkDim, - const size_t *LocalWorkSize, uint32_t NumKernelAlternatives, - ur_kernel_handle_t *KernelAlternatives, - ur_exp_command_buffer_command_handle_t *Command) { +ur_result_t createCommandHandle( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + uint32_t WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, + ur_exp_command_buffer_command_handle_t *Command) { assert(CommandBuffer->IsUpdatable); @@ -989,6 +990,8 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, CommandBuffer, Kernel, CommandId, WorkDim, LocalWorkSize != nullptr, NumKernelAlternatives, KernelAlternatives); + NewCommand->setGlobalWorkSize(GlobalWorkSize); + *Command = NewCommand.get(); CommandBuffer->CommandHandles.push_back(std::move(NewCommand)); @@ -1063,9 +1066,9 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( } if (Command) { - UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, - NumKernelAlternatives, KernelAlternatives, - Command)); + UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, GlobalWorkSize, + LocalWorkSize, NumKernelAlternatives, + KernelAlternatives, Command)); } std::vector ZeEventList; ze_event_handle_t ZeLaunchEvent = nullptr; @@ -1919,10 +1922,16 @@ ur_result_t updateKernelCommand( Descs.push_back(std::move(MutableGroupSizeDesc)); } - // Check if a new global size is provided and if we need to update the group - // count. + // Check if a new global or local size is provided and if so we need to update + // the group count. ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; - if (NewGlobalWorkSize && Dim > 0) { + if ((NewGlobalWorkSize || NewLocalWorkSize) && Dim > 0) { + // If a new global work size is provided update that in the command, + // otherwise the previous work group size will be used + if (NewGlobalWorkSize) { + Command->WorkDim = Dim; + Command->setGlobalWorkSize(NewGlobalWorkSize); + } // If a new global work size is provided but a new local work size is not // then we still need to update local work size based on the size suggested // by the driver for the kernel. 
@@ -1932,9 +1941,9 @@ ur_result_t updateKernelCommand(
     UR_CALL(getZeKernel(ZeDevice, Command->Kernel, &ZeKernel));

     uint32_t WG[3];
-    UR_CALL(calculateKernelWorkDimensions(ZeKernel, CommandBuffer->Device,
-                                          ZeThreadGroupDimensions, WG, Dim,
-                                          NewGlobalWorkSize, NewLocalWorkSize));
+    UR_CALL(calculateKernelWorkDimensions(
+        ZeKernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, Dim,
+        Command->GlobalWorkSize, NewLocalWorkSize));

     auto MutableGroupCountDesc =
         std::make_unique<ZeStruct<ze_mutable_group_count_exp_desc_t>>();
diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp
index c9389eaf11..491cb2638e 100644
--- a/source/adapters/level_zero/command_buffer.hpp
+++ b/source/adapters/level_zero/command_buffer.hpp
@@ -172,8 +172,19 @@ struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ {

   ~kernel_command_handle();

+  void setGlobalWorkSize(const size_t *GlobalWorkSizePtr) {
+    const size_t CopySize = sizeof(size_t) * WorkDim;
+    std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize);
+    if (WorkDim < 3) {
+      const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
+      std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize);
+    }
+  }
+
   // Work-dimension the command was originally created with.
   uint32_t WorkDim;
+  // Global work size of the kernel
+  size_t GlobalWorkSize[3];
   // Set to true if the user set the local work size on command creation.
   bool UserDefinedLocalSize;
   // Currently active kernel handle
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index 50c690fb89..1621b902a9 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -378,7 +378,6 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) {
 // Test updating A,X,Y parameters to new values and local memory parameters
 // to new smaller values.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) {
-  UUR_KNOWN_FAILURE_ON(uur::LevelZero{});

   // Run command-buffer prior to update an verify output
   ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
@@ -1081,11 +1080,6 @@ struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase {
     UUR_RETURN_ON_FATAL_FAILURE(
         urUpdatableCommandBufferExpExecutionTest::SetUp());

-    if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) {
-      GTEST_SKIP()
-          << "Local memory argument update not supported on Level Zero.";
-    }
-
     // HIP has extra args for local memory so we define an offset for arg
     // indices here for updating
     hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;

From cf84b75d00b290389a7aeb81e55b7839d4b7b6fe Mon Sep 17 00:00:00 2001
From: Harald van Dijk
Date: Thu, 20 Feb 2025 14:07:34 +0000
Subject: [PATCH 11/13] Fix MSVC build. (#17090)

MSVC warns about a possibly uninitialized variable. This is a false
positive, but explicitly initializing the variable is always harmless,
so do that.
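As a reduced illustration of the pattern (the names mirror the diff below):
the handle is assigned on only one branch, which MSVC's flow analysis cannot
always prove safe, so a `nullptr` initialization suppresses the warning at no
runtime cost:

```cpp
ur_event_handle_t event = nullptr; // explicit init silences the MSVC warning
if (phEvent) {
  event = new ur_event_handle_t_(hQueue, command_type);
  event->tick_start();
}
// `event` is only dereferenced later on paths where phEvent was set.
```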
--- source/adapters/native_cpu/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 0ffe9415f1..d1c83a4655 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -269,7 +269,7 @@ ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, ur_event_handle_t *phEvent, const std::function &f) { urEventWait(numEventsInWaitList, phEventWaitList); - ur_event_handle_t event; + ur_event_handle_t event = nullptr; if (phEvent) { event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); From 99fa00b3a26080ff0de0592e78a9cb2d38321d95 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Fri, 21 Feb 2025 07:04:28 +0100 Subject: [PATCH 12/13] add manually triggered benchmark action (#17088) This is a first step towards reenabling UR performance testing CI. This introduces the reusable yml workflow and a way to trigger it manually. Here's an example how it looks: https://github.com/pbalcer/llvm/pull/2#issuecomment-2671230460 --- scripts/benchmarks/benches/oneapi.py | 2 +- scripts/benchmarks/main.py | 4 +--- scripts/benchmarks/options.py | 2 +- scripts/benchmarks/requirements.txt | 4 ++++ 4 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 scripts/benchmarks/requirements.txt diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py index b6b3f62c5c..0547f6646e 100644 --- a/scripts/benchmarks/benches/oneapi.py +++ b/scripts/benchmarks/benches/oneapi.py @@ -11,7 +11,7 @@ class OneAPI: # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 98765 + ONEAPI_BENCHMARK_INSTANCE_ID = 987654 def __init__(self): self.oneapi_dir = os.path.join(options.workdir, "oneapi") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 0b932a888d..69237c46f6 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -262,9 +262,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): compare_names.append(saved_name) if options.output_html: - html_content = generate_html( - history.runs, "oneapi-src/unified-runtime", compare_names - ) + html_content = generate_html(history.runs, "intel/llvm", compare_names) with open("benchmark_results.html", "w") as file: file.write(html_content) diff --git a/scripts/benchmarks/options.py b/scripts/benchmarks/options.py index 394766b605..2e92675264 100644 --- a/scripts/benchmarks/options.py +++ b/scripts/benchmarks/options.py @@ -37,7 +37,7 @@ class Options: build_compute_runtime: bool = False extra_ld_libraries: list[str] = field(default_factory=list) extra_env_vars: dict = field(default_factory=dict) - compute_runtime_tag: str = "24.52.32224.10" + compute_runtime_tag: str = "25.05.32567.12" build_igc: bool = False current_run_name: str = "This PR" diff --git a/scripts/benchmarks/requirements.txt b/scripts/benchmarks/requirements.txt new file mode 100644 index 0000000000..99ba0caab5 --- /dev/null +++ b/scripts/benchmarks/requirements.txt @@ -0,0 +1,4 @@ +matplotlib==3.9.2 +mpld3==0.5.10 +dataclasses-json==0.6.7 +PyYAML==6.0.1 From dd7d5c6256f7ec90be9753e6e5a24d06a6b36fd6 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 21 Feb 2025 12:06:17 +0100 Subject: [PATCH 13/13] Use UMF Proxy pool manager with UMF CUDA memory provider in UR (#17015) Use UMF Proxy pool manager with UMF CUDA memory provider in UR. 
UMF Proxy pool manager is just a wrapper for the UMF memory provider (CUDA memory provider in this case) plus it adds also tracking of memory allocations. Moved from: https://github.com/oneapi-src/unified-runtime/pull/2659 Signed-off-by: Lukasz Dorau --- source/adapters/cuda/common.hpp | 38 +++++++ source/adapters/cuda/context.hpp | 33 ++++-- source/adapters/cuda/device.hpp | 17 +++- source/adapters/cuda/memory.cpp | 8 +- source/adapters/cuda/memory.hpp | 4 +- source/adapters/cuda/platform.cpp | 40 ++++++-- source/adapters/cuda/usm.cpp | 161 ++++-------------------------- source/adapters/cuda/usm.hpp | 74 -------------- source/common/umf_helpers.hpp | 54 ++++------ 9 files changed, 157 insertions(+), 272 deletions(-) diff --git a/source/adapters/cuda/common.hpp b/source/adapters/cuda/common.hpp index 67223c45bc..23189ce91f 100644 --- a/source/adapters/cuda/common.hpp +++ b/source/adapters/cuda/common.hpp @@ -12,6 +12,17 @@ #include #include +#include +#include + +#define UMF_RETURN_UMF_ERROR(UmfResult) \ + do { \ + umf_result_t UmfResult_ = (UmfResult); \ + if (UmfResult_ != UMF_RESULT_SUCCESS) { \ + return UmfResult_; \ + } \ + } while (0) + ur_result_t mapErrorUR(CUresult Result); /// Converts CUDA error into UR error codes, and outputs error information @@ -59,3 +70,30 @@ void assertion(bool Condition, const char *Message = nullptr); } // namespace ur } // namespace detail + +namespace umf { + +using cuda_params_unique_handle_t = std::unique_ptr< + umf_cuda_memory_provider_params_t, + std::function>; + +inline umf_result_t setCUMemoryProviderParams( + umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, + int cuDevice, void *cuContext, umf_usm_memory_type_t memType) { + + umf_result_t UmfResult = + umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); + UMF_RETURN_UMF_ERROR(UmfResult); + + return UMF_RESULT_SUCCESS; +} + +} // namespace umf diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp index 96a1464a87..e84b4b7f7a 100644 --- a/source/adapters/cuda/context.hpp +++ b/source/adapters/cuda/context.hpp @@ -77,8 +77,9 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// static ur_result_t -CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, - umf_memory_provider_handle_t *MemoryProviderHost) { +CreateHostMemoryProviderPool(ur_device_handle_t_ *DeviceHandle, + umf_memory_provider_handle_t *MemoryProviderHost, + umf_memory_pool_handle_t *MemoryPoolHost) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; *MemoryProviderHost = nullptr; @@ -91,10 +92,20 @@ CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, umf::cuda_params_unique_handle_t CUMemoryProviderParamsUnique( CUMemoryProviderParams, umfCUDAMemoryProviderParamsDestroy); - // create UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), 0 /* cuDevice */, context, - UMF_MEMORY_TYPE_HOST, MemoryProviderHost); + UmfResult = umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + 0 /* cuDevice */, context, + UMF_MEMORY_TYPE_HOST); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory provider and pool for the host memory + // (UMF_MEMORY_TYPE_HOST) + 
UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + MemoryProviderHost); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfPoolCreate(umfProxyPoolOps(), *MemoryProviderHost, nullptr, 0, + MemoryPoolHost); UMF_RETURN_UR_ERROR(UmfResult); return UR_RESULT_SUCCESS; @@ -112,8 +123,10 @@ struct ur_context_handle_t_ { std::vector Devices; std::atomic_uint32_t RefCount; - // UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) + // UMF CUDA memory provider and pool for the host memory + // (UMF_MEMORY_TYPE_HOST) umf_memory_provider_handle_t MemoryProviderHost = nullptr; + umf_memory_pool_handle_t MemoryPoolHost = nullptr; ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices) : Devices{Devs, Devs + NumDevices}, RefCount{1} { @@ -124,10 +137,14 @@ struct ur_context_handle_t_ { // Create UMF CUDA memory provider for the host memory // (UMF_MEMORY_TYPE_HOST) from any device (Devices[0] is used here, because // it is guaranteed to exist). - UR_CHECK_ERROR(CreateHostMemoryProvider(Devices[0], &MemoryProviderHost)); + UR_CHECK_ERROR(CreateHostMemoryProviderPool(Devices[0], &MemoryProviderHost, + &MemoryPoolHost)); }; ~ur_context_handle_t_() { + if (MemoryPoolHost) { + umfPoolDestroy(MemoryPoolHost); + } if (MemoryProviderHost) { umfMemoryProviderDestroy(MemoryProviderHost); } diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index eaf4ba6765..e94291367b 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -11,6 +11,7 @@ #include +#include #include #include "common.hpp" @@ -84,9 +85,17 @@ struct ur_device_handle_t_ { MemoryProviderDevice = nullptr; MemoryProviderShared = nullptr; + MemoryPoolDevice = nullptr; + MemoryPoolShared = nullptr; } ~ur_device_handle_t_() { + if (MemoryPoolDevice) { + umfPoolDestroy(MemoryPoolDevice); + } + if (MemoryPoolShared) { + umfPoolDestroy(MemoryPoolShared); + } if (MemoryProviderDevice) { umfMemoryProviderDestroy(MemoryProviderDevice); } @@ -131,11 +140,15 @@ struct ur_device_handle_t_ { // bookkeeping for mipmappedArray leaks in Mapping external Memory std::map ChildCuarrayFromMipmapMap; - // UMF CUDA memory provider for the device memory (UMF_MEMORY_TYPE_DEVICE) + // UMF CUDA memory provider and pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) umf_memory_provider_handle_t MemoryProviderDevice; + umf_memory_pool_handle_t MemoryPoolDevice; - // UMF CUDA memory provider for the shared memory (UMF_MEMORY_TYPE_SHARED) + // UMF CUDA memory provider and pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) umf_memory_provider_handle_t MemoryProviderShared; + umf_memory_pool_handle_t MemoryPoolShared; }; int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 651fe0f43d..6e68275f3a 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -50,8 +50,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, size, - 0, &HostPtr)); + HostPtr = umfPoolMalloc(hContext->MemoryPoolHost, size); + UMF_CHECK_PTR(HostPtr); AllocMode = BufferMem::AllocMode::AllocHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { AllocMode = 
BufferMem::AllocMode::CopyIn; @@ -442,8 +442,8 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, CU_MEMHOSTALLOC_DEVICEMAP)); UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); } else { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hDevice->MemoryProviderDevice, - Buffer.Size, 0, (void **)&DevPtr)); + *(void **)&DevPtr = umfPoolMalloc(hDevice->MemoryPoolDevice, Buffer.Size); + UMF_CHECK_PTR(*(void **)&DevPtr); } } else { CUarray ImageArray{}; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 6dcaa28414..f0fc14f864 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -158,7 +158,7 @@ struct BufferMem { case AllocMode::Classic: for (auto &DevPtr : Ptrs) { if (DevPtr != native_type{0}) { - UR_CHECK_ERROR(cuMemFree(DevPtr)); + UMF_CHECK_ERROR(umfFree((void *)DevPtr)); } } break; @@ -166,7 +166,7 @@ struct BufferMem { UR_CHECK_ERROR(cuMemHostUnregister(HostPtr)); break; case AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(HostPtr)); + UMF_CHECK_ERROR(umfFree((void *)HostPtr)); } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index d53a027160..ac66c39afb 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -20,7 +20,7 @@ #include static ur_result_t -CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { +CreateDeviceMemoryProvidersPools(ur_platform_handle_t_ *Platform) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; umf_result_t UmfResult = @@ -37,16 +37,40 @@ CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { // create UMF CUDA memory provider for the device memory // (UMF_MEMORY_TYPE_DEVICE) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_DEVICE, &device_handle->MemoryProviderDevice); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_DEVICE); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderDevice); UMF_RETURN_UR_ERROR(UmfResult); // create UMF CUDA memory provider for the shared memory // (UMF_MEMORY_TYPE_SHARED) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_SHARED, &device_handle->MemoryProviderShared); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_SHARED); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderShared); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderDevice, + nullptr, 0, &device_handle->MemoryPoolDevice); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderShared, + nullptr, 0, &device_handle->MemoryPoolShared); UMF_RETURN_UR_ERROR(UmfResult); } @@ -134,7 +158,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, static_cast(i)}); } - UR_CHECK_ERROR(CreateDeviceMemoryProviders(&Platform)); + 
UR_CHECK_ERROR(CreateDeviceMemoryProvidersPools(&Platform)); } catch (const std::bad_alloc &) { // Signal out-of-memory situation for (int i = 0; i < NumDevices; ++i) { diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index e40927b7a8..5d2d43442d 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -102,54 +102,12 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t hContext, void *Pointer) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { - unsigned int IsManaged; - unsigned int Type; - unsigned int DeviceOrdinal; - const int NumAttributes = 3; - void *AttributeValues[NumAttributes] = {&IsManaged, &Type, &DeviceOrdinal}; - - CUpointer_attribute Attributes[NumAttributes] = { - CU_POINTER_ATTRIBUTE_IS_MANAGED, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL}; - UR_CHECK_ERROR(cuPointerGetAttributes( - NumAttributes, Attributes, AttributeValues, (CUdeviceptr)Pointer)); - UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - std::vector ContextDevices = hContext->getDevices(); - ur_platform_handle_t Platform = ContextDevices[0]->getPlatform(); - unsigned int NumDevices = Platform->Devices.size(); - UR_ASSERT(DeviceOrdinal < NumDevices, UR_RESULT_ERROR_INVALID_DEVICE); - - ur_device_handle_t Device = Platform->Devices[DeviceOrdinal].get(); - umf_memory_provider_handle_t MemoryProvider; - - if (IsManaged) { - MemoryProvider = Device->MemoryProviderShared; - } else if (Type == CU_MEMORYTYPE_DEVICE) { - MemoryProvider = Device->MemoryProviderDevice; - } else { - MemoryProvider = hContext->MemoryProviderHost; - } - - UMF_CHECK_ERROR(umfMemoryProviderFree(MemoryProvider, Pointer, - 0 /* size is unknown */)); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - /// USM: Frees the given USM pointer associated with the context. 
/// UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - if (auto Pool = umfPoolByPtr(pMem)) - return umf::umf2urResult(umfPoolFree(Pool, pMem)); - return USMFreeImpl(hContext, pMem); + (void)hContext; // unused + return umf::umf2urResult(umfFree(pMem)); } ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, @@ -158,8 +116,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderDevice, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolDevice, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -180,8 +138,8 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderShared, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolShared, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -199,8 +157,8 @@ ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t hContext, ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(hContext->MemoryPoolHost, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -326,73 +284,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -umf_result_t USMMemoryProvider::initialize(ur_context_handle_t Ctx, - ur_device_handle_t Dev) { - Context = Ctx; - Device = Dev; - // There isn't a way to query this in cuda, and there isn't much info on - // cuda's approach to alignment or transfer granularity between host and - // device. Within UMF this is only used to influence alignment, and since we - // discard that in our alloc implementations it seems we can safely ignore - // this as well, for now. 
- MinPageSize = 0; - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align, - void **Ptr) { - auto Res = allocateImpl(Ptr, Size, Align); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::free(void *Ptr, size_t Size) { - (void)Size; - - auto Res = USMFreeImpl(Context, Ptr); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -void USMMemoryProvider::get_last_native_error(const char **ErrMsg, - int32_t *ErrCode) { - (void)ErrMsg; - *ErrCode = static_cast(getLastStatusRef()); -} - -umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { - (void)Ptr; - *PageSize = MinPageSize; - - return UMF_RESULT_SUCCESS; -} - -ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, - /*device flags*/ 0, Size, Alignment); -} - -ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, - Alignment); -} - -ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); -} - ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, ur_usm_pool_desc_t *PoolDesc) : Context{Context} { @@ -416,36 +307,28 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, pNext = BaseDesc->pNext; } - auto MemProvider = - umf::memoryProviderMakeUnique(Context, nullptr) - .second; - auto UmfHostParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]); - HostMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfHostParamsHandle.get()) - .second; + HostMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Context->MemoryProviderHost, + UmfHostParamsHandle.get()) + .second; for (const auto &Device : Context->getDevices()) { - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; auto UmfDeviceParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]); - DeviceMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfDeviceParamsHandle.get()) - .second; - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; + DeviceMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderDevice, + UmfDeviceParamsHandle.get()) + .second; + auto UmfSharedParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]); - SharedMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfSharedParamsHandle.get()) - .second; + SharedMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderShared, + UmfSharedParamsHandle.get()) + .second; + Context->addPool(this); } } diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index 7c6a2ea666..8258043d2b 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -48,80 +48,6 @@ class UsmAllocationException { 
ur_result_t getError() const { return Error; } }; -// Implements memory allocation via driver API for USM allocator interface. -class USMMemoryProvider { -private: - ur_result_t &getLastStatusRef() { - static thread_local ur_result_t LastStatus = UR_RESULT_SUCCESS; - return LastStatus; - } - -protected: - ur_context_handle_t Context; - ur_device_handle_t Device; - size_t MinPageSize; - - // Internal allocation routine which must be implemented for each allocation - // type - virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) = 0; - -public: - umf_result_t initialize(ur_context_handle_t Ctx, ur_device_handle_t Dev); - umf_result_t alloc(size_t Size, size_t Align, void **Ptr); - umf_result_t free(void *Ptr, size_t Size); - void get_last_native_error(const char **ErrMsg, int32_t *ErrCode); - umf_result_t get_min_page_size(void *, size_t *); - umf_result_t get_recommended_page_size(size_t, size_t *) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_lazy(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_force(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t allocation_merge(void *, void *, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - umf_result_t allocation_split(void *, size_t, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - virtual const char *get_name() = 0; - - virtual ~USMMemoryProvider() = default; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for device memory type -class USMDeviceMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for host memory type -class USMHostMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, ur_usm_device_mem_flags_t Flags, size_t Size, diff --git a/source/common/umf_helpers.hpp b/source/common/umf_helpers.hpp index 2433560a39..12432bd933 100644 --- a/source/common/umf_helpers.hpp +++ b/source/common/umf_helpers.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "logger/ur_logger.hpp" @@ -30,11 +30,10 @@ #define UMF_CHECK_ERROR(UmfResult) UR_CHECK_ERROR(umf::umf2urResult(UmfResult)); -#define UMF_RETURN_UMF_ERROR(UmfResult) \ +#define UMF_CHECK_PTR(ptr) \ do { \ - umf_result_t UmfResult_ = (UmfResult); \ - if (UmfResult_ != UMF_RESULT_SUCCESS) { \ - return UmfResult_; \ + if ((ptr) == nullptr) { \ + UR_CHECK_ERROR(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY); \ } \ } while (0) @@ -54,9 +53,6 @@ using pool_unique_handle_t = using provider_unique_handle_t = std::unique_ptr>; -using cuda_params_unique_handle_t = std::unique_ptr< - umf_cuda_memory_provider_params_t, - std::function>; #define DEFINE_CHECK_OP(op) \ template class HAS_OP_##op { \ @@ -243,6 +239,21 @@ static inline auto poolMakeUniqueFromOps(umf_memory_pool_ops_t *ops, 
UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; } +static inline auto +poolMakeUniqueFromOpsProviderHandle(umf_memory_pool_ops_t *ops, + umf_memory_provider_handle_t provider, + void *params) { + umf_memory_pool_handle_t hPool; + auto ret = umfPoolCreate(ops, provider, params, 0, &hPool); + if (ret != UMF_RESULT_SUCCESS) { + return std::pair{ + ret, pool_unique_handle_t(nullptr, nullptr)}; + } + + return std::pair{ + UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; +} + static inline auto providerMakeUniqueFromOps(umf_memory_provider_ops_t *ops, void *params) { umf_memory_provider_handle_t hProvider; @@ -301,33 +312,6 @@ inline ur_result_t umf2urResult(umf_result_t umfResult) { }; } -inline umf_result_t createMemoryProvider( - umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, - int cuDevice, void *cuContext, umf_usm_memory_type_t memType, - umf_memory_provider_handle_t *provider) { - - umf_result_t UmfResult = - umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); - UMF_RETURN_UMF_ERROR(UmfResult); - - UmfResult = - umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); - UMF_RETURN_UMF_ERROR(UmfResult); - - UmfResult = - umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); - UMF_RETURN_UMF_ERROR(UmfResult); - - umf_memory_provider_handle_t umfCUDAprovider = nullptr; - UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), - CUMemoryProviderParams, &umfCUDAprovider); - UMF_RETURN_UMF_ERROR(UmfResult); - - *provider = umfCUDAprovider; - - return UMF_RESULT_SUCCESS; -} - } // namespace umf #endif /* UMF_HELPERS_H */
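Taken together, the allocation path this final patch switches to reduces to
the following condensed sketch. It is assembled from the UMF calls visible in
the diffs above plus the params-create call they imply; error handling is
elided, and `cuDevice`/`cuContext` stand in for a valid CUDA device ordinal
and context:

```cpp
// Provider params -> CUDA memory provider -> proxy pool.
umf_cuda_memory_provider_params_handle_t params = nullptr;
umfCUDAMemoryProviderParamsCreate(&params);
umfCUDAMemoryProviderParamsSetContext(params, cuContext);
umfCUDAMemoryProviderParamsSetDevice(params, cuDevice);
umfCUDAMemoryProviderParamsSetMemoryType(params, UMF_MEMORY_TYPE_DEVICE);

umf_memory_provider_handle_t provider = nullptr;
umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), params, &provider);
umfCUDAMemoryProviderParamsDestroy(params);

// The proxy pool is a thin pass-through that adds allocation tracking,
// which is what lets urUSMFree collapse to a plain umfFree(pMem).
umf_memory_pool_handle_t pool = nullptr;
umfPoolCreate(umfProxyPoolOps(), provider, nullptr, 0, &pool);

void *ptr = umfPoolMalloc(pool, 1024); // device USM allocation
umfFree(ptr); // tracked free: UMF looks up the owning pool itself

umfPoolDestroy(pool);
umfMemoryProviderDestroy(provider);
```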