diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index bdd4a6d15c..ec945ba2c2 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -157,11 +157,6 @@ function(add_ur_target_link_options name) if (UR_DEVELOPER_MODE) target_link_options(${name} PRIVATE -Werror -Wextra) endif() - if (CMAKE_BUILD_TYPE STREQUAL "Release") - target_link_options(${name} PRIVATE - $<$:-pie> - ) - endif() endif() elseif(MSVC) target_link_options(${name} PRIVATE @@ -176,7 +171,15 @@ function(add_ur_target_link_options name) endfunction() function(add_ur_target_exec_options name) - if(MSVC) + if(NOT MSVC) + if(NOT APPLE) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + target_link_options(${name} PRIVATE + $<$:-pie> + ) + endif() + endif() + elseif(MSVC) target_link_options(${name} PRIVATE LINKER:/ALLOWISOLATION ) diff --git a/include/ur_api.h b/include/ur_api.h index d7621bda32..c390ed4410 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7105,21 +7105,21 @@ typedef enum ur_event_info_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Profiling query information type typedef enum ur_profiling_info_t { - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is enqueued + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is enqueued UR_PROFILING_INFO_COMMAND_QUEUED = 0, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is submitted + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is submitted UR_PROFILING_INFO_COMMAND_SUBMIT = 1, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event starts execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event starts execution UR_PROFILING_INFO_COMMAND_START = 2, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event has finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event has finished execution UR_PROFILING_INFO_COMMAND_END = 3, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event and any child events enqueued by this event on the - /// device have finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event and any child events enqueued by this event + /// on the device have finished execution UR_PROFILING_INFO_COMMAND_COMPLETE = 4, /// @cond UR_PROFILING_INFO_FORCE_UINT32 = 0x7fffffff @@ -7193,6 +7193,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. 
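With the profiling counters now documented as `[optional-query]`, a caller should treat `UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION` from the function below as "counter unavailable" rather than a hard failure. A minimal caller-side sketch — the helper name is ours; only `urEventGetProfilingInfo` and the result codes come from the API above:

```cpp
#include <optional>
#include <ur_api.h>

// Probe one optional profiling counter; nullopt means the adapter reported
// UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION (or another failure) for propName.
std::optional<uint64_t> tryQueryCounter(ur_event_handle_t hEvent,
                                        ur_profiling_info_t propName) {
  uint64_t value = 0;
  ur_result_t res = urEventGetProfilingInfo(hEvent, propName, sizeof(value),
                                            &value, /*pPropSizeRet=*/nullptr);
  if (res == UR_RESULT_SUCCESS)
    return value;
  return std::nullopt; // optional query not provided by this adapter
}
```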
UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, @@ -10111,6 +10113,7 @@ typedef struct ur_exp_command_buffer_command_handle_t_ /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -10125,7 +10128,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer); diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py index b6b3f62c5c..0547f6646e 100644 --- a/scripts/benchmarks/benches/oneapi.py +++ b/scripts/benchmarks/benches/oneapi.py @@ -11,7 +11,7 @@ class OneAPI: # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 98765 + ONEAPI_BENCHMARK_INSTANCE_ID = 987654 def __init__(self): self.oneapi_dir = os.path.join(options.workdir, "oneapi") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 0b932a888d..69237c46f6 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -262,9 +262,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): compare_names.append(saved_name) if options.output_html: - html_content = generate_html( - history.runs, "oneapi-src/unified-runtime", compare_names - ) + html_content = generate_html(history.runs, "intel/llvm", compare_names) with open("benchmark_results.html", "w") as file: file.write(html_content) diff --git a/scripts/benchmarks/options.py b/scripts/benchmarks/options.py index 394766b605..2e92675264 100644 --- a/scripts/benchmarks/options.py +++ b/scripts/benchmarks/options.py @@ -37,7 +37,7 @@ class Options: build_compute_runtime: bool = False extra_ld_libraries: list[str] = field(default_factory=list) extra_env_vars: dict = field(default_factory=dict) - compute_runtime_tag: str = "24.52.32224.10" + compute_runtime_tag: str = "25.05.32567.12" build_igc: bool = False current_run_name: str = "This PR" diff --git a/scripts/benchmarks/requirements.txt b/scripts/benchmarks/requirements.txt new file mode 100644 index 0000000000..99ba0caab5 --- /dev/null +++ b/scripts/benchmarks/requirements.txt @@ -0,0 +1,4 @@ +matplotlib==3.9.2 +mpld3==0.5.10 +dataclasses-json==0.6.7 +PyYAML==6.0.1 diff --git a/scripts/core/event.yml b/scripts/core/event.yml index da2fa457de..1b0eeca23e 100644 --- a/scripts/core/event.yml +++ b/scripts/core/event.yml @@ -121,15 +121,15 @@ name: $x_profiling_info_t typed_etors: True etors: - name: COMMAND_QUEUED - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is enqueued" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is enqueued" - name: COMMAND_SUBMIT - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is submitted" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is submitted" - name: COMMAND_START - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event starts execution" + desc: 
"[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event starts execution" - name: COMMAND_END - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event has finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event has finished execution" - name: COMMAND_COMPLETE - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" --- #-------------------------------------------------------------------------- type: function desc: "Get event object information" @@ -198,6 +198,8 @@ returns: - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_OUT_OF_RESOURCES - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_UNSUPPORTED_ENUMERATION: + - "If `propName` is not supported by the adapter." --- #-------------------------------------------------------------------------- type: function desc: "Wait for a list of events to finish." diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 218c626423..680bc60f8d 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -282,7 +282,7 @@ params: desc: "[in] Handle of the device object." - type: "const $x_exp_command_buffer_desc_t*" name: pCommandBufferDesc - desc: "[in][optional] command-buffer descriptor." + desc: "[in] Command-buffer descriptor." - type: "$x_exp_command_buffer_handle_t*" name: phCommandBuffer desc: "[out][alloc] Pointer to command-Buffer handle." diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 0d274e6c15..42ec8dbafc 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -352,10 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - - const bool IsUpdatable = - pCommandBufferDesc ? 
pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/cuda/common.hpp b/source/adapters/cuda/common.hpp index 67223c45bc..23189ce91f 100644 --- a/source/adapters/cuda/common.hpp +++ b/source/adapters/cuda/common.hpp @@ -12,6 +12,17 @@ #include #include +#include +#include + +#define UMF_RETURN_UMF_ERROR(UmfResult) \ + do { \ + umf_result_t UmfResult_ = (UmfResult); \ + if (UmfResult_ != UMF_RESULT_SUCCESS) { \ + return UmfResult_; \ + } \ + } while (0) + ur_result_t mapErrorUR(CUresult Result); /// Converts CUDA error into UR error codes, and outputs error information @@ -59,3 +70,30 @@ void assertion(bool Condition, const char *Message = nullptr); } // namespace ur } // namespace detail + +namespace umf { + +using cuda_params_unique_handle_t = std::unique_ptr< + umf_cuda_memory_provider_params_t, + std::function>; + +inline umf_result_t setCUMemoryProviderParams( + umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, + int cuDevice, void *cuContext, umf_usm_memory_type_t memType) { + + umf_result_t UmfResult = + umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); + UMF_RETURN_UMF_ERROR(UmfResult); + + return UMF_RESULT_SUCCESS; +} + +} // namespace umf diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp index 96a1464a87..e84b4b7f7a 100644 --- a/source/adapters/cuda/context.hpp +++ b/source/adapters/cuda/context.hpp @@ -77,8 +77,9 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// static ur_result_t -CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, - umf_memory_provider_handle_t *MemoryProviderHost) { +CreateHostMemoryProviderPool(ur_device_handle_t_ *DeviceHandle, + umf_memory_provider_handle_t *MemoryProviderHost, + umf_memory_pool_handle_t *MemoryPoolHost) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; *MemoryProviderHost = nullptr; @@ -91,10 +92,20 @@ CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, umf::cuda_params_unique_handle_t CUMemoryProviderParamsUnique( CUMemoryProviderParams, umfCUDAMemoryProviderParamsDestroy); - // create UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), 0 /* cuDevice */, context, - UMF_MEMORY_TYPE_HOST, MemoryProviderHost); + UmfResult = umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + 0 /* cuDevice */, context, + UMF_MEMORY_TYPE_HOST); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory provider and pool for the host memory + // (UMF_MEMORY_TYPE_HOST) + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + MemoryProviderHost); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfPoolCreate(umfProxyPoolOps(), *MemoryProviderHost, nullptr, 0, + MemoryPoolHost); UMF_RETURN_UR_ERROR(UmfResult); return UR_RESULT_SUCCESS; @@ -112,8 +123,10 @@ struct ur_context_handle_t_ { std::vector Devices; std::atomic_uint32_t RefCount; - // UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) + // UMF CUDA memory provider and pool for 
the host memory + // (UMF_MEMORY_TYPE_HOST) umf_memory_provider_handle_t MemoryProviderHost = nullptr; + umf_memory_pool_handle_t MemoryPoolHost = nullptr; ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices) : Devices{Devs, Devs + NumDevices}, RefCount{1} { @@ -124,10 +137,14 @@ struct ur_context_handle_t_ { // Create UMF CUDA memory provider for the host memory // (UMF_MEMORY_TYPE_HOST) from any device (Devices[0] is used here, because // it is guaranteed to exist). - UR_CHECK_ERROR(CreateHostMemoryProvider(Devices[0], &MemoryProviderHost)); + UR_CHECK_ERROR(CreateHostMemoryProviderPool(Devices[0], &MemoryProviderHost, + &MemoryPoolHost)); }; ~ur_context_handle_t_() { + if (MemoryPoolHost) { + umfPoolDestroy(MemoryPoolHost); + } if (MemoryProviderHost) { umfMemoryProviderDestroy(MemoryProviderHost); } diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index eaf4ba6765..e94291367b 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -11,6 +11,7 @@ #include +#include #include #include "common.hpp" @@ -84,9 +85,17 @@ struct ur_device_handle_t_ { MemoryProviderDevice = nullptr; MemoryProviderShared = nullptr; + MemoryPoolDevice = nullptr; + MemoryPoolShared = nullptr; } ~ur_device_handle_t_() { + if (MemoryPoolDevice) { + umfPoolDestroy(MemoryPoolDevice); + } + if (MemoryPoolShared) { + umfPoolDestroy(MemoryPoolShared); + } if (MemoryProviderDevice) { umfMemoryProviderDestroy(MemoryProviderDevice); } @@ -131,11 +140,15 @@ struct ur_device_handle_t_ { // bookkeeping for mipmappedArray leaks in Mapping external Memory std::map ChildCuarrayFromMipmapMap; - // UMF CUDA memory provider for the device memory (UMF_MEMORY_TYPE_DEVICE) + // UMF CUDA memory provider and pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) umf_memory_provider_handle_t MemoryProviderDevice; + umf_memory_pool_handle_t MemoryPoolDevice; - // UMF CUDA memory provider for the shared memory (UMF_MEMORY_TYPE_SHARED) + // UMF CUDA memory provider and pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) umf_memory_provider_handle_t MemoryProviderShared; + umf_memory_pool_handle_t MemoryPoolShared; }; int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index a6c2208e8f..37792bf5b2 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -213,6 +213,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 651fe0f43d..6e68275f3a 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -50,8 +50,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, size, - 0, &HostPtr)); + HostPtr = umfPoolMalloc(hContext->MemoryPoolHost, size); + UMF_CHECK_PTR(HostPtr); AllocMode = BufferMem::AllocMode::AllocHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { AllocMode = BufferMem::AllocMode::CopyIn; @@ 
-442,8 +442,8 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, CU_MEMHOSTALLOC_DEVICEMAP)); UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); } else { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hDevice->MemoryProviderDevice, - Buffer.Size, 0, (void **)&DevPtr)); + *(void **)&DevPtr = umfPoolMalloc(hDevice->MemoryPoolDevice, Buffer.Size); + UMF_CHECK_PTR(*(void **)&DevPtr); } } else { CUarray ImageArray{}; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 6dcaa28414..f0fc14f864 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -158,7 +158,7 @@ struct BufferMem { case AllocMode::Classic: for (auto &DevPtr : Ptrs) { if (DevPtr != native_type{0}) { - UR_CHECK_ERROR(cuMemFree(DevPtr)); + UMF_CHECK_ERROR(umfFree((void *)DevPtr)); } } break; @@ -166,7 +166,7 @@ struct BufferMem { UR_CHECK_ERROR(cuMemHostUnregister(HostPtr)); break; case AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(HostPtr)); + UMF_CHECK_ERROR(umfFree((void *)HostPtr)); } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index d53a027160..ac66c39afb 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -20,7 +20,7 @@ #include static ur_result_t -CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { +CreateDeviceMemoryProvidersPools(ur_platform_handle_t_ *Platform) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; umf_result_t UmfResult = @@ -37,16 +37,40 @@ CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { // create UMF CUDA memory provider for the device memory // (UMF_MEMORY_TYPE_DEVICE) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_DEVICE, &device_handle->MemoryProviderDevice); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_DEVICE); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderDevice); UMF_RETURN_UR_ERROR(UmfResult); // create UMF CUDA memory provider for the shared memory // (UMF_MEMORY_TYPE_SHARED) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_SHARED, &device_handle->MemoryProviderShared); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_SHARED); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderShared); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderDevice, + nullptr, 0, &device_handle->MemoryPoolDevice); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderShared, + nullptr, 0, &device_handle->MemoryPoolShared); UMF_RETURN_UR_ERROR(UmfResult); } @@ -134,7 +158,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, static_cast(i)}); } - UR_CHECK_ERROR(CreateDeviceMemoryProviders(&Platform)); + 
UR_CHECK_ERROR(CreateDeviceMemoryProvidersPools(&Platform)); } catch (const std::bad_alloc &) { // Signal out-of-memory situation for (int i = 0; i < NumDevices; ++i) { diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index e40927b7a8..5d2d43442d 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -102,54 +102,12 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t hContext, void *Pointer) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { - unsigned int IsManaged; - unsigned int Type; - unsigned int DeviceOrdinal; - const int NumAttributes = 3; - void *AttributeValues[NumAttributes] = {&IsManaged, &Type, &DeviceOrdinal}; - - CUpointer_attribute Attributes[NumAttributes] = { - CU_POINTER_ATTRIBUTE_IS_MANAGED, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL}; - UR_CHECK_ERROR(cuPointerGetAttributes( - NumAttributes, Attributes, AttributeValues, (CUdeviceptr)Pointer)); - UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - std::vector ContextDevices = hContext->getDevices(); - ur_platform_handle_t Platform = ContextDevices[0]->getPlatform(); - unsigned int NumDevices = Platform->Devices.size(); - UR_ASSERT(DeviceOrdinal < NumDevices, UR_RESULT_ERROR_INVALID_DEVICE); - - ur_device_handle_t Device = Platform->Devices[DeviceOrdinal].get(); - umf_memory_provider_handle_t MemoryProvider; - - if (IsManaged) { - MemoryProvider = Device->MemoryProviderShared; - } else if (Type == CU_MEMORYTYPE_DEVICE) { - MemoryProvider = Device->MemoryProviderDevice; - } else { - MemoryProvider = hContext->MemoryProviderHost; - } - - UMF_CHECK_ERROR(umfMemoryProviderFree(MemoryProvider, Pointer, - 0 /* size is unknown */)); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - /// USM: Frees the given USM pointer associated with the context. 
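The deletion of `USMFreeImpl` below works because UMF tracks every allocation made through its pools (the old fast path already used `umfPoolByPtr`), so `umfFree` can resolve the owning pool from the pointer alone and the `cuPointerGetAttributes` round-trip to pick a provider is no longer needed. A sketch of the resulting free path, with the adapter's `umf::umf2urResult` mapping approximated by a single error code:

```cpp
#include <umf/memory_pool.h> // declares umfFree() in current UMF releases
#include <ur_api.h>

ur_result_t freeUSMPointer(void *pMem) {
  // UMF resolves the owning pool internally (proxy and disjoint pools alike),
  // so no device/context lookup is required here.
  umf_result_t res = umfFree(pMem);
  return res == UMF_RESULT_SUCCESS ? UR_RESULT_SUCCESS
                                   : UR_RESULT_ERROR_INVALID_MEM_OBJECT;
}
```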
/// UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - if (auto Pool = umfPoolByPtr(pMem)) - return umf::umf2urResult(umfPoolFree(Pool, pMem)); - return USMFreeImpl(hContext, pMem); + (void)hContext; // unused + return umf::umf2urResult(umfFree(pMem)); } ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, @@ -158,8 +116,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderDevice, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolDevice, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -180,8 +138,8 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderShared, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolShared, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -199,8 +157,8 @@ ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t hContext, ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(hContext->MemoryPoolHost, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -326,73 +284,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -umf_result_t USMMemoryProvider::initialize(ur_context_handle_t Ctx, - ur_device_handle_t Dev) { - Context = Ctx; - Device = Dev; - // There isn't a way to query this in cuda, and there isn't much info on - // cuda's approach to alignment or transfer granularity between host and - // device. Within UMF this is only used to influence alignment, and since we - // discard that in our alloc implementations it seems we can safely ignore - // this as well, for now. 
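For context on what replaces the hand-written provider vtable removed below: the new code paths in context.hpp and platform.cpp build a UMF CUDA provider and wrap it in a proxy pool. Condensed into one hedged sketch — `makePool` is our name, error handling is elided, and header paths follow current UMF layout:

```cpp
#include <umf/memory_pool.h>
#include <umf/memory_provider.h>
#include <umf/pools/pool_proxy.h>
#include <umf/providers/provider_cuda.h>

umf_result_t makePool(int cuDevice, void *cuContext,
                      umf_usm_memory_type_t memType,
                      umf_memory_provider_handle_t *provider,
                      umf_memory_pool_handle_t *pool) {
  umf_cuda_memory_provider_params_handle_t params = nullptr;
  umfCUDAMemoryProviderParamsCreate(&params);
  umfCUDAMemoryProviderParamsSetContext(params, cuContext);
  umfCUDAMemoryProviderParamsSetDevice(params, cuDevice);
  umfCUDAMemoryProviderParamsSetMemoryType(params, memType);
  umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), params, provider);
  umfCUDAMemoryProviderParamsDestroy(params);
  // The proxy pool adds no caching: it forwards straight to the provider,
  // but makes allocations visible to umfPoolMalloc()/umfFree() tracking.
  return umfPoolCreate(umfProxyPoolOps(), *provider, nullptr, 0, pool);
}
```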
- MinPageSize = 0; - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align, - void **Ptr) { - auto Res = allocateImpl(Ptr, Size, Align); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::free(void *Ptr, size_t Size) { - (void)Size; - - auto Res = USMFreeImpl(Context, Ptr); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -void USMMemoryProvider::get_last_native_error(const char **ErrMsg, - int32_t *ErrCode) { - (void)ErrMsg; - *ErrCode = static_cast(getLastStatusRef()); -} - -umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { - (void)Ptr; - *PageSize = MinPageSize; - - return UMF_RESULT_SUCCESS; -} - -ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, - /*device flags*/ 0, Size, Alignment); -} - -ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, - Alignment); -} - -ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); -} - ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, ur_usm_pool_desc_t *PoolDesc) : Context{Context} { @@ -416,36 +307,28 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, pNext = BaseDesc->pNext; } - auto MemProvider = - umf::memoryProviderMakeUnique(Context, nullptr) - .second; - auto UmfHostParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]); - HostMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfHostParamsHandle.get()) - .second; + HostMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Context->MemoryProviderHost, + UmfHostParamsHandle.get()) + .second; for (const auto &Device : Context->getDevices()) { - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; auto UmfDeviceParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]); - DeviceMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfDeviceParamsHandle.get()) - .second; - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; + DeviceMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderDevice, + UmfDeviceParamsHandle.get()) + .second; + auto UmfSharedParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]); - SharedMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfSharedParamsHandle.get()) - .second; + SharedMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderShared, + UmfSharedParamsHandle.get()) + .second; + Context->addPool(this); } } diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index 7c6a2ea666..8258043d2b 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -48,80 +48,6 @@ class UsmAllocationException { 
ur_result_t getError() const { return Error; } }; -// Implements memory allocation via driver API for USM allocator interface. -class USMMemoryProvider { -private: - ur_result_t &getLastStatusRef() { - static thread_local ur_result_t LastStatus = UR_RESULT_SUCCESS; - return LastStatus; - } - -protected: - ur_context_handle_t Context; - ur_device_handle_t Device; - size_t MinPageSize; - - // Internal allocation routine which must be implemented for each allocation - // type - virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) = 0; - -public: - umf_result_t initialize(ur_context_handle_t Ctx, ur_device_handle_t Dev); - umf_result_t alloc(size_t Size, size_t Align, void **Ptr); - umf_result_t free(void *Ptr, size_t Size); - void get_last_native_error(const char **ErrMsg, int32_t *ErrCode); - umf_result_t get_min_page_size(void *, size_t *); - umf_result_t get_recommended_page_size(size_t, size_t *) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_lazy(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_force(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t allocation_merge(void *, void *, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - umf_result_t allocation_split(void *, size_t, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - virtual const char *get_name() = 0; - - virtual ~USMMemoryProvider() = default; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for device memory type -class USMDeviceMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for host memory type -class USMHostMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, ur_usm_device_mem_flags_t Flags, size_t Size, diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 4c65e61dab..ce07332ce8 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -233,9 +233,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - const bool IsUpdatable = - pCommandBufferDesc ? 
pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 81c839cf32..d971207a6c 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -234,6 +234,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 879ee0f1cc..4705964190 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -586,9 +586,7 @@ bool canBeInOrder(ur_context_handle_t Context, bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( 1, 3, L0_DRIVER_INORDER_MIN_VERSION); bool CanUseDriverInOrderLists = CompatibleDriver && DriverInOrderRequested; - return CanUseDriverInOrderLists - ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) - : false; + return CanUseDriverInOrderLists ? CommandBufferDesc->isInOrder : false; } /** @@ -624,9 +622,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, ur_exp_command_buffer_handle_t *CommandBuffer) { bool IsInOrder = canBeInOrder(Context, CommandBufferDesc); - bool EnableProfiling = - CommandBufferDesc && CommandBufferDesc->enableProfiling && !IsInOrder; - bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; + bool EnableProfiling = CommandBufferDesc->enableProfiling && !IsInOrder; + bool IsUpdatable = CommandBufferDesc->isUpdatable; bool ImmediateAppendPath = checkImmediateAppendSupport(Context, Device); const bool WaitEventPath = !ImmediateAppendPath; bool UseCounterBasedEvents = checkCounterBasedEventsSupport(Device) && @@ -910,17 +907,18 @@ ur_result_t setKernelPendingArguments( * @param[in] CommandBuffer The CommandBuffer associated with the new command. * @param[in] Kernel The Kernel associated with the new command. * @param[in] WorkDim Dimensions of the kernel associated with the new command. + * @param[in] GlobalWorkSize Global work size of the kernel associated with the + * new command. * @param[in] LocalWorkSize LocalWorkSize of the kernel associated with the new * command. * @param[out] Command The handle to the new command. 
* @return UR_RESULT_SUCCESS or an error code on failure */ -ur_result_t -createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, - ur_kernel_handle_t Kernel, uint32_t WorkDim, - const size_t *LocalWorkSize, uint32_t NumKernelAlternatives, - ur_kernel_handle_t *KernelAlternatives, - ur_exp_command_buffer_command_handle_t *Command) { +ur_result_t createCommandHandle( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + uint32_t WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, + ur_exp_command_buffer_command_handle_t *Command) { assert(CommandBuffer->IsUpdatable); @@ -992,6 +990,8 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, CommandBuffer, Kernel, CommandId, WorkDim, LocalWorkSize != nullptr, NumKernelAlternatives, KernelAlternatives); + NewCommand->setGlobalWorkSize(GlobalWorkSize); + *Command = NewCommand.get(); CommandBuffer->CommandHandles.push_back(std::move(NewCommand)); @@ -1066,9 +1066,9 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( } if (Command) { - UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, - NumKernelAlternatives, KernelAlternatives, - Command)); + UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, GlobalWorkSize, + LocalWorkSize, NumKernelAlternatives, + KernelAlternatives, Command)); } std::vector ZeEventList; ze_event_handle_t ZeLaunchEvent = nullptr; @@ -1922,10 +1922,16 @@ ur_result_t updateKernelCommand( Descs.push_back(std::move(MutableGroupSizeDesc)); } - // Check if a new global size is provided and if we need to update the group - // count. + // Check if a new global or local size is provided and if so we need to update + // the group count. ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; - if (NewGlobalWorkSize && Dim > 0) { + if ((NewGlobalWorkSize || NewLocalWorkSize) && Dim > 0) { + // If a new global work size is provided update that in the command, + // otherwise the previous work group size will be used + if (NewGlobalWorkSize) { + Command->WorkDim = Dim; + Command->setGlobalWorkSize(NewGlobalWorkSize); + } // If a new global work size is provided but a new local work size is not // then we still need to update local work size based on the size suggested // by the driver for the kernel. 
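Why the command handle now stores `GlobalWorkSize`: an update that supplies only a new local work size still has to recompute the Level Zero thread-group count, and that requires the global size the command was created (or last updated) with. Schematically — a standalone sketch with hypothetical names, not the adapter's `calculateKernelWorkDimensions`:

```cpp
#include <cstddef>
#include <cstdint>

struct GroupCount { uint32_t x, y, z; };

// Group count per dimension is globalSize / workGroupSize; this sketch
// assumes the global size divides evenly, as the driver-suggested sizes do.
GroupCount computeGroupCount(uint32_t dim, const size_t *global,
                             const uint32_t wg[3]) {
  GroupCount gc{1, 1, 1};
  uint32_t *out[3] = {&gc.x, &gc.y, &gc.z};
  for (uint32_t i = 0; i < dim; ++i)
    *out[i] = static_cast<uint32_t>(global[i] / wg[i]);
  return gc;
}
```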
@@ -1935,9 +1941,9 @@ ur_result_t updateKernelCommand( UR_CALL(getZeKernel(ZeDevice, Command->Kernel, &ZeKernel)); uint32_t WG[3]; - UR_CALL(calculateKernelWorkDimensions(ZeKernel, CommandBuffer->Device, - ZeThreadGroupDimensions, WG, Dim, - NewGlobalWorkSize, NewLocalWorkSize)); + UR_CALL(calculateKernelWorkDimensions( + ZeKernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, Dim, + Command->GlobalWorkSize, NewLocalWorkSize)); auto MutableGroupCountDesc = std::make_unique>(); diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index c9389eaf11..491cb2638e 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -172,8 +172,19 @@ struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ { ~kernel_command_handle(); + void setGlobalWorkSize(const size_t *GlobalWorkSizePtr) { + const size_t CopySize = sizeof(size_t) * WorkDim; + std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize); + if (WorkDim < 3) { + const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim); + std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize); + } + } + // Work-dimension the command was originally created with. uint32_t WorkDim; + // Global work size of the kernel + size_t GlobalWorkSize[3]; // Set to true if the user set the local work size on command creation. bool UserDefinedLocalSize; // Currently active kernel handle diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index e12352b6b1..cfcd1d316d 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -609,6 +609,10 @@ ur_result_t urEventGetProfilingInfo( return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -672,6 +676,10 @@ ur_result_t urEventGetProfilingInfo( ContextEndTime *= ZeTimerResolution; return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -715,6 +723,10 @@ ur_result_t urEventGetProfilingInfo( // enqueue. 
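The three Level Zero profiling-query paths touched below all gain the same shape of change: `UR_PROFILING_INFO_COMMAND_COMPLETE` gets an explicit "recognized but unsupported" arm instead of falling into the generic default. Reduced to a sketch (our helper name; `QUEUED`/`SUBMIT` arms omitted for brevity):

```cpp
#include <ur_api.h>

ur_result_t classifyProfilingQuery(ur_profiling_info_t propName) {
  switch (propName) {
  case UR_PROFILING_INFO_COMMAND_START:
  case UR_PROFILING_INFO_COMMAND_END:
    return UR_RESULT_SUCCESS; // served by the timestamp paths
  case UR_PROFILING_INFO_COMMAND_COMPLETE:
    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; // known, unsupported
  default:
    return UR_RESULT_ERROR_INVALID_VALUE; // unrecognized query
  }
}
```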
// return ReturnValue(uint64_t{0}); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: UR_PROFILING_INFO_COMMAND_COMPLETE " + "not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 8a2153e0a5..129db02594 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -170,53 +170,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferAppendUSMFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, - const void *pPattern, size_t patternSize, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendMemBufferFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - const void *pPattern, size_t patternSize, size_t offset, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMPrefetchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_migration_flags_t flags, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMAdviseExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 8253527efe..8511d818b2 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -346,6 +346,115 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( 
return exceptionToResult(std::current_exception()); } +ur_result_t urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, + const void *pPattern, size_t patternSize, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMFill( + pMemory, patternSize, pPattern, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, size_t offset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMPrefetchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMPrefetch( + pMemory, size, flags, 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMAdviseExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMAdvise(pMemory, size, + advice, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + ur_result_t urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index b1510d0a97..5c6d2330d7 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -31,6 +31,50 @@ ur_command_list_manager::~ur_command_list_manager() { ur::level_zero::urDeviceRelease(device); } +ur_result_t ur_command_list_manager::appendGenericFillUnlocked( + ur_mem_buffer_t *dst, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType) { + + auto zeSignalEvent = getSignalEvent(phEvent, commandType); + + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto pDst = ur_cast(dst->getDevicePtr( + device, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, size, nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + })); + + // PatternSize must be a power of two for zeCommandListAppendMemoryFill. + // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. + if (isPowerOf2(patternSize)) { + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (zeCommandList.get(), pDst, pPattern, patternSize, size, + zeSignalEvent, waitListView.num, waitListView.handles)); + } else { + // Copy the pattern into every entry of the memory region pointed to by + // pDst. + uint32_t numOfCopySteps = size / patternSize; + const void *src = pPattern; + + for (uint32_t step = 0; step < numOfCopySteps; ++step) { + void *dst = reinterpret_cast(reinterpret_cast(pDst) + + step * patternSize); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, patternSize, + step == numOfCopySteps - 1 ? 
zeSignalEvent : nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + } + } + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, @@ -209,6 +253,96 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( return UR_RESULT_SUCCESS; } +ur_result_t ur_command_list_manager::appendMemBufferFill( + ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); + + auto hBuffer = hMem->getBuffer(); + UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock lock(this->Mutex, + hBuffer->getMutex()); + + return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_MEM_BUFFER_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMFill( + void *pMem, size_t patternSize, const void *pPattern, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); + + std::scoped_lock lock(this->Mutex); + + ur_usm_handle_t dstHandle(context, size, pMem); + return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_USM_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMPrefetch( + const void *pMem, size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); + + std::ignore = flags; + + std::scoped_lock lock(this->Mutex); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (zeCommandList.get(), pMem, size)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); + + std::scoped_lock lock(this->Mutex); + + auto zeAdvice = ur_cast(advice); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); + + auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (zeCommandList.get(), device->ZeDevice, pMem, size, zeAdvice)); + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, 
void *pDst, uint32_t numEventsInWaitList, diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index e85d9b9049..3d3481cb22 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -99,6 +99,27 @@ struct ur_command_list_manager : public _ur_object { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent); ze_command_list_handle_t getZeCommandList(); wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents, @@ -107,6 +128,12 @@ struct ur_command_list_manager : public _ur_object { ur_command_t commandType); private: + ur_result_t appendGenericFillUnlocked( + ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType); + ur_result_t appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index d33ac12f7e..3f88161b90 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -372,16 +372,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferFill( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferFill"); - auto hBuffer = hMem->getBuffer(); - - UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - - std::scoped_lock lock(this->Mutex, - hBuffer->getMutex()); + UR_CALL(commandListManager.appendMemBufferFill( + hMem, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, phEvent)); - return enqueueGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, - size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_FILL); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageRead( @@ -550,63 +545,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType) { - auto zeSignalEvent = getSignalEvent(phEvent, commandType); - - auto waitListView = getWaitListView(phEventWaitList, 
numEventsInWaitList); - - auto pDst = ur_cast(hBuffer->getDevicePtr( - hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, - [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, - size, nullptr, waitListView.num, - waitListView.handles)); - waitListView.clear(); - })); - - // PatternSize must be a power of two for zeCommandListAppendMemoryFill. - // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. - if (isPowerOf2(patternSize)) { - ZE2UR_CALL(zeCommandListAppendMemoryFill, - (commandListManager.getZeCommandList(), pDst, pPattern, - patternSize, size, zeSignalEvent, waitListView.num, - waitListView.handles)); - } else { - // Copy pattern into every entry in memory array pointed by Ptr. - uint32_t numOfCopySteps = size / patternSize; - const void *src = pPattern; - - for (uint32_t step = 0; step < numOfCopySteps; ++step) { - void *dst = reinterpret_cast(reinterpret_cast(pDst) + - step * patternSize); - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, patternSize, - step == numOfCopySteps - 1 ? zeSignalEvent : nullptr, - waitListView.num, waitListView.handles)); - waitListView.clear(); - } - } - - return UR_RESULT_SUCCESS; -} - ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFill"); - std::scoped_lock lock(this->Mutex); - - ur_usm_handle_t dstHandle(hContext, size, pMem); - return enqueueGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_USM_FILL); + UR_CALL(commandListManager.appendUSMFill(pMem, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent)); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( @@ -628,29 +576,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMPrefetch"); - - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); - - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (commandListManager.getZeCommandList(), pMem, size)); - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } - + UR_CALL(commandListManager.appendUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent)); return UR_RESULT_SUCCESS; } @@ -660,31 +587,7 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeAdvice = ur_cast(advice); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - - auto 
[pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemAdvise, - (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, - pMem, size, zeAdvice)); - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } + UR_CALL(commandListManager.appendUSMAdvise(pMem, size, advice, phEvent)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 6d5034d07b..9a4ace593e 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -8403,7 +8403,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 1130385cfa..d1c83a4655 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -106,9 +106,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize); auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); - hKernel->updateMemPool(numParallelThreads); std::vector> futures; - std::vector> groups; + std::vector> groups; auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; @@ -119,8 +118,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto event = new ur_event_handle_t_(hQueue, UR_COMMAND_KERNEL_LAUNCH); event->tick_start(); + // Create a copy of the kernel and its arguments. + auto kernel = std::make_unique(*hKernel); + kernel->updateMemPool(numParallelThreads); + #ifndef NATIVECPU_USE_OCK - hKernel->handleLocalArgs(1, 0); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { @@ -128,7 +130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) { for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) { state.update(g0, g1, g2, local0, local1, local2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs(1, 0).data(), &state); } } } @@ -139,7 +141,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads && - !hKernel->hasLocalArgs()) { + !kernel->hasLocalArgs()) { // If the local size is one, we make the assumption that we are running a // parallel_for over a sycl::range. 
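An aside on the Level Zero fill path refactored above: zeCommandListAppendMemoryFill only accepts power-of-two pattern sizes, so for other sizes the adapter emulates the fill by copying the pattern into each patternSize-sized slot of the destination and signalling the completion event only on the last copy. Below is a minimal host-side analogue of that slot arithmetic, assuming size is a multiple of patternSize (illustration only, no Level Zero calls are made):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Emulate a fill whose pattern size is not a power of two by copying the
// pattern into every slot, mirroring the loop in the fill path above.
void emulatedFill(void *pDst, const void *pPattern, size_t patternSize,
                  size_t size) {
  assert(size % patternSize == 0 && "size must be a multiple of patternSize");
  const size_t numSteps = size / patternSize;
  for (size_t step = 0; step < numSteps; ++step) {
    void *dst = static_cast<char *>(pDst) + step * patternSize;
    std::memcpy(dst, pPattern, patternSize);
    // In the adapter, only the copy where step == numSteps - 1 carries the
    // signal event, so waiters observe a single completion.
  }
}

int main() {
  const uint8_t pattern[3] = {1, 2, 3}; // 3 bytes: not a power of two
  std::vector<uint8_t> buf(12, 0);
  emulatedFill(buf.data(), pattern, sizeof(pattern), buf.size());
  for (uint8_t b : buf)
    std::cout << int(b) << ' '; // prints 1 2 3 1 2 3 1 2 3 1 2 3
  std::cout << '\n';
}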
// Todo: we could add more compiler checks and @@ -160,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { futures.emplace_back(tp.schedule_task( - [ndr, itemsPerThread, kernel = *hKernel, g0, g1, g2](size_t) { + [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) { native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); resized_state.update(g0, g1, g2); @@ -172,7 +174,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; g0++) { state.update(g0, g1, g2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs().data(), &state); } } } @@ -185,12 +187,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { futures.emplace_back( - tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2, + tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2, numParallelThreads](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { - kernel.handleLocalArgs(numParallelThreads, threadId); state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), + &state); } })); } @@ -202,13 +205,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { - groups.push_back( - [state, g0, g1, g2, numParallelThreads]( - size_t threadId, ur_kernel_handle_t_ kernel) mutable { - kernel.handleLocalArgs(numParallelThreads, threadId); - state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); - }); + groups.push_back([state, g0, g1, g2, numParallelThreads]( + size_t threadId, + ur_kernel_handle_t_ &kernel) mutable { + state.update(g0, g1, g2); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), &state); + }); } } } @@ -218,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned thread = 0; thread < numParallelThreads; thread++) { futures.emplace_back( tp.schedule_task([groups, thread, groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * groupsPerThread + i; groups[index](threadId, kernel); @@ -231,7 +234,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( futures.emplace_back( tp.schedule_task([groups, remainder, scheduled = numParallelThreads * groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < remainder; i++) { auto index = scheduled + i; groups[index](threadId, kernel); @@ -247,7 +250,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (phEvent) { *phEvent = event; } - event->set_callback([hKernel, event]() { + event->set_callback([kernel = std::move(kernel), hKernel, event]() { event->tick_end(); // TODO: avoid calling clear() here. 
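The Native CPU changes above rest on an ownership pattern: each launch clones the kernel, worker tasks borrow the clone by reference, and the completion callback takes it by std::move, so the clone outlives every worker but dies with the event. A minimal sketch of that lifetime scheme, using a hypothetical KernelState type and plain std::async in place of the adapter's thread pool:

#include <future>
#include <iostream>
#include <memory>
#include <vector>

struct KernelState {
  int value = 42; // stand-in for per-launch kernel arguments
};

int main() {
  auto state = std::make_unique<KernelState>(); // per-launch clone
  std::vector<std::future<void>> futures;
  for (int i = 0; i < 4; ++i)
    futures.emplace_back(std::async(std::launch::async,
                                    [&s = *state] { (void)s.value; }));
  for (auto &f : futures)
    f.wait(); // every borrower has finished before ownership moves
  auto onComplete = [s = std::move(state)] { std::cout << s->value << '\n'; };
  onComplete(); // the clone is destroyed with the callback, never earlier
}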
hKernel->_localArgInfo.clear(); @@ -266,7 +269,7 @@ ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, ur_event_handle_t *phEvent, const std::function &f) { urEventWait(numEventsInWaitList, phEventWaitList); - ur_event_handle_t event; + ur_event_handle_t event = nullptr; if (phEvent) { event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 37eaf1f6d1..f981d24f42 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -52,6 +52,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } @@ -146,7 +147,7 @@ void ur_event_handle_t_::wait() { // The callback may need to acquire the lock, so we unlock it here lock.unlock(); - if (callback) + if (callback.valid()) callback(); } diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp index 60176a33a6..479c671b38 100644 --- a/source/adapters/native_cpu/event.hpp +++ b/source/adapters/native_cpu/event.hpp @@ -21,7 +21,9 @@ struct ur_event_handle_t_ : RefCounted { ~ur_event_handle_t_(); - void set_callback(const std::function &cb) { callback = cb; } + template auto set_callback(T &&cb) { + callback = std::packaged_task(std::forward(cb)); + } void wait(); @@ -60,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted { bool done; std::mutex mutex; std::vector> futures; - std::function callback; + std::packaged_task callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; }; diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 4d2dec85cb..9e13286f3e 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -35,18 +35,9 @@ struct ur_kernel_handle_t_ : RefCounted { ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : Args(other.Args), hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _localArgInfo(other._localArgInfo), - _localMemPool(other._localMemPool), - _localMemPoolSize(other._localMemPoolSize), - ReqdWGSize(other.ReqdWGSize) { - incrementReferenceCount(); - } + ReqdWGSize(other.ReqdWGSize) {} - ~ur_kernel_handle_t_() { - if (decrementReferenceCount() == 0) { - free(_localMemPool); - Args.deallocate(); - } - } + ~ur_kernel_handle_t_() { free(_localMemPool); } ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler, @@ -64,27 +55,62 @@ struct ur_kernel_handle_t_ : RefCounted { std::vector OwnsMem; static constexpr size_t MaxAlign = 16 * sizeof(double); + arguments() = default; + + arguments(const arguments &Other) + : Indices(Other.Indices), ParamSizes(Other.ParamSizes), + OwnsMem(Other.OwnsMem.size(), false) { + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!Other.OwnsMem[Index]) { + continue; + } + addArg(Index, ParamSizes[Index], Indices[Index]); + } + } + + arguments(arguments &&Other) : arguments() { + std::swap(Indices, Other.Indices); + std::swap(ParamSizes, Other.ParamSizes); + std::swap(OwnsMem, Other.OwnsMem); + } + + ~arguments() { + assert(OwnsMem.size() == Indices.size() && "Size mismatch"); + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!OwnsMem[Index]) { + continue; + } + native_cpu::aligned_free(Indices[Index]); + } + } + /// Add an 
argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg) { + bool NeedAlloc = true; if (Index + 1 > Indices.size()) { Indices.resize(Index + 1); OwnsMem.resize(Index + 1); ParamSizes.resize(Index + 1); - - // Update the stored value for the argument - Indices[Index] = native_cpu::aligned_malloc(MaxAlign, Size); - OwnsMem[Index] = true; - ParamSizes[Index] = Size; - } else { - if (ParamSizes[Index] != Size) { - Indices[Index] = realloc(Indices[Index], Size); - ParamSizes[Index] = Size; + } else if (OwnsMem[Index]) { + if (ParamSizes[Index] == Size) { + NeedAlloc = false; + } else { + native_cpu::aligned_free(Indices[Index]); } } + if (NeedAlloc) { + size_t Align = MaxAlign; + while (Align > Size) { + Align >>= 1; + } + Indices[Index] = native_cpu::aligned_malloc(Align, Size); + ParamSizes[Index] = Size; + OwnsMem[Index] = true; + } std::memcpy(Indices[Index], Arg, Size); } @@ -100,17 +126,6 @@ struct ur_kernel_handle_t_ : RefCounted { Indices[Index] = Arg; } - // This is called by the destructor of ur_kernel_handle_t_, since - // ur_kernel_handle_t_ implements reference counting and we want - // to deallocate only when the reference count is 0. - void deallocate() { - assert(OwnsMem.size() == Indices.size() && "Size mismatch"); - for (size_t Index = 0; Index < Indices.size(); Index++) { - if (OwnsMem[Index]) - native_cpu::aligned_free(Indices[Index]); - } - } - const args_index_t &getIndices() const noexcept { return Indices; } } Args; @@ -144,19 +159,26 @@ struct ur_kernel_handle_t_ : RefCounted { bool hasLocalArgs() const { return !_localArgInfo.empty(); } - // To be called before executing a work group if local args are present - void handleLocalArgs(size_t numParallelThread, size_t threadId) { + const std::vector &getArgs() const { + assert(!hasLocalArgs() && "For kernels with local arguments, thread " + "information must be supplied."); + return Args.getIndices(); + } + + std::vector getArgs(size_t numThreads, size_t threadId) const { + auto Result = Args.getIndices(); + // For each local argument we have size*numthreads size_t offset = 0; for (auto &entry : _localArgInfo) { - Args.Indices[entry.argIndex] = + Result[entry.argIndex] = _localMemPool + offset + (entry.argSize * threadId); // update offset in the memory pool - offset += entry.argSize * numParallelThread; + offset += entry.argSize * numThreads; } - } - const std::vector &getArgs() const { return Args.getIndices(); } + return Result; + } void addArg(const void *Ptr, size_t Index, size_t Size) { Args.addArg(Index, Size, Ptr); diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index d78ef0121b..3707dd4e5d 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -34,7 +34,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_exp_command_buffer_handle_t *phCommandBuffer) { ur_queue_handle_t Queue = nullptr; - UR_RETURN_ON_FAILURE(urQueueCreate(hContext, hDevice, nullptr, &Queue)); + ur_queue_properties_t QueueProperties = {UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + nullptr, 0}; + const bool IsInOrder = + pCommandBufferDesc ? 
pCommandBufferDesc->isInOrder : false; + if (!IsInOrder) { + QueueProperties.flags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + UR_RETURN_ON_FAILURE( + urQueueCreate(hContext, hDevice, &QueueProperties, &Queue)); cl_context CLContext = cl_adapter::cast(hContext); cl_ext::clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = nullptr; @@ -43,8 +51,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( CLContext, cl_ext::ExtFuncPtrCache->clCreateCommandBufferKHRCache, cl_ext::CreateCommandBufferName, &clCreateCommandBufferKHR)); - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; ur_device_command_buffer_update_capability_flags_t UpdateCapabilities; cl_device_id CLDevice = cl_adapter::cast(hDevice); @@ -67,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( try { auto URCommandBuffer = std::make_unique( - Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable); + Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable, IsInOrder); *phCommandBuffer = URCommandBuffer.release(); } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -148,11 +155,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( cl_command_properties_khr *Properties = hCommandBuffer->IsUpdatable ? UpdateProperties : nullptr; + + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR( hCommandBuffer->CLCommandBuffer, nullptr, Properties, cl_adapter::cast(hKernel), workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint, OutCommandHandle)); + pGlobalWorkSize, pLocalWorkSize, WaitListSize, SyncPointWaitList, + RetSyncPoint, OutCommandHandle)); try { auto Handle = std::make_unique( @@ -219,11 +232,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferKHRCache, cl_ext::CommandCopyBufferName, &clCommandCopyBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), - srcOffset, dstOffset, size, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + srcOffset, dstOffset, size, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -257,12 +275,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferRectKHRCache, cl_ext::CommandCopyBufferRectName, &clCommandCopyBufferRectKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 
0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferRectKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), OpenCLOriginRect, OpenCLDstRect, OpenCLRegion, srcRowPitch, srcSlicePitch, - dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + dstRowPitch, dstSlicePitch, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -361,10 +384,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandFillBufferKHRCache, cl_ext::CommandFillBufferName, &clCommandFillBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandFillBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hBuffer), pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, nullptr)); + WaitListSize, SyncPointWaitList, RetSyncPoint, nullptr)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/command_buffer.hpp b/source/adapters/opencl/command_buffer.hpp index 83fedc3c1e..6fbdc5d259 100644 --- a/source/adapters/opencl/command_buffer.hpp +++ b/source/adapters/opencl/command_buffer.hpp @@ -46,6 +46,8 @@ struct ur_exp_command_buffer_handle_t_ { /// Set to true if the kernel commands in the command-buffer can be updated, /// false otherwise bool IsUpdatable; + /// Set to true if the command-buffer was created with an in-order queue. + bool IsInOrder; /// Set to true if the command-buffer has been finalized, false otherwise bool IsFinalized; /// List of commands in the command-buffer. 
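All of the clCommand*KHR calls above repeat the same guard: when the command-buffer is in-order, the explicit sync-point machinery is bypassed because the underlying in-order queue already serializes commands. A condensed sketch of that selection, with a hypothetical helper and plain uint32_t standing in for cl_sync_point_khr:

#include <cassert>
#include <cstdint>

struct SyncArgs {
  uint32_t numWaits;        // forwarded numSyncPointsInWaitList
  const uint32_t *waitList; // forwarded pSyncPointWaitList
  uint32_t *retSyncPoint;   // forwarded pSyncPoint
};

// For in-order command-buffers ordering is implicit, so the wait-list is
// dropped and no sync-point is returned; out-of-order buffers pass
// everything through unchanged.
SyncArgs selectSyncArgs(bool isInOrder, uint32_t numWaits,
                        const uint32_t *waitList, uint32_t *retSyncPoint) {
  if (isInOrder)
    return {0, nullptr, nullptr};
  return {numWaits, waitList, retSyncPoint};
}

int main() {
  uint32_t waits[2] = {7, 8};
  uint32_t ret = 0;
  assert(selectSyncArgs(true, 2, waits, &ret).waitList == nullptr);
  assert(selectSyncArgs(false, 2, waits, &ret).numWaits == 2);
}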
@@ -58,10 +60,10 @@ struct ur_exp_command_buffer_handle_t_ { ur_context_handle_t hContext, ur_device_handle_t hDevice, cl_command_buffer_khr CLCommandBuffer, - bool IsUpdatable) + bool IsUpdatable, bool IsInOrder) : hInternalQueue(hQueue), hContext(hContext), hDevice(hDevice), CLCommandBuffer(CLCommandBuffer), IsUpdatable(IsUpdatable), - IsFinalized(false), RefCount(0) {} + IsInOrder(IsInOrder), IsFinalized(false), RefCount(0) {} ~ur_exp_command_buffer_handle_t_(); diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 417016d630..5443e9de79 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -40,11 +40,11 @@ if (NOT DEFINED UMF_REPO) endif() if (NOT DEFINED UMF_TAG) - # commit ace9f4a60b686463fdad15cd016c548237cb79e0 - # Author: Rafał Rudnicki - # Date: Mon Feb 10 11:39:15 2025 +0100 - # Merge pull request #1088 from ldorau/Fix_remove_CUDA_ERROR_INVALID_RESOURCE_TYPE - set(UMF_TAG ace9f4a60b686463fdad15cd016c548237cb79e0) + # commit 5a515c56c92be75944c8246535c408cee7711114 + # Author: Lukasz Dorau + # Date: Mon Feb 17 10:56:05 2025 +0100 + # Merge pull request #1086 from vinser52/svinogra_l0_linking + set(UMF_TAG 5a515c56c92be75944c8246535c408cee7711114) endif() message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}") diff --git a/source/common/umf_helpers.hpp b/source/common/umf_helpers.hpp index 2433560a39..12432bd933 100644 --- a/source/common/umf_helpers.hpp +++ b/source/common/umf_helpers.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "logger/ur_logger.hpp" @@ -30,11 +30,10 @@ #define UMF_CHECK_ERROR(UmfResult) UR_CHECK_ERROR(umf::umf2urResult(UmfResult)); -#define UMF_RETURN_UMF_ERROR(UmfResult) \ +#define UMF_CHECK_PTR(ptr) \ do { \ - umf_result_t UmfResult_ = (UmfResult); \ - if (UmfResult_ != UMF_RESULT_SUCCESS) { \ - return UmfResult_; \ + if ((ptr) == nullptr) { \ + UR_CHECK_ERROR(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY); \ } \ } while (0) @@ -54,9 +53,6 @@ using pool_unique_handle_t = using provider_unique_handle_t = std::unique_ptr>; -using cuda_params_unique_handle_t = std::unique_ptr< - umf_cuda_memory_provider_params_t, - std::function>; #define DEFINE_CHECK_OP(op) \ template class HAS_OP_##op { \ @@ -243,6 +239,21 @@ static inline auto poolMakeUniqueFromOps(umf_memory_pool_ops_t *ops, UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; } +static inline auto +poolMakeUniqueFromOpsProviderHandle(umf_memory_pool_ops_t *ops, + umf_memory_provider_handle_t provider, + void *params) { + umf_memory_pool_handle_t hPool; + auto ret = umfPoolCreate(ops, provider, params, 0, &hPool); + if (ret != UMF_RESULT_SUCCESS) { + return std::pair{ + ret, pool_unique_handle_t(nullptr, nullptr)}; + } + + return std::pair{ + UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; +} + static inline auto providerMakeUniqueFromOps(umf_memory_provider_ops_t *ops, void *params) { umf_memory_provider_handle_t hProvider; @@ -301,33 +312,6 @@ inline ur_result_t umf2urResult(umf_result_t umfResult) { }; } -inline umf_result_t createMemoryProvider( - umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, - int cuDevice, void *cuContext, umf_usm_memory_type_t memType, - umf_memory_provider_handle_t *provider) { - - umf_result_t UmfResult = - umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); - UMF_RETURN_UMF_ERROR(UmfResult); - - UmfResult = - umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); - UMF_RETURN_UMF_ERROR(UmfResult); - 
- UmfResult = umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); - UMF_RETURN_UMF_ERROR(UmfResult); - - umf_memory_provider_handle_t umfCUDAprovider = nullptr; - UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), - CUMemoryProviderParams, &umfCUDAprovider); - UMF_RETURN_UMF_ERROR(UmfResult); - - *provider = umfCUDAprovider; - - return UMF_RESULT_SUCCESS; -} - } // namespace umf #endif /* UMF_HELPERS_H */ diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index d5d5c2c729..d99c0545c3 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -175,7 +175,10 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, if (!AllocInfoItOp) { // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } auto AllocInfoIt = *AllocInfoItOp; @@ -190,17 +193,26 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); } - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (Addr != AllocInfo->UserBegin) { ReportBadFree(Addr, GetCurrentBacktrace(), AllocInfo); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (AllocInfo->IsReleased) { ReportDoubleFree(Addr, GetCurrentBacktrace(), AllocInfo); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } AllocInfo->IsReleased = true; diff --git a/source/loader/layers/sanitizer/asan/asan_options.cpp b/source/loader/layers/sanitizer/asan/asan_options.cpp index 7704547d38..799c892ff1 100644 --- a/source/loader/layers/sanitizer/asan/asan_options.cpp +++ b/source/loader/layers/sanitizer/asan/asan_options.cpp @@ -90,6 +90,7 @@ AsanOptions::AsanOptions() { SetBoolOption("detect_privates", DetectPrivates); SetBoolOption("print_stats", PrintStats); SetBoolOption("detect_leaks", DetectLeaks); + SetBoolOption("halt_on_error", HaltOnError); auto KV = OptionsEnvMap->find("quarantine_size_mb"); if (KV != OptionsEnvMap->end()) { diff --git a/source/loader/layers/sanitizer/asan/asan_options.hpp b/source/loader/layers/sanitizer/asan/asan_options.hpp index 4da32ca278..cea30351d3 100644 --- a/source/loader/layers/sanitizer/asan/asan_options.hpp +++ b/source/loader/layers/sanitizer/asan/asan_options.hpp @@ -28,6 +28,7 @@ struct AsanOptions { bool PrintStats = false; bool DetectKernelArguments = true; bool DetectLeaks = true; + bool HaltOnError = true; explicit AsanOptions(); }; diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.cpp b/source/loader/layers/sanitizer/msan/msan_shadow.cpp index 8b210a2717..d4446d4dcf 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.cpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.cpp @@ -141,7 +141,7 @@ ur_result_t MsanShadowMemoryGPU::Setup() { // To reserve a very large amount of GPU virtual memory, the pStart param // should be beyond the SVM range, so that the GFX driver will automatically // switch to reservation on the GPU heap.
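The AddressSanitizer interceptor changes above introduce a report-then-decide flow: a bad or double free is always reported, and the new halt_on_error option decides whether the process exits or the entry point reports success so execution can continue past the defect. A small sketch of that flow, with hypothetical names and plain int standing in for ur_result_t:

#include <cstdlib>
#include <iostream>

struct Options {
  bool HaltOnError = true; // mirrors the new AsanOptions default
};

int releaseMemory(void *addr, bool isKnownAllocation, const Options &opts) {
  if (!isKnownAllocation) {
    std::cerr << "ERROR: bad free of address " << addr << '\n'; // report first
    if (opts.HaltOnError)
      std::exit(1); // analogue of exitWithErrors()
    return 0;       // analogue of UR_RESULT_SUCCESS: the error is not masked
  }
  return 0;
}

int main() {
  Options opts;
  opts.HaltOnError = false; // e.g. disabled via the layer's halt_on_error flag
  int dummy = 0;
  releaseMemory(&dummy, false, opts); // reports, but the program carries on
}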
- const void *StartAddress = (void *)(0x100'0000'0000'0000ULL); + const void *StartAddress = (void *)GetStartAddress(); // TODO: Protect Bad Zone auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( Context, StartAddress, ShadowSize, (void **)&ShadowBegin); diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.hpp b/source/loader/layers/sanitizer/msan/msan_shadow.hpp index 51847e4907..29b4a6d96a 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.hpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.hpp @@ -103,6 +103,8 @@ struct MsanShadowMemoryGPU : public MsanShadowMemory { virtual size_t GetShadowSize() = 0; + virtual uptr GetStartAddress() { return 0; } + private: ur_result_t EnqueueMapShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, std::vector &EventWaitList, @@ -134,6 +136,8 @@ struct MsanShadowMemoryPVC final : public MsanShadowMemoryGPU { uptr MemToShadow(uptr Ptr) override; size_t GetShadowSize() override { return 0x8000'0000'0000ULL; } + + uptr GetStartAddress() override { return 0x100'0000'0000'0000ULL; } }; /// Shadow Memory layout of GPU DG2 device diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index f4a7b7e60a..e716eaaa49 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6978,7 +6978,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index eb2bd4c353..acb1f65c94 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -7615,7 +7615,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { @@ -7632,6 +7632,9 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( if (NULL == hDevice) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + if (NULL == pCommandBufferDesc) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (NULL == phCommandBuffer) return UR_RESULT_ERROR_INVALID_NULL_POINTER; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ec6081509f..c1e21fd58b 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7044,7 +7044,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 5761dab3a4..e579753763 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -4733,6 +4733,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, @@ -7619,6 +7621,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -7633,7 +7636,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 7023161cb1..c5651c0fc4 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -4138,6 +4138,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, @@ -6664,6 +6666,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -6678,7 +6681,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp index 16723df5cf..e63b6078de 100644 --- a/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/test/conformance/event/urEventGetProfilingInfo.cpp @@ -11,8 +11,6 @@ using urEventGetProfilingInfoTest = uur::event::urEventTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventGetProfilingInfoTest); TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_QUEUED; size_t property_size = 0; @@ -29,8 +27,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandSubmit) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_SUBMIT; size_t property_size = 0; @@ -79,9 +75,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandEnd) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_COMPLETE; size_t property_size = 0; @@ -98,41 +91,41 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { } TEST_P(urEventGetProfilingInfoTest, Success) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::LevelZeroV2{}, uur::NativeCPU{}); - - uint8_t size = 8; - - uint64_t queued_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_QUEUED, size, &queued_value, nullptr)); - ASSERT_NE(queued_value, 0); - - uint64_t submit_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_SUBMIT, size, &submit_value, nullptr)); - ASSERT_NE(submit_value, 0); - - uint64_t start_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, - size, &start_value, nullptr)); - ASSERT_NE(start_value, 0); + // AMD devices may report a "start" time before the "submit" time + UUR_KNOWN_FAILURE_ON(uur::HIP{}); + + // If a and b are supported, asserts that a <= b + auto test_timing = [=](ur_profiling_info_t a, ur_profiling_info_t b) { + std::stringstream trace{"Profiling Info: "}; + trace << a << " <= " << b; + SCOPED_TRACE(trace.str()); + uint64_t a_time; + auto result = + urEventGetProfilingInfo(event, a, sizeof(a_time), &a_time, nullptr); + if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t end_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, - size, &end_value, nullptr)); - ASSERT_NE(end_value, 0); + uint64_t b_time; + result = + urEventGetProfilingInfo(event, b, sizeof(b_time), &b_time, nullptr); + if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t complete_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, - UR_PROFILING_INFO_COMMAND_COMPLETE, - size, &complete_value, nullptr)); - ASSERT_NE(complete_value, 0); + // Note: This assumes that the counter doesn't overflow + ASSERT_LE(a_time, b_time); + }; - ASSERT_LE(queued_value, submit_value); - ASSERT_LT(submit_value, start_value); - ASSERT_LT(start_value, end_value); - ASSERT_LE(end_value, complete_value); + test_timing(UR_PROFILING_INFO_COMMAND_QUEUED, + 
UR_PROFILING_INFO_COMMAND_SUBMIT); + test_timing(UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START); + test_timing(UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END); + test_timing(UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE); } TEST_P(urEventGetProfilingInfoTest, InvalidNullHandle) { diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 0cd4d7a6b3..e0ddc0b389 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -60,8 +60,12 @@ struct urCommandBufferExpTest : uur::urContextTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -83,8 +87,11 @@ struct urCommandBufferExpTestWithParam : urQueueTestWithParam { UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(this->device)); - ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, - nullptr, &cmd_buf_handle)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; + ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, &desc, + &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -105,8 +112,11 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -333,15 +343,8 @@ struct urCommandEventSyncTest : urCommandBufferExpTest { ASSERT_NE(buffer, nullptr); } - // Create a command-buffer with update enabled. 
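Since pCommandBufferDesc is now mandatory, the fixtures here hand-roll the descriptor at every call site. A hypothetical convenience helper (not part of this change) could keep the field order of ur_exp_command_buffer_desc_t in one place, following the stype, pNext, isUpdatable, isInOrder, enableProfiling layout shown in the removed comments below:

#include <ur_api.h>

inline ur_exp_command_buffer_desc_t
makeCommandBufferDesc(bool isUpdatable = false, bool isInOrder = false,
                      bool enableProfiling = false) {
  ur_exp_command_buffer_desc_t desc{};
  desc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
  desc.pNext = nullptr;
  desc.isUpdatable = isUpdatable;
  desc.isInOrder = isInOrder;
  desc.enableProfiling = enableProfiling;
  return desc;
}

A fixture could then write auto desc = makeCommandBufferDesc(/*isUpdatable=*/true); before calling urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle).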
- ur_exp_command_buffer_desc_t desc{ - /*.stype=*/UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, - /*.pNext =*/nullptr, - /*.isUpdatable =*/false, - /*.isInOrder =*/false, - /*.enableProfiling =*/false, - }; - + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, true, false, false}; ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, &second_cmd_buf_handle)); ASSERT_NE(second_cmd_buf_handle, nullptr); diff --git a/test/conformance/exp_command_buffer/kernel_event_sync.cpp b/test/conformance/exp_command_buffer/kernel_event_sync.cpp index 84941ab26d..60844ffdd7 100644 --- a/test/conformance/exp_command_buffer/kernel_event_sync.cpp +++ b/test/conformance/exp_command_buffer/kernel_event_sync.cpp @@ -40,7 +40,10 @@ struct KernelCommandEventSyncTest ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 3, nullptr, device_ptrs[1])); // Create second command-buffer - ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, &second_cmd_buf_handle)); ASSERT_NE(second_cmd_buf_handle, nullptr); } diff --git a/test/conformance/exp_command_buffer/update/invalid_update.cpp b/test/conformance/exp_command_buffer/update/invalid_update.cpp index 9c3da1ddd5..b93ffc26d4 100644 --- a/test/conformance/exp_command_buffer/update/invalid_update.cpp +++ b/test/conformance/exp_command_buffer/update/invalid_update.cpp @@ -109,8 +109,11 @@ TEST_P(InvalidUpdateTest, NotFinalizedCommandBuffer) { TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { // Create a command-buffer without isUpdatable ur_exp_command_buffer_handle_t test_cmd_buf_handle = nullptr; + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &test_cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &test_cmd_buf_handle)); EXPECT_NE(test_cmd_buf_handle, nullptr); // Append a kernel command to command-buffer and close command-buffer diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 50c690fb89..1621b902a9 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -378,7 +378,6 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) { // Test updating A,X,Y parameters to new values and local memory parameters // to new smaller values. TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); // Run command-buffer prior to update and verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, @@ -1081,11 +1080,6 @@ struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::SetUp()); - if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { - GTEST_SKIP() - << "Local memory argument update not supported on Level Zero."; - } - // HIP has extra args for local memory so we define an offset for arg // indices here for updating hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ?
3 : 0; diff --git a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp index 45309c3dee..0094caa274 100644 --- a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp +++ b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp @@ -311,9 +311,12 @@ TEST_P(urMultiDeviceCommandBufferExpTest, Enqueue) { } // Create command-buffer + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; uur::raii::CommandBuffer cmd_buf_handle; - ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, - cmd_buf_handle.ptr())); + ASSERT_SUCCESS( + urCommandBufferCreateExp(context, device, &desc, cmd_buf_handle.ptr())); // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( diff --git a/test/conformance/testing/include/uur/optional_queries.h b/test/conformance/testing/include/uur/optional_queries.h index d3f670545a..e0a78c18f5 100644 --- a/test/conformance/testing/include/uur/optional_queries.h +++ b/test/conformance/testing/include/uur/optional_queries.h @@ -117,4 +117,16 @@ template <> inline bool isQueryOptional(ur_queue_info_t query) { query) != optional_ur_queue_info_t.end(); } +constexpr std::array optional_ur_profiling_info_t = { + UR_PROFILING_INFO_COMMAND_QUEUED, UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE, +}; + +template <> inline bool isQueryOptional(ur_profiling_info_t query) { + return std::find(optional_ur_profiling_info_t.begin(), + optional_ur_profiling_info_t.end(), + query) != optional_ur_profiling_info_t.end(); +} + } // namespace uur
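A usage sketch for the specialization above: conformance code can consult isQueryOptional before treating UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION as a test failure. This assumes the uur test headers, and the primary isQueryOptional template they declare, are on the include path:

#include <uur/optional_queries.h>

#include <iostream>

int main() {
  // All five profiling counters are listed as optional above, so an adapter
  // may legitimately answer UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION.
  constexpr ur_profiling_info_t query = UR_PROFILING_INFO_COMMAND_QUEUED;
  std::cout << (uur::isQueryOptional(query) ? "optional" : "mandatory")
            << '\n'; // prints "optional"
}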