diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index bdd4a6d15c..ec945ba2c2 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -157,11 +157,6 @@ function(add_ur_target_link_options name) if (UR_DEVELOPER_MODE) target_link_options(${name} PRIVATE -Werror -Wextra) endif() - if (CMAKE_BUILD_TYPE STREQUAL "Release") - target_link_options(${name} PRIVATE - $<$:-pie> - ) - endif() endif() elseif(MSVC) target_link_options(${name} PRIVATE @@ -176,7 +171,15 @@ function(add_ur_target_link_options name) endfunction() function(add_ur_target_exec_options name) - if(MSVC) + if(NOT MSVC) + if(NOT APPLE) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + target_link_options(${name} PRIVATE + $<$:-pie> + ) + endif() + endif() + elseif(MSVC) target_link_options(${name} PRIVATE LINKER:/ALLOWISOLATION ) diff --git a/include/ur_api.h b/include/ur_api.h index d7621bda32..c390ed4410 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7105,21 +7105,21 @@ typedef enum ur_event_info_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Profiling query information type typedef enum ur_profiling_info_t { - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is enqueued + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is enqueued UR_PROFILING_INFO_COMMAND_QUEUED = 0, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is submitted + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is submitted UR_PROFILING_INFO_COMMAND_SUBMIT = 1, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event starts execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event starts execution UR_PROFILING_INFO_COMMAND_START = 2, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event has finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event has finished execution UR_PROFILING_INFO_COMMAND_END = 3, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event and any child events enqueued by this event on the - /// device have finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event and any child events enqueued by this event + /// on the device have finished execution UR_PROFILING_INFO_COMMAND_COMPLETE = 4, /// @cond UR_PROFILING_INFO_FORCE_UINT32 = 0x7fffffff @@ -7193,6 +7193,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. 
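With the profiling counters now documented as `[optional-query]`, a caller should treat `UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION` from the function below as "counter unavailable" rather than a hard failure. A minimal caller-side sketch — the helper name is ours; only `urEventGetProfilingInfo` and the result codes come from the API above:

```cpp
#include <optional>
#include <ur_api.h>

// Probe one optional profiling counter; nullopt means the adapter reported
// UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION (or another failure) for propName.
std::optional<uint64_t> tryQueryCounter(ur_event_handle_t hEvent,
                                        ur_profiling_info_t propName) {
  uint64_t value = 0;
  ur_result_t res = urEventGetProfilingInfo(hEvent, propName, sizeof(value),
                                            &value, /*pPropSizeRet=*/nullptr);
  if (res == UR_RESULT_SUCCESS)
    return value;
  return std::nullopt; // optional query not provided by this adapter
}
```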
UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, @@ -10111,6 +10113,7 @@ typedef struct ur_exp_command_buffer_command_handle_t_ /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -10125,7 +10128,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer); diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py index b6b3f62c5c..0547f6646e 100644 --- a/scripts/benchmarks/benches/oneapi.py +++ b/scripts/benchmarks/benches/oneapi.py @@ -11,7 +11,7 @@ class OneAPI: # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 98765 + ONEAPI_BENCHMARK_INSTANCE_ID = 987654 def __init__(self): self.oneapi_dir = os.path.join(options.workdir, "oneapi") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 0b932a888d..69237c46f6 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -262,9 +262,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): compare_names.append(saved_name) if options.output_html: - html_content = generate_html( - history.runs, "oneapi-src/unified-runtime", compare_names - ) + html_content = generate_html(history.runs, "intel/llvm", compare_names) with open("benchmark_results.html", "w") as file: file.write(html_content) diff --git a/scripts/benchmarks/options.py b/scripts/benchmarks/options.py index 394766b605..2e92675264 100644 --- a/scripts/benchmarks/options.py +++ b/scripts/benchmarks/options.py @@ -37,7 +37,7 @@ class Options: build_compute_runtime: bool = False extra_ld_libraries: list[str] = field(default_factory=list) extra_env_vars: dict = field(default_factory=dict) - compute_runtime_tag: str = "24.52.32224.10" + compute_runtime_tag: str = "25.05.32567.12" build_igc: bool = False current_run_name: str = "This PR" diff --git a/scripts/benchmarks/requirements.txt b/scripts/benchmarks/requirements.txt new file mode 100644 index 0000000000..99ba0caab5 --- /dev/null +++ b/scripts/benchmarks/requirements.txt @@ -0,0 +1,4 @@ +matplotlib==3.9.2 +mpld3==0.5.10 +dataclasses-json==0.6.7 +PyYAML==6.0.1 diff --git a/scripts/core/event.yml b/scripts/core/event.yml index da2fa457de..1b0eeca23e 100644 --- a/scripts/core/event.yml +++ b/scripts/core/event.yml @@ -121,15 +121,15 @@ name: $x_profiling_info_t typed_etors: True etors: - name: COMMAND_QUEUED - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is enqueued" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is enqueued" - name: COMMAND_SUBMIT - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is submitted" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is submitted" - name: COMMAND_START - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event starts execution" + desc: 
"[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event starts execution" - name: COMMAND_END - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event has finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event has finished execution" - name: COMMAND_COMPLETE - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" --- #-------------------------------------------------------------------------- type: function desc: "Get event object information" @@ -198,6 +198,8 @@ returns: - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_OUT_OF_RESOURCES - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_UNSUPPORTED_ENUMERATION: + - "If `propName` is not supported by the adapter." --- #-------------------------------------------------------------------------- type: function desc: "Wait for a list of events to finish." diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 218c626423..680bc60f8d 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -282,7 +282,7 @@ params: desc: "[in] Handle of the device object." - type: "const $x_exp_command_buffer_desc_t*" name: pCommandBufferDesc - desc: "[in][optional] command-buffer descriptor." + desc: "[in] Command-buffer descriptor." - type: "$x_exp_command_buffer_handle_t*" name: phCommandBuffer desc: "[out][alloc] Pointer to command-Buffer handle." diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 0d274e6c15..42ec8dbafc 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -352,10 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - - const bool IsUpdatable = - pCommandBufferDesc ? 
pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/cuda/common.hpp b/source/adapters/cuda/common.hpp index 67223c45bc..23189ce91f 100644 --- a/source/adapters/cuda/common.hpp +++ b/source/adapters/cuda/common.hpp @@ -12,6 +12,17 @@ #include #include +#include +#include + +#define UMF_RETURN_UMF_ERROR(UmfResult) \ + do { \ + umf_result_t UmfResult_ = (UmfResult); \ + if (UmfResult_ != UMF_RESULT_SUCCESS) { \ + return UmfResult_; \ + } \ + } while (0) + ur_result_t mapErrorUR(CUresult Result); /// Converts CUDA error into UR error codes, and outputs error information @@ -59,3 +70,30 @@ void assertion(bool Condition, const char *Message = nullptr); } // namespace ur } // namespace detail + +namespace umf { + +using cuda_params_unique_handle_t = std::unique_ptr< + umf_cuda_memory_provider_params_t, + std::function>; + +inline umf_result_t setCUMemoryProviderParams( + umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, + int cuDevice, void *cuContext, umf_usm_memory_type_t memType) { + + umf_result_t UmfResult = + umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); + UMF_RETURN_UMF_ERROR(UmfResult); + + return UMF_RESULT_SUCCESS; +} + +} // namespace umf diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp index 96a1464a87..e84b4b7f7a 100644 --- a/source/adapters/cuda/context.hpp +++ b/source/adapters/cuda/context.hpp @@ -77,8 +77,9 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// static ur_result_t -CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, - umf_memory_provider_handle_t *MemoryProviderHost) { +CreateHostMemoryProviderPool(ur_device_handle_t_ *DeviceHandle, + umf_memory_provider_handle_t *MemoryProviderHost, + umf_memory_pool_handle_t *MemoryPoolHost) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; *MemoryProviderHost = nullptr; @@ -91,10 +92,20 @@ CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, umf::cuda_params_unique_handle_t CUMemoryProviderParamsUnique( CUMemoryProviderParams, umfCUDAMemoryProviderParamsDestroy); - // create UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), 0 /* cuDevice */, context, - UMF_MEMORY_TYPE_HOST, MemoryProviderHost); + UmfResult = umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + 0 /* cuDevice */, context, + UMF_MEMORY_TYPE_HOST); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory provider and pool for the host memory + // (UMF_MEMORY_TYPE_HOST) + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + MemoryProviderHost); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfPoolCreate(umfProxyPoolOps(), *MemoryProviderHost, nullptr, 0, + MemoryPoolHost); UMF_RETURN_UR_ERROR(UmfResult); return UR_RESULT_SUCCESS; @@ -112,8 +123,10 @@ struct ur_context_handle_t_ { std::vector Devices; std::atomic_uint32_t RefCount; - // UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) + // UMF CUDA memory provider and pool for 
the host memory + // (UMF_MEMORY_TYPE_HOST) umf_memory_provider_handle_t MemoryProviderHost = nullptr; + umf_memory_pool_handle_t MemoryPoolHost = nullptr; ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices) : Devices{Devs, Devs + NumDevices}, RefCount{1} { @@ -124,10 +137,14 @@ struct ur_context_handle_t_ { // Create UMF CUDA memory provider for the host memory // (UMF_MEMORY_TYPE_HOST) from any device (Devices[0] is used here, because // it is guaranteed to exist). - UR_CHECK_ERROR(CreateHostMemoryProvider(Devices[0], &MemoryProviderHost)); + UR_CHECK_ERROR(CreateHostMemoryProviderPool(Devices[0], &MemoryProviderHost, + &MemoryPoolHost)); }; ~ur_context_handle_t_() { + if (MemoryPoolHost) { + umfPoolDestroy(MemoryPoolHost); + } if (MemoryProviderHost) { umfMemoryProviderDestroy(MemoryProviderHost); } diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index eaf4ba6765..e94291367b 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -11,6 +11,7 @@ #include +#include #include #include "common.hpp" @@ -84,9 +85,17 @@ struct ur_device_handle_t_ { MemoryProviderDevice = nullptr; MemoryProviderShared = nullptr; + MemoryPoolDevice = nullptr; + MemoryPoolShared = nullptr; } ~ur_device_handle_t_() { + if (MemoryPoolDevice) { + umfPoolDestroy(MemoryPoolDevice); + } + if (MemoryPoolShared) { + umfPoolDestroy(MemoryPoolShared); + } if (MemoryProviderDevice) { umfMemoryProviderDestroy(MemoryProviderDevice); } @@ -131,11 +140,15 @@ struct ur_device_handle_t_ { // bookkeeping for mipmappedArray leaks in Mapping external Memory std::map ChildCuarrayFromMipmapMap; - // UMF CUDA memory provider for the device memory (UMF_MEMORY_TYPE_DEVICE) + // UMF CUDA memory provider and pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) umf_memory_provider_handle_t MemoryProviderDevice; + umf_memory_pool_handle_t MemoryPoolDevice; - // UMF CUDA memory provider for the shared memory (UMF_MEMORY_TYPE_SHARED) + // UMF CUDA memory provider and pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) umf_memory_provider_handle_t MemoryProviderShared; + umf_memory_pool_handle_t MemoryPoolShared; }; int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index a6c2208e8f..37792bf5b2 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -213,6 +213,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 651fe0f43d..6e68275f3a 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -50,8 +50,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, size, - 0, &HostPtr)); + HostPtr = umfPoolMalloc(hContext->MemoryPoolHost, size); + UMF_CHECK_PTR(HostPtr); AllocMode = BufferMem::AllocMode::AllocHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { AllocMode = BufferMem::AllocMode::CopyIn; @@ 
-442,8 +442,8 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, CU_MEMHOSTALLOC_DEVICEMAP)); UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); } else { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hDevice->MemoryProviderDevice, - Buffer.Size, 0, (void **)&DevPtr)); + *(void **)&DevPtr = umfPoolMalloc(hDevice->MemoryPoolDevice, Buffer.Size); + UMF_CHECK_PTR(*(void **)&DevPtr); } } else { CUarray ImageArray{}; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 6dcaa28414..f0fc14f864 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -158,7 +158,7 @@ struct BufferMem { case AllocMode::Classic: for (auto &DevPtr : Ptrs) { if (DevPtr != native_type{0}) { - UR_CHECK_ERROR(cuMemFree(DevPtr)); + UMF_CHECK_ERROR(umfFree((void *)DevPtr)); } } break; @@ -166,7 +166,7 @@ struct BufferMem { UR_CHECK_ERROR(cuMemHostUnregister(HostPtr)); break; case AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(HostPtr)); + UMF_CHECK_ERROR(umfFree((void *)HostPtr)); } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index d53a027160..ac66c39afb 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -20,7 +20,7 @@ #include static ur_result_t -CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { +CreateDeviceMemoryProvidersPools(ur_platform_handle_t_ *Platform) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; umf_result_t UmfResult = @@ -37,16 +37,40 @@ CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { // create UMF CUDA memory provider for the device memory // (UMF_MEMORY_TYPE_DEVICE) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_DEVICE, &device_handle->MemoryProviderDevice); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_DEVICE); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderDevice); UMF_RETURN_UR_ERROR(UmfResult); // create UMF CUDA memory provider for the shared memory // (UMF_MEMORY_TYPE_SHARED) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_SHARED, &device_handle->MemoryProviderShared); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_SHARED); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderShared); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderDevice, + nullptr, 0, &device_handle->MemoryPoolDevice); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderShared, + nullptr, 0, &device_handle->MemoryPoolShared); UMF_RETURN_UR_ERROR(UmfResult); } @@ -134,7 +158,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, static_cast(i)}); } - UR_CHECK_ERROR(CreateDeviceMemoryProviders(&Platform)); + 
UR_CHECK_ERROR(CreateDeviceMemoryProvidersPools(&Platform)); } catch (const std::bad_alloc &) { // Signal out-of-memory situation for (int i = 0; i < NumDevices; ++i) { diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index e40927b7a8..5d2d43442d 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -102,54 +102,12 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t hContext, void *Pointer) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { - unsigned int IsManaged; - unsigned int Type; - unsigned int DeviceOrdinal; - const int NumAttributes = 3; - void *AttributeValues[NumAttributes] = {&IsManaged, &Type, &DeviceOrdinal}; - - CUpointer_attribute Attributes[NumAttributes] = { - CU_POINTER_ATTRIBUTE_IS_MANAGED, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL}; - UR_CHECK_ERROR(cuPointerGetAttributes( - NumAttributes, Attributes, AttributeValues, (CUdeviceptr)Pointer)); - UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - std::vector ContextDevices = hContext->getDevices(); - ur_platform_handle_t Platform = ContextDevices[0]->getPlatform(); - unsigned int NumDevices = Platform->Devices.size(); - UR_ASSERT(DeviceOrdinal < NumDevices, UR_RESULT_ERROR_INVALID_DEVICE); - - ur_device_handle_t Device = Platform->Devices[DeviceOrdinal].get(); - umf_memory_provider_handle_t MemoryProvider; - - if (IsManaged) { - MemoryProvider = Device->MemoryProviderShared; - } else if (Type == CU_MEMORYTYPE_DEVICE) { - MemoryProvider = Device->MemoryProviderDevice; - } else { - MemoryProvider = hContext->MemoryProviderHost; - } - - UMF_CHECK_ERROR(umfMemoryProviderFree(MemoryProvider, Pointer, - 0 /* size is unknown */)); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - /// USM: Frees the given USM pointer associated with the context. 
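The deletion of `USMFreeImpl` below works because UMF tracks every allocation made through its pools (the old fast path already used `umfPoolByPtr`), so `umfFree` can resolve the owning pool from the pointer alone and the `cuPointerGetAttributes` round-trip to pick a provider is no longer needed. A sketch of the resulting free path, with the adapter's `umf::umf2urResult` mapping approximated by a single error code:

```cpp
#include <umf/memory_pool.h> // declares umfFree() in current UMF releases
#include <ur_api.h>

ur_result_t freeUSMPointer(void *pMem) {
  // UMF resolves the owning pool internally (proxy and disjoint pools alike),
  // so no device/context lookup is required here.
  umf_result_t res = umfFree(pMem);
  return res == UMF_RESULT_SUCCESS ? UR_RESULT_SUCCESS
                                   : UR_RESULT_ERROR_INVALID_MEM_OBJECT;
}
```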
/// UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - if (auto Pool = umfPoolByPtr(pMem)) - return umf::umf2urResult(umfPoolFree(Pool, pMem)); - return USMFreeImpl(hContext, pMem); + (void)hContext; // unused + return umf::umf2urResult(umfFree(pMem)); } ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, @@ -158,8 +116,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderDevice, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolDevice, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -180,8 +138,8 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderShared, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolShared, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -199,8 +157,8 @@ ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t hContext, ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(hContext->MemoryPoolHost, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -326,73 +284,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -umf_result_t USMMemoryProvider::initialize(ur_context_handle_t Ctx, - ur_device_handle_t Dev) { - Context = Ctx; - Device = Dev; - // There isn't a way to query this in cuda, and there isn't much info on - // cuda's approach to alignment or transfer granularity between host and - // device. Within UMF this is only used to influence alignment, and since we - // discard that in our alloc implementations it seems we can safely ignore - // this as well, for now. 
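For context on what replaces the hand-written provider vtable removed below: the new code paths in context.hpp and platform.cpp build a UMF CUDA provider and wrap it in a proxy pool. Condensed into one hedged sketch — `makePool` is our name, error handling is elided, and header paths follow current UMF layout:

```cpp
#include <umf/memory_pool.h>
#include <umf/memory_provider.h>
#include <umf/pools/pool_proxy.h>
#include <umf/providers/provider_cuda.h>

umf_result_t makePool(int cuDevice, void *cuContext,
                      umf_usm_memory_type_t memType,
                      umf_memory_provider_handle_t *provider,
                      umf_memory_pool_handle_t *pool) {
  umf_cuda_memory_provider_params_handle_t params = nullptr;
  umfCUDAMemoryProviderParamsCreate(&params);
  umfCUDAMemoryProviderParamsSetContext(params, cuContext);
  umfCUDAMemoryProviderParamsSetDevice(params, cuDevice);
  umfCUDAMemoryProviderParamsSetMemoryType(params, memType);
  umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), params, provider);
  umfCUDAMemoryProviderParamsDestroy(params);
  // The proxy pool adds no caching: it forwards straight to the provider,
  // but makes allocations visible to umfPoolMalloc()/umfFree() tracking.
  return umfPoolCreate(umfProxyPoolOps(), *provider, nullptr, 0, pool);
}
```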
- MinPageSize = 0; - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align, - void **Ptr) { - auto Res = allocateImpl(Ptr, Size, Align); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::free(void *Ptr, size_t Size) { - (void)Size; - - auto Res = USMFreeImpl(Context, Ptr); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -void USMMemoryProvider::get_last_native_error(const char **ErrMsg, - int32_t *ErrCode) { - (void)ErrMsg; - *ErrCode = static_cast(getLastStatusRef()); -} - -umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { - (void)Ptr; - *PageSize = MinPageSize; - - return UMF_RESULT_SUCCESS; -} - -ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, - /*device flags*/ 0, Size, Alignment); -} - -ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, - Alignment); -} - -ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); -} - ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, ur_usm_pool_desc_t *PoolDesc) : Context{Context} { @@ -416,36 +307,28 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, pNext = BaseDesc->pNext; } - auto MemProvider = - umf::memoryProviderMakeUnique(Context, nullptr) - .second; - auto UmfHostParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]); - HostMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfHostParamsHandle.get()) - .second; + HostMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Context->MemoryProviderHost, + UmfHostParamsHandle.get()) + .second; for (const auto &Device : Context->getDevices()) { - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; auto UmfDeviceParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]); - DeviceMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfDeviceParamsHandle.get()) - .second; - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; + DeviceMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderDevice, + UmfDeviceParamsHandle.get()) + .second; + auto UmfSharedParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]); - SharedMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfSharedParamsHandle.get()) - .second; + SharedMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderShared, + UmfSharedParamsHandle.get()) + .second; + Context->addPool(this); } } diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index 7c6a2ea666..8258043d2b 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -48,80 +48,6 @@ class UsmAllocationException { 
ur_result_t getError() const { return Error; } }; -// Implements memory allocation via driver API for USM allocator interface. -class USMMemoryProvider { -private: - ur_result_t &getLastStatusRef() { - static thread_local ur_result_t LastStatus = UR_RESULT_SUCCESS; - return LastStatus; - } - -protected: - ur_context_handle_t Context; - ur_device_handle_t Device; - size_t MinPageSize; - - // Internal allocation routine which must be implemented for each allocation - // type - virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) = 0; - -public: - umf_result_t initialize(ur_context_handle_t Ctx, ur_device_handle_t Dev); - umf_result_t alloc(size_t Size, size_t Align, void **Ptr); - umf_result_t free(void *Ptr, size_t Size); - void get_last_native_error(const char **ErrMsg, int32_t *ErrCode); - umf_result_t get_min_page_size(void *, size_t *); - umf_result_t get_recommended_page_size(size_t, size_t *) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_lazy(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_force(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t allocation_merge(void *, void *, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - umf_result_t allocation_split(void *, size_t, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - virtual const char *get_name() = 0; - - virtual ~USMMemoryProvider() = default; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for device memory type -class USMDeviceMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for host memory type -class USMHostMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, ur_usm_device_mem_flags_t Flags, size_t Size, diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 4c65e61dab..ce07332ce8 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -233,9 +233,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - const bool IsUpdatable = - pCommandBufferDesc ? 
pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 81c839cf32..d971207a6c 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -234,6 +234,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 879ee0f1cc..4705964190 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -586,9 +586,7 @@ bool canBeInOrder(ur_context_handle_t Context, bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( 1, 3, L0_DRIVER_INORDER_MIN_VERSION); bool CanUseDriverInOrderLists = CompatibleDriver && DriverInOrderRequested; - return CanUseDriverInOrderLists - ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) - : false; + return CanUseDriverInOrderLists ? CommandBufferDesc->isInOrder : false; } /** @@ -624,9 +622,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, ur_exp_command_buffer_handle_t *CommandBuffer) { bool IsInOrder = canBeInOrder(Context, CommandBufferDesc); - bool EnableProfiling = - CommandBufferDesc && CommandBufferDesc->enableProfiling && !IsInOrder; - bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; + bool EnableProfiling = CommandBufferDesc->enableProfiling && !IsInOrder; + bool IsUpdatable = CommandBufferDesc->isUpdatable; bool ImmediateAppendPath = checkImmediateAppendSupport(Context, Device); const bool WaitEventPath = !ImmediateAppendPath; bool UseCounterBasedEvents = checkCounterBasedEventsSupport(Device) && @@ -910,17 +907,18 @@ ur_result_t setKernelPendingArguments( * @param[in] CommandBuffer The CommandBuffer associated with the new command. * @param[in] Kernel The Kernel associated with the new command. * @param[in] WorkDim Dimensions of the kernel associated with the new command. + * @param[in] GlobalWorkSize Global work size of the kernel associated with the + * new command. * @param[in] LocalWorkSize LocalWorkSize of the kernel associated with the new * command. * @param[out] Command The handle to the new command. 
* @return UR_RESULT_SUCCESS or an error code on failure */ -ur_result_t -createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, - ur_kernel_handle_t Kernel, uint32_t WorkDim, - const size_t *LocalWorkSize, uint32_t NumKernelAlternatives, - ur_kernel_handle_t *KernelAlternatives, - ur_exp_command_buffer_command_handle_t *Command) { +ur_result_t createCommandHandle( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + uint32_t WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, + ur_exp_command_buffer_command_handle_t *Command) { assert(CommandBuffer->IsUpdatable); @@ -992,6 +990,8 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, CommandBuffer, Kernel, CommandId, WorkDim, LocalWorkSize != nullptr, NumKernelAlternatives, KernelAlternatives); + NewCommand->setGlobalWorkSize(GlobalWorkSize); + *Command = NewCommand.get(); CommandBuffer->CommandHandles.push_back(std::move(NewCommand)); @@ -1066,9 +1066,9 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( } if (Command) { - UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, - NumKernelAlternatives, KernelAlternatives, - Command)); + UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, GlobalWorkSize, + LocalWorkSize, NumKernelAlternatives, + KernelAlternatives, Command)); } std::vector ZeEventList; ze_event_handle_t ZeLaunchEvent = nullptr; @@ -1922,10 +1922,16 @@ ur_result_t updateKernelCommand( Descs.push_back(std::move(MutableGroupSizeDesc)); } - // Check if a new global size is provided and if we need to update the group - // count. + // Check if a new global or local size is provided and if so we need to update + // the group count. ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; - if (NewGlobalWorkSize && Dim > 0) { + if ((NewGlobalWorkSize || NewLocalWorkSize) && Dim > 0) { + // If a new global work size is provided update that in the command, + // otherwise the previous work group size will be used + if (NewGlobalWorkSize) { + Command->WorkDim = Dim; + Command->setGlobalWorkSize(NewGlobalWorkSize); + } // If a new global work size is provided but a new local work size is not // then we still need to update local work size based on the size suggested // by the driver for the kernel. 
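Why the command handle now stores `GlobalWorkSize`: an update that supplies only a new local work size still has to recompute the Level Zero thread-group count, and that requires the global size the command was created (or last updated) with. Schematically — a standalone sketch with hypothetical names, not the adapter's `calculateKernelWorkDimensions`:

```cpp
#include <cstddef>
#include <cstdint>

struct GroupCount { uint32_t x, y, z; };

// Group count per dimension is globalSize / workGroupSize; this sketch
// assumes the global size divides evenly, as the driver-suggested sizes do.
GroupCount computeGroupCount(uint32_t dim, const size_t *global,
                             const uint32_t wg[3]) {
  GroupCount gc{1, 1, 1};
  uint32_t *out[3] = {&gc.x, &gc.y, &gc.z};
  for (uint32_t i = 0; i < dim; ++i)
    *out[i] = static_cast<uint32_t>(global[i] / wg[i]);
  return gc;
}
```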
@@ -1935,9 +1941,9 @@ ur_result_t updateKernelCommand( UR_CALL(getZeKernel(ZeDevice, Command->Kernel, &ZeKernel)); uint32_t WG[3]; - UR_CALL(calculateKernelWorkDimensions(ZeKernel, CommandBuffer->Device, - ZeThreadGroupDimensions, WG, Dim, - NewGlobalWorkSize, NewLocalWorkSize)); + UR_CALL(calculateKernelWorkDimensions( + ZeKernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, Dim, + Command->GlobalWorkSize, NewLocalWorkSize)); auto MutableGroupCountDesc = std::make_unique>(); diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index c9389eaf11..491cb2638e 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -172,8 +172,19 @@ struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ { ~kernel_command_handle(); + void setGlobalWorkSize(const size_t *GlobalWorkSizePtr) { + const size_t CopySize = sizeof(size_t) * WorkDim; + std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize); + if (WorkDim < 3) { + const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim); + std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize); + } + } + // Work-dimension the command was originally created with. uint32_t WorkDim; + // Global work size of the kernel + size_t GlobalWorkSize[3]; // Set to true if the user set the local work size on command creation. bool UserDefinedLocalSize; // Currently active kernel handle diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index e12352b6b1..cfcd1d316d 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -609,6 +609,10 @@ ur_result_t urEventGetProfilingInfo( return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -672,6 +676,10 @@ ur_result_t urEventGetProfilingInfo( ContextEndTime *= ZeTimerResolution; return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -715,6 +723,10 @@ ur_result_t urEventGetProfilingInfo( // enqueue. 
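The three Level Zero profiling-query paths touched below all gain the same shape of change: `UR_PROFILING_INFO_COMMAND_COMPLETE` gets an explicit "recognized but unsupported" arm instead of falling into the generic default. Reduced to a sketch (our helper name; `QUEUED`/`SUBMIT` arms omitted for brevity):

```cpp
#include <ur_api.h>

ur_result_t classifyProfilingQuery(ur_profiling_info_t propName) {
  switch (propName) {
  case UR_PROFILING_INFO_COMMAND_START:
  case UR_PROFILING_INFO_COMMAND_END:
    return UR_RESULT_SUCCESS; // served by the timestamp paths
  case UR_PROFILING_INFO_COMMAND_COMPLETE:
    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; // known, unsupported
  default:
    return UR_RESULT_ERROR_INVALID_VALUE; // unrecognized query
  }
}
```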
// return ReturnValue(uint64_t{0}); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: UR_PROFILING_INFO_COMMAND_COMPLETE " + "not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 8a2153e0a5..129db02594 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -170,53 +170,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferAppendUSMFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, - const void *pPattern, size_t patternSize, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendMemBufferFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - const void *pPattern, size_t patternSize, size_t offset, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMPrefetchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_migration_flags_t flags, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMAdviseExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 8253527efe..8511d818b2 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -346,6 +346,115 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( 
return exceptionToResult(std::current_exception()); } +ur_result_t urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, + const void *pPattern, size_t patternSize, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMFill( + pMemory, patternSize, pPattern, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, size_t offset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMPrefetchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMPrefetch( + pMemory, size, flags, 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMAdviseExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // the sync mechanism can be ignored because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMAdvise(pMemory, size, + advice, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + ur_result_t urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index b1510d0a97..5c6d2330d7 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -31,6 +31,50 @@ ur_command_list_manager::~ur_command_list_manager() { ur::level_zero::urDeviceRelease(device); } +ur_result_t ur_command_list_manager::appendGenericFillUnlocked( + ur_mem_buffer_t *dst, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType) { + + auto zeSignalEvent = getSignalEvent(phEvent, commandType); + + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto pDst = ur_cast(dst->getDevicePtr( + device, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, size, nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + })); + + // PatternSize must be a power of two for zeCommandListAppendMemoryFill. + // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. + if (isPowerOf2(patternSize)) { + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (zeCommandList.get(), pDst, pPattern, patternSize, size, + zeSignalEvent, waitListView.num, waitListView.handles)); + } else { + // Copy the pattern into every entry of the memory region pointed to by + // pDst. + uint32_t numOfCopySteps = size / patternSize; + const void *src = pPattern; + + for (uint32_t step = 0; step < numOfCopySteps; ++step) { + void *dst = reinterpret_cast(reinterpret_cast(pDst) + + step * patternSize); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, patternSize, + step == numOfCopySteps - 1 ? 
zeSignalEvent : nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + } + } + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, @@ -209,6 +253,96 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( return UR_RESULT_SUCCESS; } +ur_result_t ur_command_list_manager::appendMemBufferFill( + ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); + + auto hBuffer = hMem->getBuffer(); + UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock lock(this->Mutex, + hBuffer->getMutex()); + + return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_MEM_BUFFER_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMFill( + void *pMem, size_t patternSize, const void *pPattern, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); + + std::scoped_lock lock(this->Mutex); + + ur_usm_handle_t dstHandle(context, size, pMem); + return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_USM_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMPrefetch( + const void *pMem, size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); + + std::ignore = flags; + + std::scoped_lock lock(this->Mutex); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (zeCommandList.get(), pMem, size)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); + + std::scoped_lock lock(this->Mutex); + + auto zeAdvice = ur_cast(advice); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); + + auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (zeCommandList.get(), device->ZeDevice, pMem, size, zeAdvice)); + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, 
void *pDst, uint32_t numEventsInWaitList, diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index e85d9b9049..3d3481cb22 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -99,6 +99,27 @@ struct ur_command_list_manager : public _ur_object { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent); ze_command_list_handle_t getZeCommandList(); wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents, @@ -107,6 +128,12 @@ struct ur_command_list_manager : public _ur_object { ur_command_t commandType); private: + ur_result_t appendGenericFillUnlocked( + ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType); + ur_result_t appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index d33ac12f7e..3f88161b90 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -372,16 +372,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferFill( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferFill"); - auto hBuffer = hMem->getBuffer(); - - UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - - std::scoped_lock lock(this->Mutex, - hBuffer->getMutex()); + UR_CALL(commandListManager.appendMemBufferFill( + hMem, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, phEvent)); - return enqueueGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, - size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_FILL); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageRead( @@ -550,63 +545,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType) { - auto zeSignalEvent = getSignalEvent(phEvent, commandType); - - auto waitListView = getWaitListView(phEventWaitList, 
numEventsInWaitList); - - auto pDst = ur_cast(hBuffer->getDevicePtr( - hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, - [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, - size, nullptr, waitListView.num, - waitListView.handles)); - waitListView.clear(); - })); - - // PatternSize must be a power of two for zeCommandListAppendMemoryFill. - // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. - if (isPowerOf2(patternSize)) { - ZE2UR_CALL(zeCommandListAppendMemoryFill, - (commandListManager.getZeCommandList(), pDst, pPattern, - patternSize, size, zeSignalEvent, waitListView.num, - waitListView.handles)); - } else { - // Copy pattern into every entry in memory array pointed by Ptr. - uint32_t numOfCopySteps = size / patternSize; - const void *src = pPattern; - - for (uint32_t step = 0; step < numOfCopySteps; ++step) { - void *dst = reinterpret_cast(reinterpret_cast(pDst) + - step * patternSize); - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, patternSize, - step == numOfCopySteps - 1 ? zeSignalEvent : nullptr, - waitListView.num, waitListView.handles)); - waitListView.clear(); - } - } - - return UR_RESULT_SUCCESS; -} - ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFill"); - std::scoped_lock lock(this->Mutex); - - ur_usm_handle_t dstHandle(hContext, size, pMem); - return enqueueGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_USM_FILL); + UR_CALL(commandListManager.appendUSMFill(pMem, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent)); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( @@ -628,29 +576,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMPrefetch"); - - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); - - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (commandListManager.getZeCommandList(), pMem, size)); - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } - + UR_CALL(commandListManager.appendUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent)); return UR_RESULT_SUCCESS; } @@ -660,31 +587,7 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeAdvice = ur_cast(advice); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - - auto 
[pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemAdvise, - (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, - pMem, size, zeAdvice)); - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } + UR_CALL(commandListManager.appendUSMAdvise(pMem, size, advice, phEvent)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 6d5034d07b..9a4ace593e 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -8403,7 +8403,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 1130385cfa..d1c83a4655 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -106,9 +106,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize); auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); - hKernel->updateMemPool(numParallelThreads); std::vector> futures; - std::vector> groups; + std::vector> groups; auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; @@ -119,8 +118,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto event = new ur_event_handle_t_(hQueue, UR_COMMAND_KERNEL_LAUNCH); event->tick_start(); + // Create a copy of the kernel and its arguments. + auto kernel = std::make_unique(*hKernel); + kernel->updateMemPool(numParallelThreads); + #ifndef NATIVECPU_USE_OCK - hKernel->handleLocalArgs(1, 0); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { @@ -128,7 +130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) { for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) { state.update(g0, g1, g2, local0, local1, local2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs(1, 0).data(), &state); } } } @@ -139,7 +141,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads && - !hKernel->hasLocalArgs()) { + !kernel->hasLocalArgs()) { // If the local size is one, we make the assumption that we are running a // parallel_for over a sycl::range. 
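An aside on the Level Zero fill path refactored above: zeCommandListAppendMemoryFill only accepts power-of-two pattern sizes, so for other sizes the adapter emulates the fill by copying the pattern into each patternSize-sized slot of the destination and signalling the completion event only on the last copy. Below is a minimal host-side analogue of that slot arithmetic, assuming size is a multiple of patternSize (illustration only, no Level Zero calls are made):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Emulate a fill whose pattern size is not a power of two by copying the
// pattern into every slot, mirroring the loop in the fill path above.
void emulatedFill(void *pDst, const void *pPattern, size_t patternSize,
                  size_t size) {
  assert(size % patternSize == 0 && "size must be a multiple of patternSize");
  const size_t numSteps = size / patternSize;
  for (size_t step = 0; step < numSteps; ++step) {
    void *dst = static_cast<char *>(pDst) + step * patternSize;
    std::memcpy(dst, pPattern, patternSize);
    // In the adapter, only the copy where step == numSteps - 1 carries the
    // signal event, so waiters observe a single completion.
  }
}

int main() {
  const uint8_t pattern[3] = {1, 2, 3}; // 3 bytes: not a power of two
  std::vector<uint8_t> buf(12, 0);
  emulatedFill(buf.data(), pattern, sizeof(pattern), buf.size());
  for (uint8_t b : buf)
    std::cout << int(b) << ' '; // prints 1 2 3 1 2 3 1 2 3 1 2 3
  std::cout << '\n';
}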
// Todo: we could add more compiler checks and @@ -160,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { futures.emplace_back(tp.schedule_task( - [ndr, itemsPerThread, kernel = *hKernel, g0, g1, g2](size_t) { + [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) { native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); resized_state.update(g0, g1, g2); @@ -172,7 +174,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; g0++) { state.update(g0, g1, g2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs().data(), &state); } } } @@ -185,12 +187,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { futures.emplace_back( - tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2, + tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2, numParallelThreads](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { - kernel.handleLocalArgs(numParallelThreads, threadId); state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), + &state); } })); } @@ -202,13 +205,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { - groups.push_back( - [state, g0, g1, g2, numParallelThreads]( - size_t threadId, ur_kernel_handle_t_ kernel) mutable { - kernel.handleLocalArgs(numParallelThreads, threadId); - state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); - }); + groups.push_back([state, g0, g1, g2, numParallelThreads]( + size_t threadId, + ur_kernel_handle_t_ &kernel) mutable { + state.update(g0, g1, g2); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), &state); + }); } } } @@ -218,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned thread = 0; thread < numParallelThreads; thread++) { futures.emplace_back( tp.schedule_task([groups, thread, groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * groupsPerThread + i; groups[index](threadId, kernel); @@ -231,7 +234,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( futures.emplace_back( tp.schedule_task([groups, remainder, scheduled = numParallelThreads * groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < remainder; i++) { auto index = scheduled + i; groups[index](threadId, kernel); @@ -247,7 +250,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (phEvent) { *phEvent = event; } - event->set_callback([hKernel, event]() { + event->set_callback([kernel = std::move(kernel), hKernel, event]() { event->tick_end(); // TODO: avoid calling clear() here. 
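The Native CPU changes above rest on an ownership pattern: each launch clones the kernel, worker tasks borrow the clone by reference, and the completion callback takes it by std::move, so the clone outlives every worker but dies with the event. A minimal sketch of that lifetime scheme, using a hypothetical KernelState type and plain std::async in place of the adapter's thread pool:

#include <future>
#include <iostream>
#include <memory>
#include <vector>

struct KernelState {
  int value = 42; // stand-in for per-launch kernel arguments
};

int main() {
  auto state = std::make_unique<KernelState>(); // per-launch clone
  std::vector<std::future<void>> futures;
  for (int i = 0; i < 4; ++i)
    futures.emplace_back(std::async(std::launch::async,
                                    [&s = *state] { (void)s.value; }));
  for (auto &f : futures)
    f.wait(); // every borrower has finished before ownership moves
  auto onComplete = [s = std::move(state)] { std::cout << s->value << '\n'; };
  onComplete(); // the clone is destroyed with the callback, never earlier
}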
hKernel->_localArgInfo.clear(); @@ -266,7 +269,7 @@ ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, ur_event_handle_t *phEvent, const std::function &f) { urEventWait(numEventsInWaitList, phEventWaitList); - ur_event_handle_t event; + ur_event_handle_t event = nullptr; if (phEvent) { event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 37eaf1f6d1..f981d24f42 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -52,6 +52,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } @@ -146,7 +147,7 @@ void ur_event_handle_t_::wait() { // The callback may need to acquire the lock, so we unlock it here lock.unlock(); - if (callback) + if (callback.valid()) callback(); } diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp index 60176a33a6..479c671b38 100644 --- a/source/adapters/native_cpu/event.hpp +++ b/source/adapters/native_cpu/event.hpp @@ -21,7 +21,9 @@ struct ur_event_handle_t_ : RefCounted { ~ur_event_handle_t_(); - void set_callback(const std::function &cb) { callback = cb; } + template auto set_callback(T &&cb) { + callback = std::packaged_task(std::forward(cb)); + } void wait(); @@ -60,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted { bool done; std::mutex mutex; std::vector> futures; - std::function callback; + std::packaged_task callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; }; diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 4d2dec85cb..9e13286f3e 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -35,18 +35,9 @@ struct ur_kernel_handle_t_ : RefCounted { ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : Args(other.Args), hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _localArgInfo(other._localArgInfo), - _localMemPool(other._localMemPool), - _localMemPoolSize(other._localMemPoolSize), - ReqdWGSize(other.ReqdWGSize) { - incrementReferenceCount(); - } + ReqdWGSize(other.ReqdWGSize) {} - ~ur_kernel_handle_t_() { - if (decrementReferenceCount() == 0) { - free(_localMemPool); - Args.deallocate(); - } - } + ~ur_kernel_handle_t_() { free(_localMemPool); } ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler, @@ -64,27 +55,62 @@ struct ur_kernel_handle_t_ : RefCounted { std::vector OwnsMem; static constexpr size_t MaxAlign = 16 * sizeof(double); + arguments() = default; + + arguments(const arguments &Other) + : Indices(Other.Indices), ParamSizes(Other.ParamSizes), + OwnsMem(Other.OwnsMem.size(), false) { + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!Other.OwnsMem[Index]) { + continue; + } + addArg(Index, ParamSizes[Index], Indices[Index]); + } + } + + arguments(arguments &&Other) : arguments() { + std::swap(Indices, Other.Indices); + std::swap(ParamSizes, Other.ParamSizes); + std::swap(OwnsMem, Other.OwnsMem); + } + + ~arguments() { + assert(OwnsMem.size() == Indices.size() && "Size mismatch"); + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!OwnsMem[Index]) { + continue; + } + native_cpu::aligned_free(Indices[Index]); + } + } + /// Add an 
argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. /// Implicit offset argument is kept at the back of the indices collection. void addArg(size_t Index, size_t Size, const void *Arg) { + bool NeedAlloc = true; if (Index + 1 > Indices.size()) { Indices.resize(Index + 1); OwnsMem.resize(Index + 1); ParamSizes.resize(Index + 1); - - // Update the stored value for the argument - Indices[Index] = native_cpu::aligned_malloc(MaxAlign, Size); - OwnsMem[Index] = true; - ParamSizes[Index] = Size; - } else { - if (ParamSizes[Index] != Size) { - Indices[Index] = realloc(Indices[Index], Size); - ParamSizes[Index] = Size; + } else if (OwnsMem[Index]) { + if (ParamSizes[Index] == Size) { + NeedAlloc = false; + } else { + native_cpu::aligned_free(Indices[Index]); } } + if (NeedAlloc) { + size_t Align = MaxAlign; + while (Align > Size) { + Align >>= 1; + } + Indices[Index] = native_cpu::aligned_malloc(Align, Size); + ParamSizes[Index] = Size; + OwnsMem[Index] = true; + } std::memcpy(Indices[Index], Arg, Size); } @@ -100,17 +126,6 @@ struct ur_kernel_handle_t_ : RefCounted { Indices[Index] = Arg; } - // This is called by the destructor of ur_kernel_handle_t_, since - // ur_kernel_handle_t_ implements reference counting and we want - // to deallocate only when the reference count is 0. - void deallocate() { - assert(OwnsMem.size() == Indices.size() && "Size mismatch"); - for (size_t Index = 0; Index < Indices.size(); Index++) { - if (OwnsMem[Index]) - native_cpu::aligned_free(Indices[Index]); - } - } - const args_index_t &getIndices() const noexcept { return Indices; } } Args; @@ -144,19 +159,26 @@ struct ur_kernel_handle_t_ : RefCounted { bool hasLocalArgs() const { return !_localArgInfo.empty(); } - // To be called before executing a work group if local args are present - void handleLocalArgs(size_t numParallelThread, size_t threadId) { + const std::vector &getArgs() const { + assert(!hasLocalArgs() && "For kernels with local arguments, thread " + "information must be supplied."); + return Args.getIndices(); + } + + std::vector getArgs(size_t numThreads, size_t threadId) const { + auto Result = Args.getIndices(); + // For each local argument we have size*numthreads size_t offset = 0; for (auto &entry : _localArgInfo) { - Args.Indices[entry.argIndex] = + Result[entry.argIndex] = _localMemPool + offset + (entry.argSize * threadId); // update offset in the memory pool - offset += entry.argSize * numParallelThread; + offset += entry.argSize * numThreads; } - } - const std::vector &getArgs() const { return Args.getIndices(); } + return Result; + } void addArg(const void *Ptr, size_t Index, size_t Size) { Args.addArg(Index, Size, Ptr); diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index d78ef0121b..3707dd4e5d 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -34,7 +34,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_exp_command_buffer_handle_t *phCommandBuffer) { ur_queue_handle_t Queue = nullptr; - UR_RETURN_ON_FAILURE(urQueueCreate(hContext, hDevice, nullptr, &Queue)); + ur_queue_properties_t QueueProperties = {UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + nullptr, 0}; + const bool IsInOrder = + pCommandBufferDesc ? 
pCommandBufferDesc->isInOrder : false; + if (!IsInOrder) { + QueueProperties.flags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + UR_RETURN_ON_FAILURE( + urQueueCreate(hContext, hDevice, &QueueProperties, &Queue)); cl_context CLContext = cl_adapter::cast(hContext); cl_ext::clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = nullptr; @@ -43,8 +51,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( CLContext, cl_ext::ExtFuncPtrCache->clCreateCommandBufferKHRCache, cl_ext::CreateCommandBufferName, &clCreateCommandBufferKHR)); - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; ur_device_command_buffer_update_capability_flags_t UpdateCapabilities; cl_device_id CLDevice = cl_adapter::cast(hDevice); @@ -67,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( try { auto URCommandBuffer = std::make_unique( - Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable); + Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable, IsInOrder); *phCommandBuffer = URCommandBuffer.release(); } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -148,11 +155,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( cl_command_properties_khr *Properties = hCommandBuffer->IsUpdatable ? UpdateProperties : nullptr; + + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR( hCommandBuffer->CLCommandBuffer, nullptr, Properties, cl_adapter::cast(hKernel), workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint, OutCommandHandle)); + pGlobalWorkSize, pLocalWorkSize, WaitListSize, SyncPointWaitList, + RetSyncPoint, OutCommandHandle)); try { auto Handle = std::make_unique( @@ -219,11 +232,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferKHRCache, cl_ext::CommandCopyBufferName, &clCommandCopyBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), - srcOffset, dstOffset, size, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + srcOffset, dstOffset, size, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -257,12 +275,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferRectKHRCache, cl_ext::CommandCopyBufferRectName, &clCommandCopyBufferRectKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 
0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferRectKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), OpenCLOriginRect, OpenCLDstRect, OpenCLRegion, srcRowPitch, srcSlicePitch, - dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + dstRowPitch, dstSlicePitch, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -361,10 +384,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandFillBufferKHRCache, cl_ext::CommandFillBufferName, &clCommandFillBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandFillBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hBuffer), pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, nullptr)); + WaitListSize, SyncPointWaitList, RetSyncPoint, nullptr)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/command_buffer.hpp b/source/adapters/opencl/command_buffer.hpp index 83fedc3c1e..6fbdc5d259 100644 --- a/source/adapters/opencl/command_buffer.hpp +++ b/source/adapters/opencl/command_buffer.hpp @@ -46,6 +46,8 @@ struct ur_exp_command_buffer_handle_t_ { /// Set to true if the kernel commands in the command-buffer can be updated, /// false otherwise bool IsUpdatable; + /// Set to true if the command-buffer was created with an in-order queue. + bool IsInOrder; /// Set to true if the command-buffer has been finalized, false otherwise bool IsFinalized; /// List of commands in the command-buffer. 
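All of the clCommand*KHR calls above repeat the same guard: when the command-buffer is in-order, the explicit sync-point machinery is bypassed because the underlying in-order queue already serializes commands. A condensed sketch of that selection, with a hypothetical helper and plain uint32_t standing in for cl_sync_point_khr:

#include <cassert>
#include <cstdint>

struct SyncArgs {
  uint32_t numWaits;        // forwarded numSyncPointsInWaitList
  const uint32_t *waitList; // forwarded pSyncPointWaitList
  uint32_t *retSyncPoint;   // forwarded pSyncPoint
};

// For in-order command-buffers ordering is implicit, so the wait-list is
// dropped and no sync-point is returned; out-of-order buffers pass
// everything through unchanged.
SyncArgs selectSyncArgs(bool isInOrder, uint32_t numWaits,
                        const uint32_t *waitList, uint32_t *retSyncPoint) {
  if (isInOrder)
    return {0, nullptr, nullptr};
  return {numWaits, waitList, retSyncPoint};
}

int main() {
  uint32_t waits[2] = {7, 8};
  uint32_t ret = 0;
  assert(selectSyncArgs(true, 2, waits, &ret).waitList == nullptr);
  assert(selectSyncArgs(false, 2, waits, &ret).numWaits == 2);
}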
@@ -58,10 +60,10 @@ struct ur_exp_command_buffer_handle_t_ { ur_context_handle_t hContext, ur_device_handle_t hDevice, cl_command_buffer_khr CLCommandBuffer, - bool IsUpdatable) + bool IsUpdatable, bool IsInOrder) : hInternalQueue(hQueue), hContext(hContext), hDevice(hDevice), CLCommandBuffer(CLCommandBuffer), IsUpdatable(IsUpdatable), - IsFinalized(false), RefCount(0) {} + IsInOrder(IsInOrder), IsFinalized(false), RefCount(0) {} ~ur_exp_command_buffer_handle_t_(); diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 417016d630..5443e9de79 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -40,11 +40,11 @@ if (NOT DEFINED UMF_REPO) endif() if (NOT DEFINED UMF_TAG) - # commit ace9f4a60b686463fdad15cd016c548237cb79e0 - # Author: Rafał Rudnicki - # Date: Mon Feb 10 11:39:15 2025 +0100 - # Merge pull request #1088 from ldorau/Fix_remove_CUDA_ERROR_INVALID_RESOURCE_TYPE - set(UMF_TAG ace9f4a60b686463fdad15cd016c548237cb79e0) + # commit 5a515c56c92be75944c8246535c408cee7711114 + # Author: Lukasz Dorau + # Date: Mon Feb 17 10:56:05 2025 +0100 + # Merge pull request #1086 from vinser52/svinogra_l0_linking + set(UMF_TAG 5a515c56c92be75944c8246535c408cee7711114) endif() message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}") diff --git a/source/common/umf_helpers.hpp b/source/common/umf_helpers.hpp index 2433560a39..12432bd933 100644 --- a/source/common/umf_helpers.hpp +++ b/source/common/umf_helpers.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "logger/ur_logger.hpp" @@ -30,11 +30,10 @@ #define UMF_CHECK_ERROR(UmfResult) UR_CHECK_ERROR(umf::umf2urResult(UmfResult)); -#define UMF_RETURN_UMF_ERROR(UmfResult) \ +#define UMF_CHECK_PTR(ptr) \ do { \ - umf_result_t UmfResult_ = (UmfResult); \ - if (UmfResult_ != UMF_RESULT_SUCCESS) { \ - return UmfResult_; \ + if ((ptr) == nullptr) { \ + UR_CHECK_ERROR(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY); \ } \ } while (0) @@ -54,9 +53,6 @@ using pool_unique_handle_t = using provider_unique_handle_t = std::unique_ptr>; -using cuda_params_unique_handle_t = std::unique_ptr< - umf_cuda_memory_provider_params_t, - std::function>; #define DEFINE_CHECK_OP(op) \ template class HAS_OP_##op { \ @@ -243,6 +239,21 @@ static inline auto poolMakeUniqueFromOps(umf_memory_pool_ops_t *ops, UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; } +static inline auto +poolMakeUniqueFromOpsProviderHandle(umf_memory_pool_ops_t *ops, + umf_memory_provider_handle_t provider, + void *params) { + umf_memory_pool_handle_t hPool; + auto ret = umfPoolCreate(ops, provider, params, 0, &hPool); + if (ret != UMF_RESULT_SUCCESS) { + return std::pair{ + ret, pool_unique_handle_t(nullptr, nullptr)}; + } + + return std::pair{ + UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; +} + static inline auto providerMakeUniqueFromOps(umf_memory_provider_ops_t *ops, void *params) { umf_memory_provider_handle_t hProvider; @@ -301,33 +312,6 @@ inline ur_result_t umf2urResult(umf_result_t umfResult) { }; } -inline umf_result_t createMemoryProvider( - umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, - int cuDevice, void *cuContext, umf_usm_memory_type_t memType, - umf_memory_provider_handle_t *provider) { - - umf_result_t UmfResult = - umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); - UMF_RETURN_UMF_ERROR(UmfResult); - - UmfResult = - umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); - UMF_RETURN_UMF_ERROR(UmfResult); - 
- UmfResult = umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); - UMF_RETURN_UMF_ERROR(UmfResult); - - umf_memory_provider_handle_t umfCUDAprovider = nullptr; - UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), - CUMemoryProviderParams, &umfCUDAprovider); - UMF_RETURN_UMF_ERROR(UmfResult); - - *provider = umfCUDAprovider; - - return UMF_RESULT_SUCCESS; -} - } // namespace umf #endif /* UMF_HELPERS_H */ diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index d5d5c2c729..d99c0545c3 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -175,7 +175,10 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, if (!AllocInfoItOp) { // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } auto AllocInfoIt = *AllocInfoItOp; @@ -190,17 +193,26 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); } - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (Addr != AllocInfo->UserBegin) { ReportBadFree(Addr, GetCurrentBacktrace(), AllocInfo); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (AllocInfo->IsReleased) { ReportDoubleFree(Addr, GetCurrentBacktrace(), AllocInfo); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } AllocInfo->IsReleased = true; diff --git a/source/loader/layers/sanitizer/asan/asan_options.cpp b/source/loader/layers/sanitizer/asan/asan_options.cpp index 7704547d38..799c892ff1 100644 --- a/source/loader/layers/sanitizer/asan/asan_options.cpp +++ b/source/loader/layers/sanitizer/asan/asan_options.cpp @@ -90,6 +90,7 @@ AsanOptions::AsanOptions() { SetBoolOption("detect_privates", DetectPrivates); SetBoolOption("print_stats", PrintStats); SetBoolOption("detect_leaks", DetectLeaks); + SetBoolOption("halt_on_error", HaltOnError); auto KV = OptionsEnvMap->find("quarantine_size_mb"); if (KV != OptionsEnvMap->end()) { diff --git a/source/loader/layers/sanitizer/asan/asan_options.hpp b/source/loader/layers/sanitizer/asan/asan_options.hpp index 4da32ca278..cea30351d3 100644 --- a/source/loader/layers/sanitizer/asan/asan_options.hpp +++ b/source/loader/layers/sanitizer/asan/asan_options.hpp @@ -28,6 +28,7 @@ struct AsanOptions { bool PrintStats = false; bool DetectKernelArguments = true; bool DetectLeaks = true; + bool HaltOnError = true; explicit AsanOptions(); }; diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.cpp b/source/loader/layers/sanitizer/msan/msan_shadow.cpp index 8b210a2717..d4446d4dcf 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.cpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.cpp @@ -141,7 +141,7 @@ ur_result_t MsanShadowMemoryGPU::Setup() { // To reserve a very large amount of GPU virtual memory, the pStart param // should be beyond the SVM range, so that the GFX driver will automatically // switch to reservation on the GPU heap.
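The AddressSanitizer interceptor changes above introduce a report-then-decide flow: a bad or double free is always reported, and the new halt_on_error option decides whether the process exits or the entry point reports success so execution can continue past the defect. A small sketch of that flow, with hypothetical names and plain int standing in for ur_result_t:

#include <cstdlib>
#include <iostream>

struct Options {
  bool HaltOnError = true; // mirrors the new AsanOptions default
};

int releaseMemory(void *addr, bool isKnownAllocation, const Options &opts) {
  if (!isKnownAllocation) {
    std::cerr << "ERROR: bad free of address " << addr << '\n'; // report first
    if (opts.HaltOnError)
      std::exit(1); // analogue of exitWithErrors()
    return 0;       // analogue of UR_RESULT_SUCCESS: the error is not masked
  }
  return 0;
}

int main() {
  Options opts;
  opts.HaltOnError = false; // e.g. disabled via the layer's halt_on_error flag
  int dummy = 0;
  releaseMemory(&dummy, false, opts); // reports, but the program carries on
}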
- const void *StartAddress = (void *)(0x100'0000'0000'0000ULL); + const void *StartAddress = (void *)GetStartAddress(); // TODO: Protect Bad Zone auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve( Context, StartAddress, ShadowSize, (void **)&ShadowBegin); diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.hpp b/source/loader/layers/sanitizer/msan/msan_shadow.hpp index 51847e4907..29b4a6d96a 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.hpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.hpp @@ -103,6 +103,8 @@ struct MsanShadowMemoryGPU : public MsanShadowMemory { virtual size_t GetShadowSize() = 0; + virtual uptr GetStartAddress() { return 0; } + private: ur_result_t EnqueueMapShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size, std::vector &EventWaitList, @@ -134,6 +136,8 @@ struct MsanShadowMemoryPVC final : public MsanShadowMemoryGPU { uptr MemToShadow(uptr Ptr) override; size_t GetShadowSize() override { return 0x8000'0000'0000ULL; } + + uptr GetStartAddress() override { return 0x100'0000'0000'0000ULL; } }; /// Shadow Memory layout of GPU DG2 device diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index f4a7b7e60a..e716eaaa49 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6978,7 +6978,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index eb2bd4c353..acb1f65c94 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -7615,7 +7615,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { @@ -7632,6 +7632,9 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( if (NULL == hDevice) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + if (NULL == pCommandBufferDesc) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (NULL == phCommandBuffer) return UR_RESULT_ERROR_INVALID_NULL_POINTER; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ec6081509f..c1e21fd58b 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7044,7 +7044,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 5761dab3a4..e579753763 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -4733,6 +4733,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, @@ -7619,6 +7621,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -7633,7 +7636,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 7023161cb1..c5651c0fc4 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -4138,6 +4138,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, @@ -6664,6 +6666,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -6678,7 +6681,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp index 16723df5cf..e63b6078de 100644 --- a/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/test/conformance/event/urEventGetProfilingInfo.cpp @@ -11,8 +11,6 @@ using urEventGetProfilingInfoTest = uur::event::urEventTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventGetProfilingInfoTest); TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_QUEUED; size_t property_size = 0; @@ -29,8 +27,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandSubmit) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_SUBMIT; size_t property_size = 0; @@ -79,9 +75,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandEnd) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_COMPLETE; size_t property_size = 0; @@ -98,41 +91,41 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { } TEST_P(urEventGetProfilingInfoTest, Success) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::LevelZeroV2{}, uur::NativeCPU{}); - - uint8_t size = 8; - - uint64_t queued_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_QUEUED, size, &queued_value, nullptr)); - ASSERT_NE(queued_value, 0); - - uint64_t submit_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_SUBMIT, size, &submit_value, nullptr)); - ASSERT_NE(submit_value, 0); - - uint64_t start_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, - size, &start_value, nullptr)); - ASSERT_NE(start_value, 0); + // AMD devices may report a "start" time before the "submit" time + UUR_KNOWN_FAILURE_ON(uur::HIP{}); + + // If a and b are supported, asserts that a <= b + auto test_timing = [=](ur_profiling_info_t a, ur_profiling_info_t b) { + std::stringstream trace{"Profiling Info: "}; + trace << a << " <= " << b; + SCOPED_TRACE(trace.str()); + uint64_t a_time; + auto result = + urEventGetProfilingInfo(event, a, sizeof(a_time), &a_time, nullptr); + if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t end_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, - size, &end_value, nullptr)); - ASSERT_NE(end_value, 0); + uint64_t b_time; + result = + urEventGetProfilingInfo(event, b, sizeof(b_time), &b_time, nullptr); + if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t complete_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, - UR_PROFILING_INFO_COMMAND_COMPLETE, - size, &complete_value, nullptr)); - ASSERT_NE(complete_value, 0); + // Note: This assumes that the counter doesn't overflow + ASSERT_LE(a_time, b_time); + }; - ASSERT_LE(queued_value, submit_value); - ASSERT_LT(submit_value, start_value); - ASSERT_LT(start_value, end_value); - ASSERT_LE(end_value, complete_value); + test_timing(UR_PROFILING_INFO_COMMAND_QUEUED, + 
UR_PROFILING_INFO_COMMAND_SUBMIT); + test_timing(UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START); + test_timing(UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END); + test_timing(UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE); } TEST_P(urEventGetProfilingInfoTest, InvalidNullHandle) { diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 0cd4d7a6b3..e0ddc0b389 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -60,8 +60,12 @@ struct urCommandBufferExpTest : uur::urContextTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -83,8 +87,11 @@ struct urCommandBufferExpTestWithParam : urQueueTestWithParam { UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(this->device)); - ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, - nullptr, &cmd_buf_handle)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; + ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, &desc, + &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -105,8 +112,11 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -333,15 +343,8 @@ struct urCommandEventSyncTest : urCommandBufferExpTest { ASSERT_NE(buffer, nullptr); } - // Create a command-buffer with update enabled. 
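Since pCommandBufferDesc is now mandatory, the fixtures here hand-roll the descriptor at every call site. A hypothetical convenience helper (not part of this change) could keep the field order of ur_exp_command_buffer_desc_t in one place, following the stype, pNext, isUpdatable, isInOrder, enableProfiling layout shown in the removed comments below:

#include <ur_api.h>

inline ur_exp_command_buffer_desc_t
makeCommandBufferDesc(bool isUpdatable = false, bool isInOrder = false,
                      bool enableProfiling = false) {
  ur_exp_command_buffer_desc_t desc{};
  desc.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC;
  desc.pNext = nullptr;
  desc.isUpdatable = isUpdatable;
  desc.isInOrder = isInOrder;
  desc.enableProfiling = enableProfiling;
  return desc;
}

A fixture could then write auto desc = makeCommandBufferDesc(/*isUpdatable=*/true); before calling urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle).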
- ur_exp_command_buffer_desc_t desc{ - /*.stype=*/UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, - /*.pNext =*/nullptr, - /*.isUpdatable =*/false, - /*.isInOrder =*/false, - /*.enableProfiling =*/false, - }; - + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, true, false, false}; ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, &second_cmd_buf_handle)); ASSERT_NE(second_cmd_buf_handle, nullptr); diff --git a/test/conformance/exp_command_buffer/kernel_event_sync.cpp b/test/conformance/exp_command_buffer/kernel_event_sync.cpp index 84941ab26d..60844ffdd7 100644 --- a/test/conformance/exp_command_buffer/kernel_event_sync.cpp +++ b/test/conformance/exp_command_buffer/kernel_event_sync.cpp @@ -40,7 +40,10 @@ struct KernelCommandEventSyncTest ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 3, nullptr, device_ptrs[1])); // Create second command-buffer - ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, &second_cmd_buf_handle)); ASSERT_NE(second_cmd_buf_handle, nullptr); } diff --git a/test/conformance/exp_command_buffer/update/invalid_update.cpp b/test/conformance/exp_command_buffer/update/invalid_update.cpp index 9c3da1ddd5..b93ffc26d4 100644 --- a/test/conformance/exp_command_buffer/update/invalid_update.cpp +++ b/test/conformance/exp_command_buffer/update/invalid_update.cpp @@ -109,8 +109,11 @@ TEST_P(InvalidUpdateTest, NotFinalizedCommandBuffer) { TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { // Create a command-buffer without isUpdatable ur_exp_command_buffer_handle_t test_cmd_buf_handle = nullptr; + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &test_cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &test_cmd_buf_handle)); EXPECT_NE(test_cmd_buf_handle, nullptr); // Append a kernel command to command-buffer and close command-buffer diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp index 50c690fb89..1621b902a9 100644 --- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp +++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp @@ -378,7 +378,6 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) { // Test updating A,X,Y parameters to new values and local memory parameters // to new smaller values. TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); // Run command-buffer prior to update and verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, @@ -1081,11 +1080,6 @@ struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase { UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::SetUp()); - if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { - GTEST_SKIP() - << "Local memory argument update not supported on Level Zero."; - } - // HIP has extra args for local memory so we define an offset for arg // indices here for updating hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ?
3 : 0; diff --git a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp index 45309c3dee..0094caa274 100644 --- a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp +++ b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp @@ -311,9 +311,12 @@ TEST_P(urMultiDeviceCommandBufferExpTest, Enqueue) { } // Create command-buffer + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; uur::raii::CommandBuffer cmd_buf_handle; - ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, - cmd_buf_handle.ptr())); + ASSERT_SUCCESS( + urCommandBufferCreateExp(context, device, &desc, cmd_buf_handle.ptr())); // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( diff --git a/test/conformance/testing/include/uur/optional_queries.h b/test/conformance/testing/include/uur/optional_queries.h index d3f670545a..e0a78c18f5 100644 --- a/test/conformance/testing/include/uur/optional_queries.h +++ b/test/conformance/testing/include/uur/optional_queries.h @@ -117,4 +117,16 @@ template <> inline bool isQueryOptional(ur_queue_info_t query) { query) != optional_ur_queue_info_t.end(); } +constexpr std::array optional_ur_profiling_info_t = { + UR_PROFILING_INFO_COMMAND_QUEUED, UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE, +}; + +template <> inline bool isQueryOptional(ur_profiling_info_t query) { + return std::find(optional_ur_profiling_info_t.begin(), + optional_ur_profiling_info_t.end(), + query) != optional_ur_profiling_info_t.end(); +} + } // namespace uur
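A usage sketch for the specialization above: conformance code can consult isQueryOptional before treating UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION as a test failure. This assumes the uur test headers, and the primary isQueryOptional template they declare, are on the include path:

#include <uur/optional_queries.h>

#include <iostream>

int main() {
  // All five profiling counters are listed as optional above, so an adapter
  // may legitimately answer UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION.
  constexpr ur_profiling_info_t query = UR_PROFILING_INFO_COMMAND_QUEUED;
  std::cout << (uur::isQueryOptional(query) ? "optional" : "mandatory")
            << '\n'; // prints "optional"
}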