From 3a1b93e35c7e2f9969bef1da389601048327a0c2 Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Wed, 19 Feb 2025 11:37:19 +0000 Subject: [PATCH 01/13] Fix kernel argument passing. (#16995) We were reading the kernel arguments at kernel execution time, but kernel arguments are allowed to change between enqueuing and executing. Make sure to create a copy of kernel arguments ahead of time. This was previously approved as a unified-runtime PR: https://github.com/oneapi-src/unified-runtime/pull/2700 --- source/adapters/native_cpu/enqueue.cpp | 43 ++++++------ source/adapters/native_cpu/event.cpp | 2 +- source/adapters/native_cpu/event.hpp | 6 +- source/adapters/native_cpu/kernel.hpp | 96 ++++++++++++++++---------- 4 files changed, 87 insertions(+), 60 deletions(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 1130385cfa..0ffe9415f1 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -106,9 +106,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize); auto &tp = hQueue->getDevice()->tp; const size_t numParallelThreads = tp.num_threads(); - hKernel->updateMemPool(numParallelThreads); std::vector> futures; - std::vector> groups; + std::vector> groups; auto numWG0 = ndr.GlobalSize[0] / ndr.LocalSize[0]; auto numWG1 = ndr.GlobalSize[1] / ndr.LocalSize[1]; auto numWG2 = ndr.GlobalSize[2] / ndr.LocalSize[2]; @@ -119,8 +118,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( auto event = new ur_event_handle_t_(hQueue, UR_COMMAND_KERNEL_LAUNCH); event->tick_start(); + // Create a copy of the kernel and its arguments. + auto kernel = std::make_unique(*hKernel); + kernel->updateMemPool(numParallelThreads); + #ifndef NATIVECPU_USE_OCK - hKernel->handleLocalArgs(1, 0); for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { @@ -128,7 +130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned local1 = 0; local1 < ndr.LocalSize[1]; local1++) { for (unsigned local0 = 0; local0 < ndr.LocalSize[0]; local0++) { state.update(g0, g1, g2, local0, local1, local2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs(1, 0).data(), &state); } } } @@ -139,7 +141,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( bool isLocalSizeOne = ndr.LocalSize[0] == 1 && ndr.LocalSize[1] == 1 && ndr.LocalSize[2] == 1; if (isLocalSizeOne && ndr.GlobalSize[0] > numParallelThreads && - !hKernel->hasLocalArgs()) { + !kernel->hasLocalArgs()) { // If the local size is one, we make the assumption that we are running a // parallel_for over a sycl::range. 
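The crux of the fix is already visible above: instead of letting the enqueued worker tasks dereference `hKernel` when they eventually run, the adapter snapshots the kernel, and with it the argument storage, at enqueue time via `std::make_unique` and hands the copy to the tasks. A minimal sketch of the before/after shape, using illustrative stand-in types rather than the adapter's real ones:

```
#include <functional>
#include <vector>

// Stand-in for the adapter's kernel handle; the real type also
// deep-copies the argument buffers it owns (see kernel.hpp below).
struct Kernel {
  std::vector<const void *> args; // may be mutated until execution
};

inline void launch(const std::vector<const void *> &) { /* run kernel */ }

// Buggy shape: the task reads the handle when it finally runs, so an
// argument set after enqueue leaks into an already-enqueued launch.
inline std::function<void()> enqueueByReference(Kernel *hKernel) {
  return [hKernel] { launch(hKernel->args); };
}

// Fixed shape: copy the kernel at enqueue time; the task owns an
// immutable snapshot of the arguments.
inline std::function<void()> enqueueBySnapshot(const Kernel &hKernel) {
  return [kernel = hKernel] { launch(kernel.args); };
}
```

A knock-on effect shows up in the event.cpp/event.hpp hunks further down: the completion callback now captures the snapshot with `kernel = std::move(kernel)`, which makes the lambda move-only, so the copyable `std::function` callback is replaced with a `std::packaged_task` (and the emptiness check becomes `callback.valid()`).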
// Todo: we could add more compiler checks and @@ -160,7 +162,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < new_num_work_groups_0; g0 += 1) { futures.emplace_back(tp.schedule_task( - [ndr, itemsPerThread, kernel = *hKernel, g0, g1, g2](size_t) { + [ndr, itemsPerThread, &kernel = *kernel, g0, g1, g2](size_t) { native_cpu::state resized_state = getResizedState(ndr, itemsPerThread); resized_state.update(g0, g1, g2); @@ -172,7 +174,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g0 = new_num_work_groups_0 * itemsPerThread; g0 < numWG0; g0++) { state.update(g0, g1, g2); - hKernel->_subhandler(hKernel->getArgs().data(), &state); + kernel->_subhandler(kernel->getArgs().data(), &state); } } } @@ -185,12 +187,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { futures.emplace_back( - tp.schedule_task([state, kernel = *hKernel, numWG0, g1, g2, + tp.schedule_task([state, &kernel = *kernel, numWG0, g1, g2, numParallelThreads](size_t threadId) mutable { for (unsigned g0 = 0; g0 < numWG0; g0++) { - kernel.handleLocalArgs(numParallelThreads, threadId); state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), + &state); } })); } @@ -202,13 +205,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned g2 = 0; g2 < numWG2; g2++) { for (unsigned g1 = 0; g1 < numWG1; g1++) { for (unsigned g0 = 0; g0 < numWG0; g0++) { - groups.push_back( - [state, g0, g1, g2, numParallelThreads]( - size_t threadId, ur_kernel_handle_t_ kernel) mutable { - kernel.handleLocalArgs(numParallelThreads, threadId); - state.update(g0, g1, g2); - kernel._subhandler(kernel.getArgs().data(), &state); - }); + groups.push_back([state, g0, g1, g2, numParallelThreads]( + size_t threadId, + ur_kernel_handle_t_ &kernel) mutable { + state.update(g0, g1, g2); + kernel._subhandler( + kernel.getArgs(numParallelThreads, threadId).data(), &state); + }); } } } @@ -218,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( for (unsigned thread = 0; thread < numParallelThreads; thread++) { futures.emplace_back( tp.schedule_task([groups, thread, groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < groupsPerThread; i++) { auto index = thread * groupsPerThread + i; groups[index](threadId, kernel); @@ -231,7 +234,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( futures.emplace_back( tp.schedule_task([groups, remainder, scheduled = numParallelThreads * groupsPerThread, - kernel = *hKernel](size_t threadId) { + &kernel = *kernel](size_t threadId) { for (unsigned i = 0; i < remainder; i++) { auto index = scheduled + i; groups[index](threadId, kernel); @@ -247,7 +250,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( if (phEvent) { *phEvent = event; } - event->set_callback([hKernel, event]() { + event->set_callback([kernel = std::move(kernel), hKernel, event]() { event->tick_end(); // TODO: avoid calling clear() here. 
hKernel->_localArgInfo.clear(); diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 37eaf1f6d1..87865905c1 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -146,7 +146,7 @@ void ur_event_handle_t_::wait() { // The callback may need to acquire the lock, so we unlock it here lock.unlock(); - if (callback) + if (callback.valid()) callback(); } diff --git a/source/adapters/native_cpu/event.hpp b/source/adapters/native_cpu/event.hpp index 60176a33a6..479c671b38 100644 --- a/source/adapters/native_cpu/event.hpp +++ b/source/adapters/native_cpu/event.hpp @@ -21,7 +21,9 @@ struct ur_event_handle_t_ : RefCounted { ~ur_event_handle_t_(); - void set_callback(const std::function &cb) { callback = cb; } + template auto set_callback(T &&cb) { + callback = std::packaged_task(std::forward(cb)); + } void wait(); @@ -60,7 +62,7 @@ struct ur_event_handle_t_ : RefCounted { bool done; std::mutex mutex; std::vector> futures; - std::function callback; + std::packaged_task callback; uint64_t timestamp_start = 0; uint64_t timestamp_end = 0; }; diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 4d2dec85cb..9e13286f3e 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -35,18 +35,9 @@ struct ur_kernel_handle_t_ : RefCounted { ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) : Args(other.Args), hProgram(other.hProgram), _name(other._name), _subhandler(other._subhandler), _localArgInfo(other._localArgInfo), - _localMemPool(other._localMemPool), - _localMemPoolSize(other._localMemPoolSize), - ReqdWGSize(other.ReqdWGSize) { - incrementReferenceCount(); - } + ReqdWGSize(other.ReqdWGSize) {} - ~ur_kernel_handle_t_() { - if (decrementReferenceCount() == 0) { - free(_localMemPool); - Args.deallocate(); - } - } + ~ur_kernel_handle_t_() { free(_localMemPool); } ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, nativecpu_task_t subhandler, @@ -64,27 +55,62 @@ struct ur_kernel_handle_t_ : RefCounted { std::vector OwnsMem; static constexpr size_t MaxAlign = 16 * sizeof(double); + arguments() = default; + + arguments(const arguments &Other) + : Indices(Other.Indices), ParamSizes(Other.ParamSizes), + OwnsMem(Other.OwnsMem.size(), false) { + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!Other.OwnsMem[Index]) { + continue; + } + addArg(Index, ParamSizes[Index], Indices[Index]); + } + } + + arguments(arguments &&Other) : arguments() { + std::swap(Indices, Other.Indices); + std::swap(ParamSizes, Other.ParamSizes); + std::swap(OwnsMem, Other.OwnsMem); + } + + ~arguments() { + assert(OwnsMem.size() == Indices.size() && "Size mismatch"); + for (size_t Index = 0; Index < Indices.size(); Index++) { + if (!OwnsMem[Index]) { + continue; + } + native_cpu::aligned_free(Indices[Index]); + } + } + /// Add an argument to the kernel. /// If the argument existed before, it is replaced. /// Otherwise, it is added. /// Gaps are filled with empty arguments. /// Implicit offset argument is kept at the back of the indices collection. 
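The copy constructor shown above deep-copies only the slots the source `arguments` actually owns, by re-running `addArg` on them; borrowed pointers added via `addPtrArg` are carried over verbatim, and the move constructor simply swaps the vectors. A stripped-down model of that ownership rule, with plain `malloc`/`free` standing in for the adapter's aligned allocator:

```
#include <cstdlib>
#include <cstring>
#include <vector>

// Model of the ownership rule in `arguments`: a slot is deep-copied on
// copy only when the source owns its storage; borrowed pointers are
// shared. (The adapter uses aligned_malloc/aligned_free instead.)
struct ArgStore {
  std::vector<void *> Ptrs;
  std::vector<size_t> Sizes;
  std::vector<bool> Owns;

  ArgStore() = default;

  ArgStore(const ArgStore &O) : Ptrs(O.Ptrs), Sizes(O.Sizes), Owns(O.Owns) {
    for (size_t I = 0; I < Ptrs.size(); I++) {
      if (!Owns[I])
        continue; // borrowed pointer: copy the pointer, not the bytes
      Ptrs[I] = std::malloc(Sizes[I]);
      std::memcpy(Ptrs[I], O.Ptrs[I], Sizes[I]);
    }
  }

  ArgStore(ArgStore &&O) noexcept { // steal storage, leave O empty but valid
    Ptrs.swap(O.Ptrs);
    Sizes.swap(O.Sizes);
    Owns.swap(O.Owns);
  }

  ~ArgStore() {
    for (size_t I = 0; I < Ptrs.size(); I++)
      if (Owns[I])
        std::free(Ptrs[I]);
  }
};
```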
void addArg(size_t Index, size_t Size, const void *Arg) { + bool NeedAlloc = true; if (Index + 1 > Indices.size()) { Indices.resize(Index + 1); OwnsMem.resize(Index + 1); ParamSizes.resize(Index + 1); - - // Update the stored value for the argument - Indices[Index] = native_cpu::aligned_malloc(MaxAlign, Size); - OwnsMem[Index] = true; - ParamSizes[Index] = Size; - } else { - if (ParamSizes[Index] != Size) { - Indices[Index] = realloc(Indices[Index], Size); - ParamSizes[Index] = Size; + } else if (OwnsMem[Index]) { + if (ParamSizes[Index] == Size) { + NeedAlloc = false; + } else { + native_cpu::aligned_free(Indices[Index]); } } + if (NeedAlloc) { + size_t Align = MaxAlign; + while (Align > Size) { + Align >>= 1; + } + Indices[Index] = native_cpu::aligned_malloc(Align, Size); + ParamSizes[Index] = Size; + OwnsMem[Index] = true; + } std::memcpy(Indices[Index], Arg, Size); } @@ -100,17 +126,6 @@ struct ur_kernel_handle_t_ : RefCounted { Indices[Index] = Arg; } - // This is called by the destructor of ur_kernel_handle_t_, since - // ur_kernel_handle_t_ implements reference counting and we want - // to deallocate only when the reference count is 0. - void deallocate() { - assert(OwnsMem.size() == Indices.size() && "Size mismatch"); - for (size_t Index = 0; Index < Indices.size(); Index++) { - if (OwnsMem[Index]) - native_cpu::aligned_free(Indices[Index]); - } - } - const args_index_t &getIndices() const noexcept { return Indices; } } Args; @@ -144,19 +159,26 @@ struct ur_kernel_handle_t_ : RefCounted { bool hasLocalArgs() const { return !_localArgInfo.empty(); } - // To be called before executing a work group if local args are present - void handleLocalArgs(size_t numParallelThread, size_t threadId) { + const std::vector &getArgs() const { + assert(!hasLocalArgs() && "For kernels with local arguments, thread " + "information must be supplied."); + return Args.getIndices(); + } + + std::vector getArgs(size_t numThreads, size_t threadId) const { + auto Result = Args.getIndices(); + // For each local argument we have size*numthreads size_t offset = 0; for (auto &entry : _localArgInfo) { - Args.Indices[entry.argIndex] = + Result[entry.argIndex] = _localMemPool + offset + (entry.argSize * threadId); // update offset in the memory pool - offset += entry.argSize * numParallelThread; + offset += entry.argSize * numThreads; } - } - const std::vector &getArgs() const { return Args.getIndices(); } + return Result; + } void addArg(const void *Ptr, size_t Index, size_t Size) { Args.addArg(Index, Size, Ptr); From dcb022fddc4f1d3d2b278f3d57b3b7879011ec63 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Thu, 20 Feb 2025 12:36:11 +0800 Subject: [PATCH 02/13] Support device memory sanitizer for DG2 GPU device (#16619) --- source/loader/layers/sanitizer/msan/msan_shadow.cpp | 2 +- source/loader/layers/sanitizer/msan/msan_shadow.hpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.cpp b/source/loader/layers/sanitizer/msan/msan_shadow.cpp index 8b210a2717..d4446d4dcf 100644 --- a/source/loader/layers/sanitizer/msan/msan_shadow.cpp +++ b/source/loader/layers/sanitizer/msan/msan_shadow.cpp @@ -141,7 +141,7 @@ ur_result_t MsanShadowMemoryGPU::Setup() { // To reserve very large amount of GPU virtual memroy, the pStart param // should be beyond the SVM range, so that GFX driver will automatically // switch to reservation on the GPU heap. 
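The hunk below is the entire mechanism of this patch: `Setup()` now asks a virtual hook where to place the shadow reservation. The base class returns 0 (let the driver pick), while the PVC specialization pins the reservation beyond the SVM range so the GFX driver switches to reserving on the GPU heap. A self-contained sketch of that shape, with the actual virtual-memory reservation call elided:

```
#include <cstdint>

using uptr = std::uintptr_t;

// Shape of the hook added below: Setup() reserves shadow memory starting
// at GetStartAddress(); returning 0 lets the driver choose the address.
struct ShadowMemoryGPU {
  virtual ~ShadowMemoryGPU() = default;
  virtual uptr GetStartAddress() { return 0; }

  void Setup() {
    const void *Start = reinterpret_cast<const void *>(GetStartAddress());
    // ... pass Start to the reservation, e.g.
    // VirtualMem.pfnReserve(Context, Start, ShadowSize, &ShadowBegin);
    (void)Start;
  }
};

// PVC pins the reservation beyond the SVM range.
struct ShadowMemoryPVC final : ShadowMemoryGPU {
  uptr GetStartAddress() override { return 0x100'0000'0000'0000ULL; }
};
```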
-  const void *StartAddress = (void *)(0x100'0000'0000'0000ULL);
+  const void *StartAddress = (void *)GetStartAddress();
   // TODO: Protect Bad Zone
   auto Result = getContext()->urDdiTable.VirtualMem.pfnReserve(
       Context, StartAddress, ShadowSize, (void **)&ShadowBegin);
diff --git a/source/loader/layers/sanitizer/msan/msan_shadow.hpp b/source/loader/layers/sanitizer/msan/msan_shadow.hpp
index 51847e4907..29b4a6d96a 100644
--- a/source/loader/layers/sanitizer/msan/msan_shadow.hpp
+++ b/source/loader/layers/sanitizer/msan/msan_shadow.hpp
@@ -103,6 +103,8 @@ struct MsanShadowMemoryGPU : public MsanShadowMemory {

   virtual size_t GetShadowSize() = 0;

+  virtual uptr GetStartAddress() { return 0; }
+
 private:
   ur_result_t EnqueueMapShadow(ur_queue_handle_t Queue, uptr Ptr, uptr Size,
                                std::vector &EventWaitList,
@@ -134,6 +136,8 @@ struct MsanShadowMemoryPVC final : public MsanShadowMemoryGPU {
   uptr MemToShadow(uptr Ptr) override;

   size_t GetShadowSize() override { return 0x8000'0000'0000ULL; }
+
+  uptr GetStartAddress() override { return 0x100'0000'0000'0000ULL; }
 };

 /// Shadow Memory layout of GPU DG2 device

From 298b026e1eea11a837ab8a3e551ba0af8c659c70 Mon Sep 17 00:00:00 2001
From: Lukasz Dorau
Date: Thu, 20 Feb 2025 10:01:49 +0100
Subject: [PATCH 03/13] Update UMF in UR to fix issue in LLVM (#17034)

Update UMF to the commit:
```
commit 5a515c56c92be75944c8246535c408cee7711114
Author: Lukasz Dorau
Date:   Mon Feb 17 10:56:05 2025 +0100
    Merge pull request #1086 from vinser52/svinogra_l0_linking
```
to fix the issue in LLVM (SYCL/CUDA):
https://github.com/intel/llvm/issues/16944
[SYCL][CUDA] Nsys profiling broken after memory providers change

Moved from: https://github.com/oneapi-src/unified-runtime/pull/2708

Fixes: https://github.com/intel/llvm/issues/16944

Signed-off-by: Lukasz Dorau
---
 source/common/CMakeLists.txt | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 417016d630..5443e9de79 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -40,11 +40,11 @@ if (NOT DEFINED UMF_REPO)
 endif()

 if (NOT DEFINED UMF_TAG)
-    # commit ace9f4a60b686463fdad15cd016c548237cb79e0
-    # Author: Rafał Rudnicki
-    # Date:   Mon Feb 10 11:39:15 2025 +0100
-    # Merge pull request #1088 from ldorau/Fix_remove_CUDA_ERROR_INVALID_RESOURCE_TYPE
-    set(UMF_TAG ace9f4a60b686463fdad15cd016c548237cb79e0)
+    # commit 5a515c56c92be75944c8246535c408cee7711114
+    # Author: Lukasz Dorau
+    # Date:   Mon Feb 17 10:56:05 2025 +0100
+    # Merge pull request #1086 from vinser52/svinogra_l0_linking
+    set(UMF_TAG 5a515c56c92be75944c8246535c408cee7711114)
 endif()

 message(STATUS "Will fetch Unified Memory Framework from ${UMF_REPO}")

From 441720f0f719613b43e3311bfd125a999010d753 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20Komar?= <69756491+Xewar313@users.noreply.github.com>
Date: Thu, 20 Feb 2025 10:02:46 +0100
Subject: [PATCH 04/13] Add remaining calls shared with queue in level-zero v2
 adapter (#17061)

Implements the calls shared between command-buffer and queue in the
unified-runtime level-zero v2 adapter and moves the shared code to
`command_list_manager.cpp`.
---
 source/adapters/level_zero/v2/api.cpp         |  47 ------
 .../adapters/level_zero/v2/command_buffer.cpp | 109 ++++++++++++++
 .../level_zero/v2/command_list_manager.cpp    | 134 ++++++++++++++++++
 .../level_zero/v2/command_list_manager.hpp    |  27 ++++
 .../v2/queue_immediate_in_order.cpp           | 119 ++--------------
 5 files changed, 281 insertions(+), 155 deletions(-)

diff --git
a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index 8a2153e0a5..129db02594 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -170,53 +170,6 @@ ur_result_t urBindlessImagesReleaseExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t urCommandBufferAppendUSMFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, - const void *pPattern, size_t patternSize, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendMemBufferFillExp( - ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - const void *pPattern, size_t patternSize, size_t offset, size_t size, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMPrefetchExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_migration_flags_t flags, - uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t urCommandBufferAppendUSMAdviseExp( - ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, - const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, - uint32_t NumEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, - ur_exp_command_buffer_command_handle_t *phCommand) { - logger::error("{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t hCommand, const ur_exp_command_buffer_update_kernel_launch_desc_t diff --git a/source/adapters/level_zero/v2/command_buffer.cpp b/source/adapters/level_zero/v2/command_buffer.cpp index 8253527efe..8511d818b2 100644 --- a/source/adapters/level_zero/v2/command_buffer.cpp +++ b/source/adapters/level_zero/v2/command_buffer.cpp @@ -346,6 +346,115 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( return exceptionToResult(std::current_exception()); } +ur_result_t urCommandBufferAppendUSMFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, + const void *pPattern, size_t patternSize, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const 
ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMFill( + pMemory, patternSize, pPattern, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendMemBufferFillExp( + ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, size_t offset, size_t size, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr)); + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMPrefetchExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_migration_flags_t flags, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMPrefetch( + pMemory, size, flags, 0, nullptr, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t urCommandBufferAppendUSMAdviseExp( + ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, + size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t *phEvent, + ur_exp_command_buffer_command_handle_t *phCommand) try { + + // the same issue as in urCommandBufferAppendKernelLaunchExp + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + // sync mechanic can be ignored, because all lists are in-order + std::ignore = numSyncPointsInWaitList; + std::ignore = pSyncPointWaitList; + std::ignore = pSyncPoint; + + std::ignore = phCommand; + + UR_CALL(hCommandBuffer->commandListManager.appendUSMAdvise(pMemory, size, + advice, nullptr)); + + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + ur_result_t urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_exp_command_buffer_info_t propName, diff --git a/source/adapters/level_zero/v2/command_list_manager.cpp b/source/adapters/level_zero/v2/command_list_manager.cpp index b1510d0a97..5c6d2330d7 100644 --- a/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/source/adapters/level_zero/v2/command_list_manager.cpp @@ -31,6 +31,50 @@ ur_command_list_manager::~ur_command_list_manager() { ur::level_zero::urDeviceRelease(device); } +ur_result_t ur_command_list_manager::appendGenericFillUnlocked( + ur_mem_buffer_t *dst, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType) { + + auto zeSignalEvent = getSignalEvent(phEvent, commandType); + + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto pDst = ur_cast(dst->getDevicePtr( + device, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, + [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, size, nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + })); + + // PatternSize must be a power of two for zeCommandListAppendMemoryFill. + // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. + if (isPowerOf2(patternSize)) { + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (zeCommandList.get(), pDst, pPattern, patternSize, size, + zeSignalEvent, waitListView.num, waitListView.handles)); + } else { + // Copy pattern into every entry in memory array pointed by Ptr. + uint32_t numOfCopySteps = size / patternSize; + const void *src = pPattern; + + for (uint32_t step = 0; step < numOfCopySteps; ++step) { + void *dst = reinterpret_cast(reinterpret_cast(pDst) + + step * patternSize); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (zeCommandList.get(), dst, src, patternSize, + step == numOfCopySteps - 1 ? 
zeSignalEvent : nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + } + } + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, @@ -209,6 +253,96 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( return UR_RESULT_SUCCESS; } +ur_result_t ur_command_list_manager::appendMemBufferFill( + ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); + + auto hBuffer = hMem->getBuffer(); + UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); + + std::scoped_lock lock(this->Mutex, + hBuffer->getMutex()); + + return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_MEM_BUFFER_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMFill( + void *pMem, size_t patternSize, const void *pPattern, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); + + std::scoped_lock lock(this->Mutex); + + ur_usm_handle_t dstHandle(context, size, pMem); + return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent, UR_COMMAND_USM_FILL); +} + +ur_result_t ur_command_list_manager::appendUSMPrefetch( + const void *pMem, size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); + + std::ignore = flags; + + std::scoped_lock lock(this->Mutex); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + // TODO: figure out how to translate "flags" + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (zeCommandList.get(), pMem, size)); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); + + std::scoped_lock lock(this->Mutex); + + auto zeAdvice = ur_cast(advice); + + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); + + auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); + + if (pWaitEvents) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (zeCommandList.get(), device->ZeDevice, pMem, size, zeAdvice)); + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, 
void *pDst, uint32_t numEventsInWaitList, diff --git a/source/adapters/level_zero/v2/command_list_manager.hpp b/source/adapters/level_zero/v2/command_list_manager.hpp index e85d9b9049..3d3481cb22 100644 --- a/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/source/adapters/level_zero/v2/command_list_manager.hpp @@ -99,6 +99,27 @@ struct ur_command_list_manager : public _ur_object { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, + size_t patternSize, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + + ur_result_t appendUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent); ze_command_list_handle_t getZeCommandList(); wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents, @@ -107,6 +128,12 @@ struct ur_command_list_manager : public _ur_object { ur_command_t commandType); private: + ur_result_t appendGenericFillUnlocked( + ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, + const void *pPattern, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + ur_command_t commandType); + ur_result_t appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, diff --git a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index d33ac12f7e..3f88161b90 100644 --- a/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -372,16 +372,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferFill( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferFill"); - auto hBuffer = hMem->getBuffer(); - - UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - - std::scoped_lock lock(this->Mutex, - hBuffer->getMutex()); + UR_CALL(commandListManager.appendMemBufferFill( + hMem, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, phEvent)); - return enqueueGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, - size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_FILL); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageRead( @@ -550,63 +545,16 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType) { - auto zeSignalEvent = getSignalEvent(phEvent, commandType); - - auto waitListView = getWaitListView(phEventWaitList, 
numEventsInWaitList); - - auto pDst = ur_cast(hBuffer->getDevicePtr( - hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, - [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, - size, nullptr, waitListView.num, - waitListView.handles)); - waitListView.clear(); - })); - - // PatternSize must be a power of two for zeCommandListAppendMemoryFill. - // When it's not, the fill is emulated with zeCommandListAppendMemoryCopy. - if (isPowerOf2(patternSize)) { - ZE2UR_CALL(zeCommandListAppendMemoryFill, - (commandListManager.getZeCommandList(), pDst, pPattern, - patternSize, size, zeSignalEvent, waitListView.num, - waitListView.handles)); - } else { - // Copy pattern into every entry in memory array pointed by Ptr. - uint32_t numOfCopySteps = size / patternSize; - const void *src = pPattern; - - for (uint32_t step = 0; step < numOfCopySteps; ++step) { - void *dst = reinterpret_cast(reinterpret_cast(pDst) + - step * patternSize); - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (commandListManager.getZeCommandList(), dst, src, patternSize, - step == numOfCopySteps - 1 ? zeSignalEvent : nullptr, - waitListView.num, waitListView.handles)); - waitListView.clear(); - } - } - - return UR_RESULT_SUCCESS; -} - ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFill"); - std::scoped_lock lock(this->Mutex); - - ur_usm_handle_t dstHandle(hContext, size, pMem); - return enqueueGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_USM_FILL); + UR_CALL(commandListManager.appendUSMFill(pMem, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, + phEvent)); + return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( @@ -628,29 +576,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMPrefetch"); - - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); - - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (commandListManager.getZeCommandList(), pMem, size)); - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } - + UR_CALL(commandListManager.appendUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent)); return UR_RESULT_SUCCESS; } @@ -660,31 +587,7 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); - std::ignore = flags; - - std::scoped_lock lock(this->Mutex); - - auto zeAdvice = ur_cast(advice); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - - auto 
[pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); - - if (pWaitEvents) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListManager.getZeCommandList(), numWaitEvents, pWaitEvents)); - } - - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemAdvise, - (commandListManager.getZeCommandList(), this->hDevice->ZeDevice, - pMem, size, zeAdvice)); - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListManager.getZeCommandList(), zeSignalEvent)); - } + UR_CALL(commandListManager.appendUSMAdvise(pMem, size, advice, phEvent)); return UR_RESULT_SUCCESS; } From b77fc2bbd49dde749f7a9b286d972a92d5780a3b Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Thu, 20 Feb 2025 09:03:14 +0000 Subject: [PATCH 05/13] Make command-buffer creation descriptor mandatory (#17058) As discussed in https://github.com/oneapi-src/unified-runtime/pull/2670#discussion_r1946398932 the `pCommandBufferDesc` parameter to `urCommandBufferCreateExp` is optional. However, the UR spec doesn't state what the configuration of the created command-buffer is when this isn't passed, and being optional is also inconsistent with the description parameters to urSamplerCreate & urMemImageCreate which are not optional. This PR updates the descriptor parameter to command-buffer creation to be mandatory to address these concerns. Closes https://github.com/oneapi-src/unified-runtime/issues/2673 **Note**: This UR patch was previously approved and ready-to-merge in https://github.com/oneapi-src/unified-runtime/pull/2676 prior to the repo move --- include/ur_api.h | 3 +- scripts/core/exp-command-buffer.yml | 2 +- source/adapters/cuda/command_buffer.cpp | 5 +--- source/adapters/hip/command_buffer.cpp | 4 +-- source/adapters/level_zero/command_buffer.cpp | 9 ++---- source/adapters/mock/ur_mockddi.cpp | 2 +- source/adapters/opencl/command_buffer.cpp | 3 +- source/loader/layers/tracing/ur_trcddi.cpp | 2 +- source/loader/layers/validation/ur_valddi.cpp | 5 +++- source/loader/ur_ldrddi.cpp | 2 +- source/loader/ur_libapi.cpp | 3 +- source/ur_api.cpp | 3 +- .../conformance/exp_command_buffer/fixtures.h | 29 ++++++++++--------- .../exp_command_buffer/kernel_event_sync.cpp | 5 +++- .../update/invalid_update.cpp | 5 +++- .../urMultiDeviceProgramCreateWithBinary.cpp | 7 +++-- 16 files changed, 49 insertions(+), 40 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index d7621bda32..6c15e39e09 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -10111,6 +10111,7 @@ typedef struct ur_exp_command_buffer_command_handle_t_ /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -10125,7 +10126,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer); diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 218c626423..680bc60f8d 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -282,7 +282,7 @@ params: desc: "[in] Handle of the device object." 
- type: "const $x_exp_command_buffer_desc_t*" name: pCommandBufferDesc - desc: "[in][optional] command-buffer descriptor." + desc: "[in] Command-buffer descriptor." - type: "$x_exp_command_buffer_handle_t*" name: phCommandBuffer desc: "[out][alloc] Pointer to command-Buffer handle." diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 0d274e6c15..42ec8dbafc 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -352,10 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index 4c65e61dab..ce07332ce8 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -233,9 +233,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_exp_command_buffer_desc_t *pCommandBufferDesc, ur_exp_command_buffer_handle_t *phCommandBuffer) { - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; - + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; try { *phCommandBuffer = new ur_exp_command_buffer_handle_t_(hContext, hDevice, IsUpdatable); diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 879ee0f1cc..5d6ad6c68b 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -586,9 +586,7 @@ bool canBeInOrder(ur_context_handle_t Context, bool CompatibleDriver = Context->getPlatform()->isDriverVersionNewerOrSimilar( 1, 3, L0_DRIVER_INORDER_MIN_VERSION); bool CanUseDriverInOrderLists = CompatibleDriver && DriverInOrderRequested; - return CanUseDriverInOrderLists - ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) - : false; + return CanUseDriverInOrderLists ? CommandBufferDesc->isInOrder : false; } /** @@ -624,9 +622,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, ur_exp_command_buffer_handle_t *CommandBuffer) { bool IsInOrder = canBeInOrder(Context, CommandBufferDesc); - bool EnableProfiling = - CommandBufferDesc && CommandBufferDesc->enableProfiling && !IsInOrder; - bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable; + bool EnableProfiling = CommandBufferDesc->enableProfiling && !IsInOrder; + bool IsUpdatable = CommandBufferDesc->isUpdatable; bool ImmediateAppendPath = checkImmediateAppendSupport(Context, Device); const bool WaitEventPath = !ImmediateAppendPath; bool UseCounterBasedEvents = checkCounterBasedEventsSupport(Device) && diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 6d5034d07b..9a4ace593e 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -8403,7 +8403,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. 
ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index d78ef0121b..1c1bb0d18a 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -43,8 +43,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( CLContext, cl_ext::ExtFuncPtrCache->clCreateCommandBufferKHRCache, cl_ext::CreateCommandBufferName, &clCreateCommandBufferKHR)); - const bool IsUpdatable = - pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; + const bool IsUpdatable = pCommandBufferDesc->isUpdatable; ur_device_command_buffer_update_capability_flags_t UpdateCapabilities; cl_device_id CLDevice = cl_adapter::cast(hDevice); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index f4a7b7e60a..e716eaaa49 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -6978,7 +6978,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index eb2bd4c353..acb1f65c94 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -7615,7 +7615,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) { @@ -7632,6 +7632,9 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( if (NULL == hDevice) return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + if (NULL == pCommandBufferDesc) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (NULL == phCommandBuffer) return UR_RESULT_ERROR_INVALID_NULL_POINTER; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ec6081509f..c1e21fd58b 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7044,7 +7044,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 5761dab3a4..42d905964f 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -7619,6 +7619,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -7633,7 +7634,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. ur_exp_command_buffer_handle_t *phCommandBuffer) try { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 7023161cb1..80c034cbd0 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -6664,6 +6664,7 @@ ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( /// + `NULL == hContext` /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pCommandBufferDesc` /// + `NULL == phCommandBuffer` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_DEVICE @@ -6678,7 +6679,7 @@ ur_result_t UR_APICALL urCommandBufferCreateExp( ur_context_handle_t hContext, /// [in] Handle of the device object. ur_device_handle_t hDevice, - /// [in][optional] command-buffer descriptor. + /// [in] Command-buffer descriptor. const ur_exp_command_buffer_desc_t *pCommandBufferDesc, /// [out][alloc] Pointer to command-Buffer handle. 
ur_exp_command_buffer_handle_t *phCommandBuffer) { diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 0cd4d7a6b3..e0ddc0b389 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -60,8 +60,12 @@ struct urCommandBufferExpTest : uur::urContextTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false, + }; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -83,8 +87,11 @@ struct urCommandBufferExpTestWithParam : urQueueTestWithParam { UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(this->device)); - ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, - nullptr, &cmd_buf_handle)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; + ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, &desc, + &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -105,8 +112,11 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, + nullptr, false, false, false}; ASSERT_SUCCESS( - urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); + urCommandBufferCreateExp(context, device, &desc, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); } @@ -333,15 +343,8 @@ struct urCommandEventSyncTest : urCommandBufferExpTest { ASSERT_NE(buffer, nullptr); } - // Create a command-buffer with update enabled. 
-    ur_exp_command_buffer_desc_t desc{
-        /*.stype=*/UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC,
-        /*.pNext =*/nullptr,
-        /*.isUpdatable =*/false,
-        /*.isInOrder =*/false,
-        /*.enableProfiling =*/false,
-    };
-
+    ur_exp_command_buffer_desc_t desc{UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC,
+                                      nullptr, true, false, false};
     ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc,
                                             &second_cmd_buf_handle));
     ASSERT_NE(second_cmd_buf_handle, nullptr);
diff --git a/test/conformance/exp_command_buffer/kernel_event_sync.cpp b/test/conformance/exp_command_buffer/kernel_event_sync.cpp
index 84941ab26d..60844ffdd7 100644
--- a/test/conformance/exp_command_buffer/kernel_event_sync.cpp
+++ b/test/conformance/exp_command_buffer/kernel_event_sync.cpp
@@ -40,7 +40,10 @@ struct KernelCommandEventSyncTest
     ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 3, nullptr, device_ptrs[1]));

     // Create second command-buffer
-    ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr,
+    ur_exp_command_buffer_desc_t desc{
+        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false,
+    };
+    ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc,
                                             &second_cmd_buf_handle));
     ASSERT_NE(second_cmd_buf_handle, nullptr);
   }
diff --git a/test/conformance/exp_command_buffer/update/invalid_update.cpp b/test/conformance/exp_command_buffer/update/invalid_update.cpp
index 9c3da1ddd5..b93ffc26d4 100644
--- a/test/conformance/exp_command_buffer/update/invalid_update.cpp
+++ b/test/conformance/exp_command_buffer/update/invalid_update.cpp
@@ -109,8 +109,11 @@ TEST_P(InvalidUpdateTest, NotFinalizedCommandBuffer) {
 TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) {
   // Create a command-buffer without isUpdatable
   ur_exp_command_buffer_handle_t test_cmd_buf_handle = nullptr;
+  ur_exp_command_buffer_desc_t desc{
+      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false,
+  };
   ASSERT_SUCCESS(
-      urCommandBufferCreateExp(context, device, nullptr, &test_cmd_buf_handle));
+      urCommandBufferCreateExp(context, device, &desc, &test_cmd_buf_handle));
   EXPECT_NE(test_cmd_buf_handle, nullptr);

   // Append a kernel commands to command-buffer and close command-buffer
diff --git a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp
index 45309c3dee..0094caa274 100644
--- a/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp
+++ b/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp
@@ -311,9 +311,12 @@ TEST_P(urMultiDeviceCommandBufferExpTest, Enqueue) {
   }

   // Create command-buffer
+  ur_exp_command_buffer_desc_t desc{
+      UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, false, false, false,
+  };
   uur::raii::CommandBuffer cmd_buf_handle;
-  ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr,
-                                          cmd_buf_handle.ptr()));
+  ASSERT_SUCCESS(
+      urCommandBufferCreateExp(context, device, &desc, cmd_buf_handle.ptr()));

   // Append kernel command to command-buffer and close command-buffer
   ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp(

From eb471a7127f9f8c5abda89f68e22a1ffe40647ac Mon Sep 17 00:00:00 2001
From: Ewan Crawford
Date: Thu, 20 Feb 2025 10:18:34 +0000
Subject: [PATCH 06/13] In-order path for OpenCL command-buffers (#17056)

After the [spec bump of cl_khr_command_buffer to 0.9.7](https://github.com/KhronosGroup/OpenCL-Docs/),
in the OpenCL adapter we no longer need to worry about the in-order/out-of-order
property of the internal queue used on command-buffer creation matching the queue used to
enqueue the command-buffer. We can therefore take advantage of the in-order flag passed on UR command-buffer creation to use an in-order queue for command-buffer creation, and omit using sync points. **Note:** This UR patch was previously approved and ready-to-merge prior to the UR repo move in https://github.com/oneapi-src/unified-runtime/pull/2681 --- source/adapters/opencl/command_buffer.cpp | 47 ++++++++++++++++++----- source/adapters/opencl/command_buffer.hpp | 6 ++- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 1c1bb0d18a..3707dd4e5d 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -34,7 +34,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( ur_exp_command_buffer_handle_t *phCommandBuffer) { ur_queue_handle_t Queue = nullptr; - UR_RETURN_ON_FAILURE(urQueueCreate(hContext, hDevice, nullptr, &Queue)); + ur_queue_properties_t QueueProperties = {UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + nullptr, 0}; + const bool IsInOrder = + pCommandBufferDesc ? pCommandBufferDesc->isInOrder : false; + if (!IsInOrder) { + QueueProperties.flags = UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + UR_RETURN_ON_FAILURE( + urQueueCreate(hContext, hDevice, &QueueProperties, &Queue)); cl_context CLContext = cl_adapter::cast(hContext); cl_ext::clCreateCommandBufferKHR_fn clCreateCommandBufferKHR = nullptr; @@ -66,7 +74,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( try { auto URCommandBuffer = std::make_unique( - Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable); + Queue, hContext, hDevice, CLCommandBuffer, IsUpdatable, IsInOrder); *phCommandBuffer = URCommandBuffer.release(); } catch (...) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -147,11 +155,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( cl_command_properties_khr *Properties = hCommandBuffer->IsUpdatable ? UpdateProperties : nullptr; + + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandNDRangeKernelKHR( hCommandBuffer->CLCommandBuffer, nullptr, Properties, cl_adapter::cast(hKernel), workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint, OutCommandHandle)); + pGlobalWorkSize, pLocalWorkSize, WaitListSize, SyncPointWaitList, + RetSyncPoint, OutCommandHandle)); try { auto Handle = std::make_unique( @@ -218,11 +232,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferKHRCache, cl_ext::CommandCopyBufferName, &clCommandCopyBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 
0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), - srcOffset, dstOffset, size, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + srcOffset, dstOffset, size, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -256,12 +275,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandCopyBufferRectKHRCache, cl_ext::CommandCopyBufferRectName, &clCommandCopyBufferRectKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandCopyBufferRectKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hSrcMem), cl_adapter::cast(hDstMem), OpenCLOriginRect, OpenCLDstRect, OpenCLRegion, srcRowPitch, srcSlicePitch, - dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, pSyncPointWaitList, - pSyncPoint, nullptr)); + dstRowPitch, dstSlicePitch, WaitListSize, SyncPointWaitList, RetSyncPoint, + nullptr)); return UR_RESULT_SUCCESS; } @@ -360,10 +384,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( CLContext, cl_ext::ExtFuncPtrCache->clCommandFillBufferKHRCache, cl_ext::CommandFillBufferName, &clCommandFillBufferKHR)); + const bool IsInOrder = hCommandBuffer->IsInOrder; + cl_sync_point_khr *RetSyncPoint = IsInOrder ? nullptr : pSyncPoint; + const cl_sync_point_khr *SyncPointWaitList = + IsInOrder ? nullptr : pSyncPointWaitList; + uint32_t WaitListSize = IsInOrder ? 0 : numSyncPointsInWaitList; CL_RETURN_ON_FAILURE(clCommandFillBufferKHR( hCommandBuffer->CLCommandBuffer, nullptr, nullptr, cl_adapter::cast(hBuffer), pPattern, patternSize, offset, size, - numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, nullptr)); + WaitListSize, SyncPointWaitList, RetSyncPoint, nullptr)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/command_buffer.hpp b/source/adapters/opencl/command_buffer.hpp index 83fedc3c1e..6fbdc5d259 100644 --- a/source/adapters/opencl/command_buffer.hpp +++ b/source/adapters/opencl/command_buffer.hpp @@ -46,6 +46,8 @@ struct ur_exp_command_buffer_handle_t_ { /// Set to true if the kernel commands in the command-buffer can be updated, /// false otherwise bool IsUpdatable; + /// Set to true if the command-buffer was created with an in-order queue. + bool IsInOrder; /// Set to true if the command-buffer has been finalized, false otherwise bool IsFinalized; /// List of commands in the command-buffer. 
@@ -58,10 +60,10 @@ struct ur_exp_command_buffer_handle_t_ { ur_context_handle_t hContext, ur_device_handle_t hDevice, cl_command_buffer_khr CLCommandBuffer, - bool IsUpdatable) + bool IsUpdatable, bool IsInOrder) : hInternalQueue(hQueue), hContext(hContext), hDevice(hDevice), CLCommandBuffer(CLCommandBuffer), IsUpdatable(IsUpdatable), - IsFinalized(false), RefCount(0) {} + IsInOrder(IsInOrder), IsFinalized(false), RefCount(0) {} ~ur_exp_command_buffer_handle_t_(); From a239c8f728d1d7be4814bf660332d4635baefbd4 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Thu, 20 Feb 2025 10:47:46 +0000 Subject: [PATCH 07/13] Don't set -pie on shared objects (#16880) Fixes #16677 by only setting `-pie` linker option in Release builds on executables rather than on any type of target. --- cmake/helpers.cmake | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index bdd4a6d15c..ec945ba2c2 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -157,11 +157,6 @@ function(add_ur_target_link_options name) if (UR_DEVELOPER_MODE) target_link_options(${name} PRIVATE -Werror -Wextra) endif() - if (CMAKE_BUILD_TYPE STREQUAL "Release") - target_link_options(${name} PRIVATE - $<$:-pie> - ) - endif() endif() elseif(MSVC) target_link_options(${name} PRIVATE @@ -176,7 +171,15 @@ function(add_ur_target_link_options name) endfunction() function(add_ur_target_exec_options name) - if(MSVC) + if(NOT MSVC) + if(NOT APPLE) + if(CMAKE_BUILD_TYPE STREQUAL "Release") + target_link_options(${name} PRIVATE + $<$:-pie> + ) + endif() + endif() + elseif(MSVC) target_link_options(${name} PRIVATE LINKER:/ALLOWISOLATION ) From a6e7ea77bcf8c6737fb6712dcdbbd878710303fa Mon Sep 17 00:00:00 2001 From: Yang Zhao Date: Thu, 20 Feb 2025 18:49:49 +0800 Subject: [PATCH 08/13] Fix throw "UR_RESULT_ERROR_INVALID_ARGUMENT" exception when catching free related error (#16706) UR: https://github.com/oneapi-src/unified-runtime/pull/2592 --------- Co-authored-by: Kenneth Benzie (Benie) --- .../sanitizer/asan/asan_interceptor.cpp | 20 +++++++++++++++---- .../layers/sanitizer/asan/asan_options.cpp | 1 + .../layers/sanitizer/asan/asan_options.hpp | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index d5d5c2c729..d99c0545c3 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -175,7 +175,10 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, if (!AllocInfoItOp) { // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } auto AllocInfoIt = *AllocInfoItOp; @@ -190,17 +193,26 @@ ur_result_t AsanInterceptor::releaseMemory(ur_context_handle_t Context, // "Addr" might be a host pointer ReportBadFree(Addr, GetCurrentBacktrace(), nullptr); } - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (Addr != AllocInfo->UserBegin) { ReportBadFree(Addr, GetCurrentBacktrace(), AllocInfo); - return UR_RESULT_ERROR_INVALID_ARGUMENT; + if (getOptions().HaltOnError) { + exitWithErrors(); + } + return UR_RESULT_SUCCESS; } if (AllocInfo->IsReleased) { ReportDoubleFree(Addr, GetCurrentBacktrace(), AllocInfo); - return 
UR_RESULT_ERROR_INVALID_ARGUMENT;
+    if (getOptions().HaltOnError) {
+      exitWithErrors();
+    }
+    return UR_RESULT_SUCCESS;
   }

   AllocInfo->IsReleased = true;
diff --git a/source/loader/layers/sanitizer/asan/asan_options.cpp b/source/loader/layers/sanitizer/asan/asan_options.cpp
index 7704547d38..799c892ff1 100644
--- a/source/loader/layers/sanitizer/asan/asan_options.cpp
+++ b/source/loader/layers/sanitizer/asan/asan_options.cpp
@@ -90,6 +90,7 @@ AsanOptions::AsanOptions() {
   SetBoolOption("detect_privates", DetectPrivates);
   SetBoolOption("print_stats", PrintStats);
   SetBoolOption("detect_leaks", DetectLeaks);
+  SetBoolOption("halt_on_error", HaltOnError);

   auto KV = OptionsEnvMap->find("quarantine_size_mb");
   if (KV != OptionsEnvMap->end()) {
diff --git a/source/loader/layers/sanitizer/asan/asan_options.hpp b/source/loader/layers/sanitizer/asan/asan_options.hpp
index 4da32ca278..cea30351d3 100644
--- a/source/loader/layers/sanitizer/asan/asan_options.hpp
+++ b/source/loader/layers/sanitizer/asan/asan_options.hpp
@@ -28,6 +28,7 @@ struct AsanOptions {
   bool PrintStats = false;
   bool DetectKernelArguments = true;
   bool DetectLeaks = true;
+  bool HaltOnError = true;

   explicit AsanOptions();
 };

From 7697d3787208a19c0ab01723f4848e2529974dd2 Mon Sep 17 00:00:00 2001
From: Ross Brunton
Date: Thu, 20 Feb 2025 12:21:26 +0000
Subject: [PATCH 09/13] Make each profiling info variant for
 urEventGetProfilingInfo optional and improve its conformance test (#17067)

Migrated from https://github.com/oneapi-src/unified-runtime/pull/2533

This patch makes all of the values returned by urEventGetProfilingInfo
optional and updates adapters to handle this by returning the appropriate
enum when a query is not supported. The tests have also been updated to
ensure that returning a counter of "0", or values equal to those of the
previous profiling event, is no longer considered a failure.
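For illustration, a minimal sketch (not code from this series, and the helper
name is only illustrative) of how a caller can treat these now-optional
counters, assuming the public `ur_api.h` header and a valid event from a
completed command:

```cpp
#include <cstdint>
#include <optional>
#include <ur_api.h>

// Returns the counter value, or std::nullopt when the adapter reports the
// query as unsupported (now a legal outcome rather than a failure).
std::optional<uint64_t> getProfilingCounter(ur_event_handle_t event,
                                            ur_profiling_info_t info) {
  uint64_t value = 0;
  ur_result_t result =
      urEventGetProfilingInfo(event, info, sizeof(value), &value, nullptr);
  if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) {
    return std::nullopt; // Optional query: skip it, don't fail.
  }
  if (result != UR_RESULT_SUCCESS) {
    return std::nullopt; // Real error; production code should report it.
  }
  return value;
}
```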
--- include/ur_api.h | 24 ++++--- scripts/core/event.yml | 12 ++-- source/adapters/cuda/event.cpp | 2 + source/adapters/hip/event.cpp | 2 + source/adapters/level_zero/event.cpp | 12 ++++ source/adapters/native_cpu/event.cpp | 1 + source/loader/ur_libapi.cpp | 2 + source/ur_api.cpp | 2 + .../event/urEventGetProfilingInfo.cpp | 71 +++++++++---------- .../testing/include/uur/optional_queries.h | 12 ++++ 10 files changed, 85 insertions(+), 55 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 6c15e39e09..c390ed4410 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7105,21 +7105,21 @@ typedef enum ur_event_info_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Profiling query information type typedef enum ur_profiling_info_t { - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is enqueued + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is enqueued UR_PROFILING_INFO_COMMAND_QUEUED = 0, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event is submitted + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event is submitted UR_PROFILING_INFO_COMMAND_SUBMIT = 1, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event starts execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event starts execution UR_PROFILING_INFO_COMMAND_START = 2, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event has finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event has finished execution UR_PROFILING_INFO_COMMAND_END = 3, - /// [uint64_t] A 64-bit value of current device counter in nanoseconds - /// when the event and any child events enqueued by this event on the - /// device have finished execution + /// [uint64_t][optional-query] A 64-bit value of current device counter in + /// nanoseconds when the event and any child events enqueued by this event + /// on the device have finished execution UR_PROFILING_INFO_COMMAND_COMPLETE = 4, /// @cond UR_PROFILING_INFO_FORCE_UINT32 = 0x7fffffff @@ -7193,6 +7193,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. 
UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, diff --git a/scripts/core/event.yml b/scripts/core/event.yml index da2fa457de..1b0eeca23e 100644 --- a/scripts/core/event.yml +++ b/scripts/core/event.yml @@ -121,15 +121,15 @@ name: $x_profiling_info_t typed_etors: True etors: - name: COMMAND_QUEUED - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is enqueued" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is enqueued" - name: COMMAND_SUBMIT - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event is submitted" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event is submitted" - name: COMMAND_START - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event starts execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event starts execution" - name: COMMAND_END - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event has finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event has finished execution" - name: COMMAND_COMPLETE - desc: "[uint64_t] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" + desc: "[uint64_t][optional-query] A 64-bit value of current device counter in nanoseconds when the event and any child events enqueued by this event on the device have finished execution" --- #-------------------------------------------------------------------------- type: function desc: "Get event object information" @@ -198,6 +198,8 @@ returns: - $X_RESULT_ERROR_INVALID_EVENT - $X_RESULT_ERROR_OUT_OF_RESOURCES - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY + - $X_RESULT_ERROR_UNSUPPORTED_ENUMERATION: + - "If `propName` is not supported by the adapter." --- #-------------------------------------------------------------------------- type: function desc: "Wait for a list of events to finish." 
diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index a6c2208e8f..37792bf5b2 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -213,6 +213,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 81c839cf32..d971207a6c 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -234,6 +234,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return ReturnValue(static_cast(hEvent->getStartTime())); case UR_PROFILING_INFO_COMMAND_END: return ReturnValue(static_cast(hEvent->getEndTime())); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index e12352b6b1..cfcd1d316d 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -609,6 +609,10 @@ ur_result_t urEventGetProfilingInfo( return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -672,6 +676,10 @@ ur_result_t urEventGetProfilingInfo( ContextEndTime *= ZeTimerResolution; return ReturnValue(ContextEndTime); } + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: " + "UR_PROFILING_INFO_COMMAND_COMPLETE not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; @@ -715,6 +723,10 @@ ur_result_t urEventGetProfilingInfo( // enqueue. // return ReturnValue(uint64_t{0}); + case UR_PROFILING_INFO_COMMAND_COMPLETE: + logger::error("urEventGetProfilingInfo: UR_PROFILING_INFO_COMMAND_COMPLETE " + "not supported"); + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: logger::error("urEventGetProfilingInfo: not supported ParamName"); return UR_RESULT_ERROR_INVALID_VALUE; diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 87865905c1..f981d24f42 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -52,6 +52,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( case UR_PROFILING_INFO_COMMAND_QUEUED: case UR_PROFILING_INFO_COMMAND_SUBMIT: case UR_PROFILING_INFO_COMMAND_COMPLETE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: break; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 42d905964f..e579753763 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -4733,6 +4733,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. 
ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 80c034cbd0..c5651c0fc4 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -4138,6 +4138,8 @@ ur_result_t UR_APICALL urEventGetInfo( /// - ::UR_RESULT_ERROR_INVALID_EVENT /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY +/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION +/// + If `propName` is not supported by the adapter. ur_result_t UR_APICALL urEventGetProfilingInfo( /// [in] handle of the event object ur_event_handle_t hEvent, diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp index 16723df5cf..e63b6078de 100644 --- a/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/test/conformance/event/urEventGetProfilingInfo.cpp @@ -11,8 +11,6 @@ using urEventGetProfilingInfoTest = uur::event::urEventTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventGetProfilingInfoTest); TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_QUEUED; size_t property_size = 0; @@ -29,8 +27,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandQueued) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandSubmit) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_SUBMIT; size_t property_size = 0; @@ -79,9 +75,6 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandEnd) { } TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::NativeCPU{}); - const ur_profiling_info_t property_name = UR_PROFILING_INFO_COMMAND_COMPLETE; size_t property_size = 0; @@ -98,41 +91,41 @@ TEST_P(urEventGetProfilingInfoTest, SuccessCommandComplete) { } TEST_P(urEventGetProfilingInfoTest, Success) { - UUR_KNOWN_FAILURE_ON(uur::CUDA{}, uur::HIP{}, uur::LevelZero{}, - uur::LevelZeroV2{}, uur::NativeCPU{}); - - uint8_t size = 8; - - uint64_t queued_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_QUEUED, size, &queued_value, nullptr)); - ASSERT_NE(queued_value, 0); - - uint64_t submit_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo( - event, UR_PROFILING_INFO_COMMAND_SUBMIT, size, &submit_value, nullptr)); - ASSERT_NE(submit_value, 0); - - uint64_t start_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, - size, &start_value, nullptr)); - ASSERT_NE(start_value, 0); + // AMD devices may report a "start" time before the "submit" time + UUR_KNOWN_FAILURE_ON(uur::HIP{}); + + // If a and b are supported, asserts that a <= b + auto test_timing = [=](ur_profiling_info_t a, ur_profiling_info_t b) { + std::stringstream trace{"Profiling Info: "}; + trace << a << " <= " << b; + SCOPED_TRACE(trace.str()); + uint64_t a_time; + auto result = + urEventGetProfilingInfo(event, a, sizeof(a_time), &a_time, nullptr); + if (result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t end_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, - size, &end_value, nullptr)); - ASSERT_NE(end_value, 0); + uint64_t b_time; + result = + urEventGetProfilingInfo(event, b, sizeof(b_time), &b_time, nullptr); + if 
(result == UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION) { + return; + } + ASSERT_SUCCESS(result); - uint64_t complete_value = 0; - ASSERT_SUCCESS(urEventGetProfilingInfo(event, - UR_PROFILING_INFO_COMMAND_COMPLETE, - size, &complete_value, nullptr)); - ASSERT_NE(complete_value, 0); + // Note: This assumes that the counter doesn't overflow + ASSERT_LE(a_time, b_time); + }; - ASSERT_LE(queued_value, submit_value); - ASSERT_LT(submit_value, start_value); - ASSERT_LT(start_value, end_value); - ASSERT_LE(end_value, complete_value); + test_timing(UR_PROFILING_INFO_COMMAND_QUEUED, + UR_PROFILING_INFO_COMMAND_SUBMIT); + test_timing(UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START); + test_timing(UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END); + test_timing(UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE); } TEST_P(urEventGetProfilingInfoTest, InvalidNullHandle) { diff --git a/test/conformance/testing/include/uur/optional_queries.h b/test/conformance/testing/include/uur/optional_queries.h index d3f670545a..e0a78c18f5 100644 --- a/test/conformance/testing/include/uur/optional_queries.h +++ b/test/conformance/testing/include/uur/optional_queries.h @@ -117,4 +117,16 @@ template <> inline bool isQueryOptional(ur_queue_info_t query) { query) != optional_ur_queue_info_t.end(); } +constexpr std::array optional_ur_profiling_info_t = { + UR_PROFILING_INFO_COMMAND_QUEUED, UR_PROFILING_INFO_COMMAND_SUBMIT, + UR_PROFILING_INFO_COMMAND_START, UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE, +}; + +template <> inline bool isQueryOptional(ur_profiling_info_t query) { + return std::find(optional_ur_profiling_info_t.begin(), + optional_ur_profiling_info_t.end(), + query) != optional_ur_profiling_info_t.end(); +} + } // namespace uur From fd4508a5ef7e9c3fd22d8bfec8c7634b2b045194 Mon Sep 17 00:00:00 2001 From: Ben Tracy Date: Thu, 20 Feb 2025 12:48:49 +0000 Subject: [PATCH 10/13] Fix issue with command-buffer local mem update (#17069) - Fix group count not being recalculated when a user only passes a new local work size and no new global size - Remove CTS test skips for local update on L0 --- source/adapters/level_zero/command_buffer.cpp | 39 ++++++++++++------- source/adapters/level_zero/command_buffer.hpp | 11 ++++++ .../update/local_memory_update.cpp | 6 --- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 5d6ad6c68b..4705964190 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -907,17 +907,18 @@ ur_result_t setKernelPendingArguments( * @param[in] CommandBuffer The CommandBuffer associated with the new command. * @param[in] Kernel The Kernel associated with the new command. * @param[in] WorkDim Dimensions of the kernel associated with the new command. + * @param[in] GlobalWorkSize Global work size of the kernel associated with the + * new command. * @param[in] LocalWorkSize LocalWorkSize of the kernel associated with the new * command. * @param[out] Command The handle to the new command. 
* @return UR_RESULT_SUCCESS or an error code on failure */ -ur_result_t -createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, - ur_kernel_handle_t Kernel, uint32_t WorkDim, - const size_t *LocalWorkSize, uint32_t NumKernelAlternatives, - ur_kernel_handle_t *KernelAlternatives, - ur_exp_command_buffer_command_handle_t *Command) { +ur_result_t createCommandHandle( + ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, + uint32_t WorkDim, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t NumKernelAlternatives, ur_kernel_handle_t *KernelAlternatives, + ur_exp_command_buffer_command_handle_t *Command) { assert(CommandBuffer->IsUpdatable); @@ -989,6 +990,8 @@ createCommandHandle(ur_exp_command_buffer_handle_t CommandBuffer, CommandBuffer, Kernel, CommandId, WorkDim, LocalWorkSize != nullptr, NumKernelAlternatives, KernelAlternatives); + NewCommand->setGlobalWorkSize(GlobalWorkSize); + *Command = NewCommand.get(); CommandBuffer->CommandHandles.push_back(std::move(NewCommand)); @@ -1063,9 +1066,9 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( } if (Command) { - UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, LocalWorkSize, - NumKernelAlternatives, KernelAlternatives, - Command)); + UR_CALL(createCommandHandle(CommandBuffer, Kernel, WorkDim, GlobalWorkSize, + LocalWorkSize, NumKernelAlternatives, + KernelAlternatives, Command)); } std::vector ZeEventList; ze_event_handle_t ZeLaunchEvent = nullptr; @@ -1919,10 +1922,16 @@ ur_result_t updateKernelCommand( Descs.push_back(std::move(MutableGroupSizeDesc)); } - // Check if a new global size is provided and if we need to update the group - // count. + // Check if a new global or local size is provided and if so we need to update + // the group count. ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; - if (NewGlobalWorkSize && Dim > 0) { + if ((NewGlobalWorkSize || NewLocalWorkSize) && Dim > 0) { + // If a new global work size is provided update that in the command, + // otherwise the previous work group size will be used + if (NewGlobalWorkSize) { + Command->WorkDim = Dim; + Command->setGlobalWorkSize(NewGlobalWorkSize); + } // If a new global work size is provided but a new local work size is not // then we still need to update local work size based on the size suggested // by the driver for the kernel. 
@@ -1932,9 +1941,9 @@ ur_result_t updateKernelCommand(
     UR_CALL(getZeKernel(ZeDevice, Command->Kernel, &ZeKernel));

     uint32_t WG[3];
-    UR_CALL(calculateKernelWorkDimensions(ZeKernel, CommandBuffer->Device,
-                                          ZeThreadGroupDimensions, WG, Dim,
-                                          NewGlobalWorkSize, NewLocalWorkSize));
+    UR_CALL(calculateKernelWorkDimensions(
+        ZeKernel, CommandBuffer->Device, ZeThreadGroupDimensions, WG, Dim,
+        Command->GlobalWorkSize, NewLocalWorkSize));

     auto MutableGroupCountDesc =
         std::make_unique<ZeStruct<ze_mutable_group_count_exp_desc_t>>();
diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp
index c9389eaf11..491cb2638e 100644
--- a/source/adapters/level_zero/command_buffer.hpp
+++ b/source/adapters/level_zero/command_buffer.hpp
@@ -172,8 +172,19 @@ struct kernel_command_handle : public ur_exp_command_buffer_command_handle_t_ {

   ~kernel_command_handle();

+  void setGlobalWorkSize(const size_t *GlobalWorkSizePtr) {
+    const size_t CopySize = sizeof(size_t) * WorkDim;
+    std::memcpy(GlobalWorkSize, GlobalWorkSizePtr, CopySize);
+    if (WorkDim < 3) {
+      const size_t ZeroSize = sizeof(size_t) * (3 - WorkDim);
+      std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize);
+    }
+  }
+
   // Work-dimension the command was originally created with.
   uint32_t WorkDim;
+  // Global work size of the kernel
+  size_t GlobalWorkSize[3];
   // Set to true if the user set the local work size on command creation.
   bool UserDefinedLocalSize;
   // Currently active kernel handle
diff --git a/test/conformance/exp_command_buffer/update/local_memory_update.cpp b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
index 50c690fb89..1621b902a9 100644
--- a/test/conformance/exp_command_buffer/update/local_memory_update.cpp
+++ b/test/conformance/exp_command_buffer/update/local_memory_update.cpp
@@ -378,7 +378,6 @@ TEST_P(LocalMemoryUpdateTest, UpdateParametersEmptyLocalSize) {
 // Test updating A,X,Y parameters to new values and local memory parameters
 // to new smaller values.
 TEST_P(LocalMemoryUpdateTest, UpdateParametersSmallerLocalSize) {
-  UUR_KNOWN_FAILURE_ON(uur::LevelZero{});

   // Run command-buffer prior to update an verify output
   ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
@@ -1081,11 +1080,6 @@ struct LocalMemoryUpdateTestBaseOutOfOrder : LocalMemoryUpdateTestBase {
     UUR_RETURN_ON_FATAL_FAILURE(
         urUpdatableCommandBufferExpExecutionTest::SetUp());

-    if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) {
-      GTEST_SKIP()
-          << "Local memory argument update not supported on Level Zero.";
-    }
-
     // HIP has extra args for local memory so we define an offset for arg
     // indices here for updating
     hip_arg_offset = backend == UR_PLATFORM_BACKEND_HIP ? 3 : 0;

From cf84b75d00b290389a7aeb81e55b7839d4b7b6fe Mon Sep 17 00:00:00 2001
From: Harald van Dijk
Date: Thu, 20 Feb 2025 14:07:34 +0000
Subject: [PATCH 11/13] Fix MSVC build. (#17090)

MSVC warns about a possibly uninitialized variable. This is a false
positive, but explicitly initializing the variable is always harmless,
so do that.
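As a reduced illustration of the pattern (the names mirror the diff below):
the handle is assigned on only one branch, which MSVC's flow analysis cannot
always prove safe, so a `nullptr` initialization suppresses the warning at no
runtime cost:

```cpp
ur_event_handle_t event = nullptr; // explicit init silences the MSVC warning
if (phEvent) {
  event = new ur_event_handle_t_(hQueue, command_type);
  event->tick_start();
}
// `event` is only dereferenced later on paths where phEvent was set.
```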
--- source/adapters/native_cpu/enqueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 0ffe9415f1..d1c83a4655 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -269,7 +269,7 @@ ur_result_t withTimingEvent(ur_command_t command_type, ur_queue_handle_t hQueue, ur_event_handle_t *phEvent, const std::function &f) { urEventWait(numEventsInWaitList, phEventWaitList); - ur_event_handle_t event; + ur_event_handle_t event = nullptr; if (phEvent) { event = new ur_event_handle_t_(hQueue, command_type); event->tick_start(); From 99fa00b3a26080ff0de0592e78a9cb2d38321d95 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Fri, 21 Feb 2025 07:04:28 +0100 Subject: [PATCH 12/13] add manually triggered benchmark action (#17088) This is a first step towards reenabling UR performance testing CI. This introduces the reusable yml workflow and a way to trigger it manually. Here's an example how it looks: https://github.com/pbalcer/llvm/pull/2#issuecomment-2671230460 --- scripts/benchmarks/benches/oneapi.py | 2 +- scripts/benchmarks/main.py | 4 +--- scripts/benchmarks/options.py | 2 +- scripts/benchmarks/requirements.txt | 4 ++++ 4 files changed, 7 insertions(+), 5 deletions(-) create mode 100644 scripts/benchmarks/requirements.txt diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py index b6b3f62c5c..0547f6646e 100644 --- a/scripts/benchmarks/benches/oneapi.py +++ b/scripts/benchmarks/benches/oneapi.py @@ -11,7 +11,7 @@ class OneAPI: # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 98765 + ONEAPI_BENCHMARK_INSTANCE_ID = 987654 def __init__(self): self.oneapi_dir = os.path.join(options.workdir, "oneapi") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index 0b932a888d..69237c46f6 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -262,9 +262,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): compare_names.append(saved_name) if options.output_html: - html_content = generate_html( - history.runs, "oneapi-src/unified-runtime", compare_names - ) + html_content = generate_html(history.runs, "intel/llvm", compare_names) with open("benchmark_results.html", "w") as file: file.write(html_content) diff --git a/scripts/benchmarks/options.py b/scripts/benchmarks/options.py index 394766b605..2e92675264 100644 --- a/scripts/benchmarks/options.py +++ b/scripts/benchmarks/options.py @@ -37,7 +37,7 @@ class Options: build_compute_runtime: bool = False extra_ld_libraries: list[str] = field(default_factory=list) extra_env_vars: dict = field(default_factory=dict) - compute_runtime_tag: str = "24.52.32224.10" + compute_runtime_tag: str = "25.05.32567.12" build_igc: bool = False current_run_name: str = "This PR" diff --git a/scripts/benchmarks/requirements.txt b/scripts/benchmarks/requirements.txt new file mode 100644 index 0000000000..99ba0caab5 --- /dev/null +++ b/scripts/benchmarks/requirements.txt @@ -0,0 +1,4 @@ +matplotlib==3.9.2 +mpld3==0.5.10 +dataclasses-json==0.6.7 +PyYAML==6.0.1 From dd7d5c6256f7ec90be9753e6e5a24d06a6b36fd6 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 21 Feb 2025 12:06:17 +0100 Subject: [PATCH 13/13] Use UMF Proxy pool manager with UMF CUDA memory provider in UR (#17015) Use UMF Proxy pool manager with UMF CUDA memory provider in UR. 
UMF Proxy pool manager is just a wrapper for the UMF memory provider (CUDA memory provider in this case) plus it adds also tracking of memory allocations. Moved from: https://github.com/oneapi-src/unified-runtime/pull/2659 Signed-off-by: Lukasz Dorau --- source/adapters/cuda/common.hpp | 38 +++++++ source/adapters/cuda/context.hpp | 33 ++++-- source/adapters/cuda/device.hpp | 17 +++- source/adapters/cuda/memory.cpp | 8 +- source/adapters/cuda/memory.hpp | 4 +- source/adapters/cuda/platform.cpp | 40 ++++++-- source/adapters/cuda/usm.cpp | 161 ++++-------------------------- source/adapters/cuda/usm.hpp | 74 -------------- source/common/umf_helpers.hpp | 54 ++++------ 9 files changed, 157 insertions(+), 272 deletions(-) diff --git a/source/adapters/cuda/common.hpp b/source/adapters/cuda/common.hpp index 67223c45bc..23189ce91f 100644 --- a/source/adapters/cuda/common.hpp +++ b/source/adapters/cuda/common.hpp @@ -12,6 +12,17 @@ #include #include +#include +#include + +#define UMF_RETURN_UMF_ERROR(UmfResult) \ + do { \ + umf_result_t UmfResult_ = (UmfResult); \ + if (UmfResult_ != UMF_RESULT_SUCCESS) { \ + return UmfResult_; \ + } \ + } while (0) + ur_result_t mapErrorUR(CUresult Result); /// Converts CUDA error into UR error codes, and outputs error information @@ -59,3 +70,30 @@ void assertion(bool Condition, const char *Message = nullptr); } // namespace ur } // namespace detail + +namespace umf { + +using cuda_params_unique_handle_t = std::unique_ptr< + umf_cuda_memory_provider_params_t, + std::function>; + +inline umf_result_t setCUMemoryProviderParams( + umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, + int cuDevice, void *cuContext, umf_usm_memory_type_t memType) { + + umf_result_t UmfResult = + umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); + UMF_RETURN_UMF_ERROR(UmfResult); + + UmfResult = + umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); + UMF_RETURN_UMF_ERROR(UmfResult); + + return UMF_RESULT_SUCCESS; +} + +} // namespace umf diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp index 96a1464a87..e84b4b7f7a 100644 --- a/source/adapters/cuda/context.hpp +++ b/source/adapters/cuda/context.hpp @@ -77,8 +77,9 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// static ur_result_t -CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, - umf_memory_provider_handle_t *MemoryProviderHost) { +CreateHostMemoryProviderPool(ur_device_handle_t_ *DeviceHandle, + umf_memory_provider_handle_t *MemoryProviderHost, + umf_memory_pool_handle_t *MemoryPoolHost) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; *MemoryProviderHost = nullptr; @@ -91,10 +92,20 @@ CreateHostMemoryProvider(ur_device_handle_t_ *DeviceHandle, umf::cuda_params_unique_handle_t CUMemoryProviderParamsUnique( CUMemoryProviderParams, umfCUDAMemoryProviderParamsDestroy); - // create UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), 0 /* cuDevice */, context, - UMF_MEMORY_TYPE_HOST, MemoryProviderHost); + UmfResult = umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + 0 /* cuDevice */, context, + UMF_MEMORY_TYPE_HOST); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory provider and pool for the host memory + // (UMF_MEMORY_TYPE_HOST) + 
UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + MemoryProviderHost); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfPoolCreate(umfProxyPoolOps(), *MemoryProviderHost, nullptr, 0, + MemoryPoolHost); UMF_RETURN_UR_ERROR(UmfResult); return UR_RESULT_SUCCESS; @@ -112,8 +123,10 @@ struct ur_context_handle_t_ { std::vector Devices; std::atomic_uint32_t RefCount; - // UMF CUDA memory provider for the host memory (UMF_MEMORY_TYPE_HOST) + // UMF CUDA memory provider and pool for the host memory + // (UMF_MEMORY_TYPE_HOST) umf_memory_provider_handle_t MemoryProviderHost = nullptr; + umf_memory_pool_handle_t MemoryPoolHost = nullptr; ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices) : Devices{Devs, Devs + NumDevices}, RefCount{1} { @@ -124,10 +137,14 @@ struct ur_context_handle_t_ { // Create UMF CUDA memory provider for the host memory // (UMF_MEMORY_TYPE_HOST) from any device (Devices[0] is used here, because // it is guaranteed to exist). - UR_CHECK_ERROR(CreateHostMemoryProvider(Devices[0], &MemoryProviderHost)); + UR_CHECK_ERROR(CreateHostMemoryProviderPool(Devices[0], &MemoryProviderHost, + &MemoryPoolHost)); }; ~ur_context_handle_t_() { + if (MemoryPoolHost) { + umfPoolDestroy(MemoryPoolHost); + } if (MemoryProviderHost) { umfMemoryProviderDestroy(MemoryProviderHost); } diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index eaf4ba6765..e94291367b 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -11,6 +11,7 @@ #include +#include #include #include "common.hpp" @@ -84,9 +85,17 @@ struct ur_device_handle_t_ { MemoryProviderDevice = nullptr; MemoryProviderShared = nullptr; + MemoryPoolDevice = nullptr; + MemoryPoolShared = nullptr; } ~ur_device_handle_t_() { + if (MemoryPoolDevice) { + umfPoolDestroy(MemoryPoolDevice); + } + if (MemoryPoolShared) { + umfPoolDestroy(MemoryPoolShared); + } if (MemoryProviderDevice) { umfMemoryProviderDestroy(MemoryProviderDevice); } @@ -131,11 +140,15 @@ struct ur_device_handle_t_ { // bookkeeping for mipmappedArray leaks in Mapping external Memory std::map ChildCuarrayFromMipmapMap; - // UMF CUDA memory provider for the device memory (UMF_MEMORY_TYPE_DEVICE) + // UMF CUDA memory provider and pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) umf_memory_provider_handle_t MemoryProviderDevice; + umf_memory_pool_handle_t MemoryPoolDevice; - // UMF CUDA memory provider for the shared memory (UMF_MEMORY_TYPE_SHARED) + // UMF CUDA memory provider and pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) umf_memory_provider_handle_t MemoryProviderShared; + umf_memory_pool_handle_t MemoryPoolShared; }; int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute); diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 651fe0f43d..6e68275f3a 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -50,8 +50,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, size, - 0, &HostPtr)); + HostPtr = umfPoolMalloc(hContext->MemoryPoolHost, size); + UMF_CHECK_PTR(HostPtr); AllocMode = BufferMem::AllocMode::AllocHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { AllocMode = 
BufferMem::AllocMode::CopyIn; @@ -442,8 +442,8 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, CU_MEMHOSTALLOC_DEVICEMAP)); UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); } else { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hDevice->MemoryProviderDevice, - Buffer.Size, 0, (void **)&DevPtr)); + *(void **)&DevPtr = umfPoolMalloc(hDevice->MemoryPoolDevice, Buffer.Size); + UMF_CHECK_PTR(*(void **)&DevPtr); } } else { CUarray ImageArray{}; diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 6dcaa28414..f0fc14f864 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -158,7 +158,7 @@ struct BufferMem { case AllocMode::Classic: for (auto &DevPtr : Ptrs) { if (DevPtr != native_type{0}) { - UR_CHECK_ERROR(cuMemFree(DevPtr)); + UMF_CHECK_ERROR(umfFree((void *)DevPtr)); } } break; @@ -166,7 +166,7 @@ struct BufferMem { UR_CHECK_ERROR(cuMemHostUnregister(HostPtr)); break; case AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(HostPtr)); + UMF_CHECK_ERROR(umfFree((void *)HostPtr)); } return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index d53a027160..ac66c39afb 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -20,7 +20,7 @@ #include static ur_result_t -CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { +CreateDeviceMemoryProvidersPools(ur_platform_handle_t_ *Platform) { umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams = nullptr; umf_result_t UmfResult = @@ -37,16 +37,40 @@ CreateDeviceMemoryProviders(ur_platform_handle_t_ *Platform) { // create UMF CUDA memory provider for the device memory // (UMF_MEMORY_TYPE_DEVICE) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_DEVICE, &device_handle->MemoryProviderDevice); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_DEVICE); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderDevice); UMF_RETURN_UR_ERROR(UmfResult); // create UMF CUDA memory provider for the shared memory // (UMF_MEMORY_TYPE_SHARED) - UmfResult = umf::createMemoryProvider( - CUMemoryProviderParamsUnique.get(), device, context, - UMF_MEMORY_TYPE_SHARED, &device_handle->MemoryProviderShared); + UmfResult = + umf::setCUMemoryProviderParams(CUMemoryProviderParamsUnique.get(), + device, context, UMF_MEMORY_TYPE_SHARED); + UMF_RETURN_UR_ERROR(UmfResult); + + UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), + CUMemoryProviderParamsUnique.get(), + &device_handle->MemoryProviderShared); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the device memory + // (UMF_MEMORY_TYPE_DEVICE) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderDevice, + nullptr, 0, &device_handle->MemoryPoolDevice); + UMF_RETURN_UR_ERROR(UmfResult); + + // create UMF CUDA memory pool for the shared memory + // (UMF_MEMORY_TYPE_SHARED) + UmfResult = + umfPoolCreate(umfProxyPoolOps(), device_handle->MemoryProviderShared, + nullptr, 0, &device_handle->MemoryPoolShared); UMF_RETURN_UR_ERROR(UmfResult); } @@ -134,7 +158,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, static_cast(i)}); } - UR_CHECK_ERROR(CreateDeviceMemoryProviders(&Platform)); + 
UR_CHECK_ERROR(CreateDeviceMemoryProvidersPools(&Platform)); } catch (const std::bad_alloc &) { // Signal out-of-memory situation for (int i = 0; i < NumDevices; ++i) { diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index e40927b7a8..5d2d43442d 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -102,54 +102,12 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t hContext, void *Pointer) { - ur_result_t Result = UR_RESULT_SUCCESS; - try { - unsigned int IsManaged; - unsigned int Type; - unsigned int DeviceOrdinal; - const int NumAttributes = 3; - void *AttributeValues[NumAttributes] = {&IsManaged, &Type, &DeviceOrdinal}; - - CUpointer_attribute Attributes[NumAttributes] = { - CU_POINTER_ATTRIBUTE_IS_MANAGED, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL}; - UR_CHECK_ERROR(cuPointerGetAttributes( - NumAttributes, Attributes, AttributeValues, (CUdeviceptr)Pointer)); - UR_ASSERT(Type == CU_MEMORYTYPE_DEVICE || Type == CU_MEMORYTYPE_HOST, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - - std::vector ContextDevices = hContext->getDevices(); - ur_platform_handle_t Platform = ContextDevices[0]->getPlatform(); - unsigned int NumDevices = Platform->Devices.size(); - UR_ASSERT(DeviceOrdinal < NumDevices, UR_RESULT_ERROR_INVALID_DEVICE); - - ur_device_handle_t Device = Platform->Devices[DeviceOrdinal].get(); - umf_memory_provider_handle_t MemoryProvider; - - if (IsManaged) { - MemoryProvider = Device->MemoryProviderShared; - } else if (Type == CU_MEMORYTYPE_DEVICE) { - MemoryProvider = Device->MemoryProviderDevice; - } else { - MemoryProvider = hContext->MemoryProviderHost; - } - - UMF_CHECK_ERROR(umfMemoryProviderFree(MemoryProvider, Pointer, - 0 /* size is unknown */)); - } catch (ur_result_t Err) { - Result = Err; - } - return Result; -} - /// USM: Frees the given USM pointer associated with the context. 
/// UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, void *pMem) { - if (auto Pool = umfPoolByPtr(pMem)) - return umf::umf2urResult(umfPoolFree(Pool, pMem)); - return USMFreeImpl(hContext, pMem); + (void)hContext; // unused + return umf::umf2urResult(umfFree(pMem)); } ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, @@ -158,8 +116,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderDevice, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolDevice, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -180,8 +138,8 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, uint32_t Alignment) { try { ScopedContext Active(Device); - UMF_CHECK_ERROR(umfMemoryProviderAlloc(Device->MemoryProviderShared, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(Device->MemoryPoolShared, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -199,8 +157,8 @@ ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t hContext, ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { - UMF_CHECK_ERROR(umfMemoryProviderAlloc(hContext->MemoryProviderHost, Size, - Alignment, ResultPtr)); + *ResultPtr = umfPoolMalloc(hContext->MemoryPoolHost, Size); + UMF_CHECK_PTR(*ResultPtr); } catch (ur_result_t Err) { return Err; } @@ -326,73 +284,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -umf_result_t USMMemoryProvider::initialize(ur_context_handle_t Ctx, - ur_device_handle_t Dev) { - Context = Ctx; - Device = Dev; - // There isn't a way to query this in cuda, and there isn't much info on - // cuda's approach to alignment or transfer granularity between host and - // device. Within UMF this is only used to influence alignment, and since we - // discard that in our alloc implementations it seems we can safely ignore - // this as well, for now. 
- MinPageSize = 0; - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::alloc(size_t Size, size_t Align, - void **Ptr) { - auto Res = allocateImpl(Ptr, Size, Align); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -enum umf_result_t USMMemoryProvider::free(void *Ptr, size_t Size) { - (void)Size; - - auto Res = USMFreeImpl(Context, Ptr); - if (Res != UR_RESULT_SUCCESS) { - getLastStatusRef() = Res; - return UMF_RESULT_ERROR_MEMORY_PROVIDER_SPECIFIC; - } - - return UMF_RESULT_SUCCESS; -} - -void USMMemoryProvider::get_last_native_error(const char **ErrMsg, - int32_t *ErrCode) { - (void)ErrMsg; - *ErrCode = static_cast(getLastStatusRef()); -} - -umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { - (void)Ptr; - *PageSize = MinPageSize; - - return UMF_RESULT_SUCCESS; -} - -ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, - /*device flags*/ 0, Size, Alignment); -} - -ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, - Alignment); -} - -ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); -} - ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, ur_usm_pool_desc_t *PoolDesc) : Context{Context} { @@ -416,36 +307,28 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, pNext = BaseDesc->pNext; } - auto MemProvider = - umf::memoryProviderMakeUnique(Context, nullptr) - .second; - auto UmfHostParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]); - HostMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfHostParamsHandle.get()) - .second; + HostMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Context->MemoryProviderHost, + UmfHostParamsHandle.get()) + .second; for (const auto &Device : Context->getDevices()) { - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; auto UmfDeviceParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]); - DeviceMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfDeviceParamsHandle.get()) - .second; - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; + DeviceMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderDevice, + UmfDeviceParamsHandle.get()) + .second; + auto UmfSharedParamsHandle = getUmfParamsHandle( DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]); - SharedMemPool = - umf::poolMakeUniqueFromOps(umfDisjointPoolOps(), std::move(MemProvider), - UmfSharedParamsHandle.get()) - .second; + SharedMemPool = umf::poolMakeUniqueFromOpsProviderHandle( + umfDisjointPoolOps(), Device->MemoryProviderShared, + UmfSharedParamsHandle.get()) + .second; + Context->addPool(this); } } diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index 7c6a2ea666..8258043d2b 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -48,80 +48,6 @@ class UsmAllocationException { 
ur_result_t getError() const { return Error; } }; -// Implements memory allocation via driver API for USM allocator interface. -class USMMemoryProvider { -private: - ur_result_t &getLastStatusRef() { - static thread_local ur_result_t LastStatus = UR_RESULT_SUCCESS; - return LastStatus; - } - -protected: - ur_context_handle_t Context; - ur_device_handle_t Device; - size_t MinPageSize; - - // Internal allocation routine which must be implemented for each allocation - // type - virtual ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) = 0; - -public: - umf_result_t initialize(ur_context_handle_t Ctx, ur_device_handle_t Dev); - umf_result_t alloc(size_t Size, size_t Align, void **Ptr); - umf_result_t free(void *Ptr, size_t Size); - void get_last_native_error(const char **ErrMsg, int32_t *ErrCode); - umf_result_t get_min_page_size(void *, size_t *); - umf_result_t get_recommended_page_size(size_t, size_t *) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_lazy(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t purge_force(void *, size_t) { - return UMF_RESULT_ERROR_NOT_SUPPORTED; - }; - umf_result_t allocation_merge(void *, void *, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - umf_result_t allocation_split(void *, size_t, size_t) { - return UMF_RESULT_ERROR_UNKNOWN; - } - virtual const char *get_name() = 0; - - virtual ~USMMemoryProvider() = default; -}; - -// Allocation routines for shared memory type -class USMSharedMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for device memory type -class USMDeviceMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - -// Allocation routines for host memory type -class USMHostMemoryProvider final : public USMMemoryProvider { -public: - const char *get_name() override { return "USMSharedMemoryProvider"; } - -protected: - ur_result_t allocateImpl(void **ResultPtr, size_t Size, - uint32_t Alignment) override; -}; - ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, ur_usm_device_mem_flags_t Flags, size_t Size, diff --git a/source/common/umf_helpers.hpp b/source/common/umf_helpers.hpp index 2433560a39..12432bd933 100644 --- a/source/common/umf_helpers.hpp +++ b/source/common/umf_helpers.hpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include "logger/ur_logger.hpp" @@ -30,11 +30,10 @@ #define UMF_CHECK_ERROR(UmfResult) UR_CHECK_ERROR(umf::umf2urResult(UmfResult)); -#define UMF_RETURN_UMF_ERROR(UmfResult) \ +#define UMF_CHECK_PTR(ptr) \ do { \ - umf_result_t UmfResult_ = (UmfResult); \ - if (UmfResult_ != UMF_RESULT_SUCCESS) { \ - return UmfResult_; \ + if ((ptr) == nullptr) { \ + UR_CHECK_ERROR(UR_RESULT_ERROR_OUT_OF_HOST_MEMORY); \ } \ } while (0) @@ -54,9 +53,6 @@ using pool_unique_handle_t = using provider_unique_handle_t = std::unique_ptr>; -using cuda_params_unique_handle_t = std::unique_ptr< - umf_cuda_memory_provider_params_t, - std::function>; #define DEFINE_CHECK_OP(op) \ template class HAS_OP_##op { \ @@ -243,6 +239,21 @@ static inline auto poolMakeUniqueFromOps(umf_memory_pool_ops_t *ops, 
UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; } +static inline auto +poolMakeUniqueFromOpsProviderHandle(umf_memory_pool_ops_t *ops, + umf_memory_provider_handle_t provider, + void *params) { + umf_memory_pool_handle_t hPool; + auto ret = umfPoolCreate(ops, provider, params, 0, &hPool); + if (ret != UMF_RESULT_SUCCESS) { + return std::pair{ + ret, pool_unique_handle_t(nullptr, nullptr)}; + } + + return std::pair{ + UMF_RESULT_SUCCESS, pool_unique_handle_t(hPool, umfPoolDestroy)}; +} + static inline auto providerMakeUniqueFromOps(umf_memory_provider_ops_t *ops, void *params) { umf_memory_provider_handle_t hProvider; @@ -301,33 +312,6 @@ inline ur_result_t umf2urResult(umf_result_t umfResult) { }; } -inline umf_result_t createMemoryProvider( - umf_cuda_memory_provider_params_handle_t CUMemoryProviderParams, - int cuDevice, void *cuContext, umf_usm_memory_type_t memType, - umf_memory_provider_handle_t *provider) { - - umf_result_t UmfResult = - umfCUDAMemoryProviderParamsSetContext(CUMemoryProviderParams, cuContext); - UMF_RETURN_UMF_ERROR(UmfResult); - - UmfResult = - umfCUDAMemoryProviderParamsSetDevice(CUMemoryProviderParams, cuDevice); - UMF_RETURN_UMF_ERROR(UmfResult); - - UmfResult = - umfCUDAMemoryProviderParamsSetMemoryType(CUMemoryProviderParams, memType); - UMF_RETURN_UMF_ERROR(UmfResult); - - umf_memory_provider_handle_t umfCUDAprovider = nullptr; - UmfResult = umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), - CUMemoryProviderParams, &umfCUDAprovider); - UMF_RETURN_UMF_ERROR(UmfResult); - - *provider = umfCUDAprovider; - - return UMF_RESULT_SUCCESS; -} - } // namespace umf #endif /* UMF_HELPERS_H */
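Taken together, the allocation path this final patch switches to reduces to
the following condensed sketch. It is assembled from the UMF calls visible in
the diffs above plus the params-create call they imply; error handling is
elided, and `cuDevice`/`cuContext` stand in for a valid CUDA device ordinal
and context:

```cpp
// Provider params -> CUDA memory provider -> proxy pool.
umf_cuda_memory_provider_params_handle_t params = nullptr;
umfCUDAMemoryProviderParamsCreate(&params);
umfCUDAMemoryProviderParamsSetContext(params, cuContext);
umfCUDAMemoryProviderParamsSetDevice(params, cuDevice);
umfCUDAMemoryProviderParamsSetMemoryType(params, UMF_MEMORY_TYPE_DEVICE);

umf_memory_provider_handle_t provider = nullptr;
umfMemoryProviderCreate(umfCUDAMemoryProviderOps(), params, &provider);
umfCUDAMemoryProviderParamsDestroy(params);

// The proxy pool is a thin pass-through that adds allocation tracking,
// which is what lets urUSMFree collapse to a plain umfFree(pMem).
umf_memory_pool_handle_t pool = nullptr;
umfPoolCreate(umfProxyPoolOps(), provider, nullptr, 0, &pool);

void *ptr = umfPoolMalloc(pool, 1024); // device USM allocation
umfFree(ptr); // tracked free: UMF looks up the owning pool itself

umfPoolDestroy(pool);
umfMemoryProviderDestroy(provider);
```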