
Commit a239583

[L0] Add initial USM alloc enqueue API
Co-authored-by: Michał Staniewski <michal.staniewski@intel.com>
1 parent cb5651f commit a239583

8 files changed: +458 −9 lines changed

include/ur_api.h

Lines changed: 4 additions & 4 deletions
@@ -5784,10 +5784,10 @@ typedef enum ur_command_t {
     UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp
     UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002,       ///< Event created by ::urEnqueueTimestampRecordingExp
     UR_COMMAND_ENQUEUE_NATIVE_EXP = 0x2004,            ///< Event created by ::urEnqueueNativeCommandExp
-    UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP = 0x2008,  ///< Event created by ::urEnqueueDeviceAllocExp
-    UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP = 0x2010,  ///< Event created by ::urEnqueueSharedAllocExp
-    UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP = 0x2011,    ///< Event created by ::urEnqueueHostAllocExp
-    UR_COMMAND_ENQUEUE_USM_FREE_EXP = 0x2012,          ///< Event created by ::urEnqueueFreeExp
+    UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP = 0x2008,  ///< Event created by ::urEnqueueUSMDeviceAllocExp
+    UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP = 0x2010,  ///< Event created by ::urEnqueueUSMSharedAllocExp
+    UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP = 0x2011,    ///< Event created by ::urEnqueueUSMHostAllocExp
+    UR_COMMAND_ENQUEUE_USM_FREE_EXP = 0x2012,          ///< Event created by ::urEnqueueUSMFreeExp
     /// @cond
     UR_COMMAND_FORCE_UINT32 = 0x7fffffff
     /// @endcond
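(For orientation, the new etors are ordinary ::ur_command_t values, so a caller can identify which async USM operation produced an event through the usual event query. The snippet below is a minimal caller-side sketch that is not part of this commit; it assumes the existing core entry point ::urEventGetInfo and the ::UR_EVENT_INFO_COMMAND_TYPE query.)

// Sketch (not in this commit): classify an event by the command that created it.
#include <ur_api.h>

static bool isAsyncUSMCommand(ur_event_handle_t Event) {
  ur_command_t Cmd;
  if (urEventGetInfo(Event, UR_EVENT_INFO_COMMAND_TYPE, sizeof(Cmd), &Cmd,
                     nullptr) != UR_RESULT_SUCCESS)
    return false;
  return Cmd == UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP ||
         Cmd == UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP ||
         Cmd == UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP ||
         Cmd == UR_COMMAND_ENQUEUE_USM_FREE_EXP;
}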

scripts/core/exp-async-alloc.yml

Lines changed: 4 additions & 4 deletions
@@ -41,16 +41,16 @@ name: $x_command_t
 etors:
     - name: ENQUEUE_USM_DEVICE_ALLOC_EXP
       value: "0x2008"
-      desc: Event created by $xEnqueueDeviceAllocExp
+      desc: Event created by $xEnqueueUSMDeviceAllocExp
     - name: ENQUEUE_USM_SHARED_ALLOC_EXP
       value: "0x2010"
-      desc: Event created by $xEnqueueSharedAllocExp
+      desc: Event created by $xEnqueueUSMSharedAllocExp
     - name: ENQUEUE_USM_HOST_ALLOC_EXP
       value: "0x2011"
-      desc: Event created by $xEnqueueHostAllocExp
+      desc: Event created by $xEnqueueUSMHostAllocExp
     - name: ENQUEUE_USM_FREE_EXP
       value: "0x2012"
-      desc: Event created by $xEnqueueFreeExp
+      desc: Event created by $xEnqueueUSMFreeExp
 
 --- #--------------------------------------------------------------------------
 type: enum

source/adapters/level_zero/context.cpp

Lines changed: 43 additions & 0 deletions
@@ -201,6 +201,17 @@ ur_result_t ur_context_handle_t_::initialize() {
                             .Configs[usm::DisjointPoolMemType::Device])
                             .second));
 
+    MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
+                      reinterpret_cast<ur_context_handle_t>(this), Device)
+                      .second;
+    AsyncDeviceMemPools.emplace(
+        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
+        std::make_tuple(umf::poolMakeUniqueFromOps(
+                            umfDisjointPoolOps(), std::move(MemProvider),
+                            &DisjointPoolConfigInstance
+                                 .Configs[usm::DisjointPoolMemType::Device])
+                            .second));
+
     MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
                       reinterpret_cast<ur_context_handle_t>(this), Device)
                       .second;
@@ -212,6 +223,17 @@ ur_result_t ur_context_handle_t_::initialize() {
                             .Configs[usm::DisjointPoolMemType::Shared])
                             .second));
 
+    MemProvider = umf::memoryProviderMakeUnique<L0SharedMemoryProvider>(
+                      reinterpret_cast<ur_context_handle_t>(this), Device)
+                      .second;
+    AsyncSharedMemPools.emplace(
+        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
+        std::make_tuple(umf::poolMakeUniqueFromOps(
+                            umfDisjointPoolOps(), std::move(MemProvider),
+                            &DisjointPoolConfigInstance
+                                 .Configs[usm::DisjointPoolMemType::Shared])
+                            .second));
+
     MemProvider = umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
                       reinterpret_cast<ur_context_handle_t>(this), Device)
                       .second;
@@ -224,6 +246,18 @@ ur_result_t ur_context_handle_t_::initialize() {
                             .Configs[usm::DisjointPoolMemType::SharedReadOnly])
                             .second));
 
+    MemProvider = umf::memoryProviderMakeUnique<L0SharedReadOnlyMemoryProvider>(
+                      reinterpret_cast<ur_context_handle_t>(this), Device)
+                      .second;
+    AsyncSharedReadOnlyMemPools.emplace(
+        std::piecewise_construct, std::make_tuple(Device->ZeDevice),
+        std::make_tuple(
+            umf::poolMakeUniqueFromOps(
+                umfDisjointPoolOps(), std::move(MemProvider),
+                &DisjointPoolConfigInstance
+                    .Configs[usm::DisjointPoolMemType::SharedReadOnly])
+                .second));
+
     MemProvider = umf::memoryProviderMakeUnique<L0DeviceMemoryProvider>(
                       reinterpret_cast<ur_context_handle_t>(this), Device)
                       .second;
@@ -276,6 +310,15 @@ ur_result_t ur_context_handle_t_::initialize() {
           &DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host])
           .second;
 
+  MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
+                    reinterpret_cast<ur_context_handle_t>(this), nullptr)
+                    .second;
+  AsyncHostMemPool =
+      umf::poolMakeUniqueFromOps(
+          umfDisjointPoolOps(), std::move(MemProvider),
+          &DisjointPoolConfigInstance.Configs[usm::DisjointPoolMemType::Host])
+          .second;
+
   MemProvider = umf::memoryProviderMakeUnique<L0HostMemoryProvider>(
                     reinterpret_cast<ur_context_handle_t>(this), nullptr)
                     .second;
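(Aside, not part of the commit: the emplace calls above use std::piecewise_construct because umf::pool_unique_handle_t is a move-only handle that has to be constructed in place inside the map's key/value pair. A self-contained sketch of the same pattern, with std::unique_ptr standing in for the pool handle and an int key standing in for ze_device_handle_t:)

// Sketch of the piecewise-construct emplace pattern used for the Async*MemPools
// maps above; std::unique_ptr<int> stands in for umf::pool_unique_handle_t.
#include <memory>
#include <tuple>
#include <unordered_map>

int main() {
  std::unordered_map<int, std::unique_ptr<int>> Pools;
  int DeviceKey = 0; // stand-in for a ze_device_handle_t
  // Construct the mapped unique_ptr in place; no copy of the value is needed.
  Pools.emplace(std::piecewise_construct, std::make_tuple(DeviceKey),
                std::make_tuple(std::make_unique<int>(42)));
  return Pools.count(DeviceKey) == 1 ? 0 : 1;
}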

source/adapters/level_zero/context.hpp

Lines changed: 9 additions & 0 deletions
@@ -124,6 +124,15 @@ struct ur_context_handle_t_ : _ur_object {
       SharedReadOnlyMemProxyPools;
   umf::pool_unique_handle_t HostMemProxyPool;
 
+  // USM pools for async allocations.
+  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
+      AsyncDeviceMemPools;
+  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
+      AsyncSharedMemPools;
+  std::unordered_map<ze_device_handle_t, umf::pool_unique_handle_t>
+      AsyncSharedReadOnlyMemPools;
+  umf::pool_unique_handle_t AsyncHostMemPool;
+
   // Map associating pools created with urUsmPoolCreate and internal pools
   std::list<ur_usm_pool_handle_t> UsmPoolHandles{};
source/adapters/level_zero/queue.cpp

Lines changed: 1 addition & 1 deletion (whitespace-only change to a comment line; the difference is not visible in this view)
@@ -1327,7 +1327,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList,
       Device->Platform->ContextsMutex, std::defer_lock);
 
   if (IndirectAccessTrackingEnabled) {
-    // We are going to submit kernels for execution. If indirect access flag is
+    // We are going to submit kernels for execution. If indirect access flag is
     // set for a kernel then we need to make a snapshot of existing memory
     // allocations in all contexts in the platform. We need to lock the mutex
     // guarding the list of contexts in the platform to prevent creation of new

source/adapters/level_zero/usm.cpp

Lines changed: 249 additions & 0 deletions
@@ -787,6 +787,255 @@ ur_result_t urUSMReleaseExp(ur_context_handle_t Context, void *HostPtr) {
       Context->getPlatform()->ZeDriverHandleExpTranslated, HostPtr);
   return UR_RESULT_SUCCESS;
 }
+
+enum class USMAllocType { Host = 0, Device = 1, Shared = 2 };
+
+static ur_result_t USMAllocHelper(ur_context_handle_t Context,
+                                  ur_device_handle_t Device, size_t Size,
+                                  void **RetMem, USMAllocType Type) {
+  auto &Platform = Device->Platform;
+
+  // TODO: Should alignemnt be passed in 'ur_exp_async_usm_alloc_properties_t'?
+  uint32_t Alignment = 0;
+
+  std::shared_lock<ur_shared_mutex> ContextLock(Context->Mutex,
+                                                std::defer_lock);
+  std::unique_lock<ur_shared_mutex> IndirectAccessTrackingLock(
+      Platform->ContextsMutex, std::defer_lock);
+  if (IndirectAccessTrackingEnabled) {
+    IndirectAccessTrackingLock.lock();
+    UR_CALL(ur::level_zero::urContextRetain(Context));
+  } else {
+    ContextLock.lock();
+  }
+
+  umf_memory_pool_handle_t hPoolInternal = nullptr;
+  switch (Type) {
+  case USMAllocType::Host:
+    hPoolInternal = Context->AsyncHostMemPool.get();
+    break;
+  case USMAllocType::Device: {
+    auto It = Context->AsyncDeviceMemPools.find(Device->ZeDevice);
+    if (It == Context->AsyncDeviceMemPools.end()) {
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    }
+    hPoolInternal = It->second.get();
+  } break;
+  case USMAllocType::Shared: {
+    auto It = Context->AsyncSharedMemPools.find(Device->ZeDevice);
+    if (It == Context->AsyncSharedMemPools.end()) {
+      return UR_RESULT_ERROR_INVALID_VALUE;
+    }
+    hPoolInternal = It->second.get();
+  } break;
+  };
+
+  *RetMem = umfPoolAlignedMalloc(hPoolInternal, Size, Alignment);
+  if (*RetMem == nullptr) {
+    auto umfRet = umfPoolGetLastAllocationError(hPoolInternal);
+    return umf2urResult(umfRet);
+  }
+
+  if (IndirectAccessTrackingEnabled) {
+    // Keep track of all memory allocations in the context
+    Context->MemAllocs.emplace(std::piecewise_construct,
+                               std::forward_as_tuple(*RetMem),
+                               std::forward_as_tuple(Context));
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+static ur_result_t enqueueUSMAllocHelper(
+    ur_queue_handle_t Queue, ur_usm_pool_handle_t Pool, const size_t Size,
+    const ur_exp_async_usm_alloc_properties_t *Properties,
+    uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList,
+    void **RetMem, ur_event_handle_t *OutEvent, USMAllocType Type) {
+  std::ignore = Pool;
+  std::ignore = Properties;
+
+  std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
+
+  bool UseCopyEngine = false;
+  _ur_ze_event_list_t TmpWaitList;
+  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+      NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
+
+  // Get a new command list to be used on this call
+  ur_command_list_ptr_t CommandList{};
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
+
+  ze_event_handle_t ZeEvent = nullptr;
+  ur_event_handle_t InternalEvent{};
+  bool IsInternal = OutEvent == nullptr;
+  ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
+
+  ur_command_t CommandType = UR_COMMAND_FORCE_UINT32;
+  switch (Type) {
+  case USMAllocType::Host:
+    CommandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
+    break;
+  case USMAllocType::Device:
+    CommandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
+    break;
+  case USMAllocType::Shared:
+    CommandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
+    break;
+  }
+  UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList,
+                                       IsInternal, false));
+  ZeEvent = (*Event)->ZeEvent;
+  (*Event)->WaitList = TmpWaitList;
+
+  // Allocate USM memory
+  auto Ret = USMAllocHelper(Queue->Context, Queue->Device, Size, RetMem, Type);
+  if (Ret) {
+    return Ret;
+  }
+
+  // Signal that USM allocation event was finished
+  ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, ZeEvent));
+
+  UR_CALL(Queue->executeCommandList(CommandList, false));
+
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t urEnqueueUSMDeviceAllocExp(
+    ur_queue_handle_t Queue,   ///< [in] handle of the queue object
+    ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
+    const size_t Size, ///< [in] minimum size in bytes of the USM memory object
+                       ///< to be allocated
+    const ur_exp_async_usm_alloc_properties_t
+        *Properties, ///< [in][optional] pointer to the enqueue async alloc
+                     ///< properties
+    uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t
+        *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+                        ///< pointer to a list of events that must be complete
+                        ///< before the kernel execution. If nullptr, the
+                        ///< numEventsInWaitList must be 0, indicating no wait
+                        ///< events.
+    void **Mem, ///< [out] pointer to USM memory object
+    ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
+                                ///< identifies the async alloc
+) {
+  return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
+                               NumEventsInWaitList, EventWaitList, Mem,
+                               OutEvent, USMAllocType::Device);
+}
+
+ur_result_t urEnqueueUSMSharedAllocExp(
+    ur_queue_handle_t Queue,   ///< [in] handle of the queue object
+    ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
+    const size_t Size, ///< [in] minimum size in bytes of the USM memory object
+                       ///< to be allocated
+    const ur_exp_async_usm_alloc_properties_t
+        *Properties, ///< [in][optional] pointer to the enqueue async alloc
+                     ///< properties
+    uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t
+        *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+                        ///< pointer to a list of events that must be complete
+                        ///< before the kernel execution. If nullptr, the
+                        ///< numEventsInWaitList must be 0, indicating no wait
+                        ///< events.
+    void **Mem, ///< [out] pointer to USM memory object
+    ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
+                                ///< identifies the async alloc
+) {
+  return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
+                               NumEventsInWaitList, EventWaitList, Mem,
+                               OutEvent, USMAllocType::Shared);
+}
+
+ur_result_t urEnqueueUSMHostAllocExp(
+    ur_queue_handle_t Queue,   ///< [in] handle of the queue object
+    ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
+    const size_t Size, ///< [in] minimum size in bytes of the USM memory object
+                       ///< to be allocated
+    const ur_exp_async_usm_alloc_properties_t
+        *Properties, ///< [in][optional] pointer to the enqueue async alloc
+                     ///< properties
+    uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t
+        *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+                        ///< pointer to a list of events that must be complete
+                        ///< before the kernel execution. If nullptr, the
+                        ///< numEventsInWaitList must be 0, indicating no wait
+                        ///< events.
+    void **Mem, ///< [out] pointer to USM memory object
+    ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
+                                ///< identifies the async alloc
+) {
+  return enqueueUSMAllocHelper(Queue, Pool, Size, Properties,
+                               NumEventsInWaitList, EventWaitList, Mem,
+                               OutEvent, USMAllocType::Host);
+}
+
+ur_result_t urEnqueueUSMFreeExp(
+    ur_queue_handle_t Queue,   ///< [in] handle of the queue object
+    ur_usm_pool_handle_t Pool, ///< [in][optional] USM pool descriptor
+    void *Mem,                 ///< [in] pointer to USM memory object
+    uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+    const ur_event_handle_t
+        *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+                        ///< pointer to a list of events that must be complete
+                        ///< before the kernel execution. If nullptr, the
+                        ///< numEventsInWaitList must be 0, indicating no wait
+                        ///< events.
+    ur_event_handle_t *OutEvent ///< [out][optional] return an event object that
+                                ///< identifies the async alloc
+) {
+  std::ignore = Pool;
+
+  std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
+
+  bool UseCopyEngine = false;
+  _ur_ze_event_list_t TmpWaitList;
+  UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+      NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
+
+  // Get a new command list to be used on this call
+  ur_command_list_ptr_t CommandList{};
+  UR_CALL(Queue->Context->getAvailableCommandList(
+      Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
+
+  ze_event_handle_t ZeEvent = nullptr;
+  ur_event_handle_t InternalEvent{};
+  bool IsInternal = OutEvent == nullptr;
+  ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
+
+  UR_CALL(createEventAndAssociateQueue(Queue, Event,
+                                       UR_COMMAND_ENQUEUE_USM_FREE_EXP,
+                                       CommandList, IsInternal, false));
+  ZeEvent = (*Event)->ZeEvent;
+  (*Event)->WaitList = TmpWaitList;
+
+  const auto &ZeCommandList = CommandList->first;
+  const auto &WaitList = (*Event)->WaitList;
+  if (WaitList.Length) {
+    ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
+               (ZeCommandList, WaitList.Length, WaitList.ZeEventList));
+  }
+
+  // Wait for commands execution until USM can be freed
+  UR_CALL(Queue->executeCommandList(CommandList, true)); // Blocking
+
+  // Free USM memory
+  auto Ret = USMFreeHelper(Queue->Context, Mem);
+  if (Ret) {
+    return Ret;
+  }
+
+  // Signal that USM free event was finished
+  ZE2UR_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));
+
+  UR_CALL(Queue->executeCommandList(CommandList, false));
+
+  return UR_RESULT_SUCCESS;
+}
 } // namespace ur::level_zero
 
 static ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Ptr) {
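(Caller-side sketch, not part of this commit: one way the new entry points are expected to be used together, with the allocation event passed as a wait dependency to the enqueued free. It assumes the corresponding declarations are generated into ur_api.h from exp-async-alloc.yml; the helper name and error handling are illustrative, the signatures match the implementations above, and Pool/Properties are passed as nullptr since this initial Level Zero implementation ignores them.)

// Sketch: enqueue a device USM allocation, then enqueue its free so the free
// waits on the allocation event. Work that uses Mem would normally be
// enqueued between the two calls, also waiting on AllocEvent.
#include <ur_api.h>

static ur_result_t allocThenFree(ur_queue_handle_t Queue, size_t Size) {
  void *Mem = nullptr;
  ur_event_handle_t AllocEvent = nullptr;

  ur_result_t Res = urEnqueueUSMDeviceAllocExp(
      Queue, /*Pool=*/nullptr, Size, /*Properties=*/nullptr,
      /*NumEventsInWaitList=*/0, /*EventWaitList=*/nullptr, &Mem, &AllocEvent);
  if (Res != UR_RESULT_SUCCESS)
    return Res;

  // ... enqueue kernels or copies that consume Mem, waiting on AllocEvent ...

  ur_event_handle_t FreeEvent = nullptr;
  return urEnqueueUSMFreeExp(Queue, /*Pool=*/nullptr, Mem,
                             /*NumEventsInWaitList=*/1, &AllocEvent,
                             &FreeEvent);
}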
