Skip to content

Commit e8b7840

Browse files
mfrancepilloiskbenzie
authored andcommitted
Copy command-buffer event timestamps into a dedicated USM memory region.
Get the command-buffer start and end timestamps from this memory. Move events reset from the middle command list to the first to allow the copy of the profiling info in the last command list and relax command list order.
1 parent 2e156f7 commit e8b7840

File tree

3 files changed

+131
-46
lines changed

3 files changed

+131
-46
lines changed

source/adapters/level_zero/command_buffer.cpp

Lines changed: 58 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,13 @@
4545
│ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
4646
└──────────┴────────────────────────────────────────────────┴─────────┘
4747
48-
┌───────────────────┬──────────────────────────────┐
49-
Prefix │Reset signal event │ Barrier waiting on wait event│
50-
└───────────────────┴──────────────────────────────┘
48+
┌───────────────────┬──────────────┐──────────────────────────────┐
49+
Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50+
└───────────────────┴──────────────┘──────────────────────────────┘
5151
5252
┌─────────────────────────────────────────────┐──────────────┐
53-
Suffix │Barrier waiting on sync-point event, │ Reset events
54-
│signalling the UR command-buffer signal event│
53+
Suffix │Barrier waiting on sync-point event, │ Query CMD
54+
│signalling the UR command-buffer signal event│ Timestamps
5555
└─────────────────────────────────────────────┘──────────────┘
5656
5757
For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
433433

434434
ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
435435
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
436+
// Dependencies between commands are explicitly enforced by sync points when
437+
// enqueuing. Consequently, relax the command ordering in the command list
438+
// can enable the backend to further optimize the workload
439+
ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
436440

437441
ze_command_list_handle_t ZeCommandList;
438442
// TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -499,13 +503,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
499503
ZE2UR_CALL(zeCommandListAppendEventReset,
500504
(CommandBuffer->ZeCommandList, CommandBuffer->WaitEvent->ZeEvent));
501505

502-
// Reset the L0 events we use for command-buffer internal sync-points to the
503-
// non-signalled state
504-
for (auto Event : WaitEventList) {
505-
ZE2UR_CALL(zeCommandListAppendEventReset,
506-
(CommandBuffer->ZeCommandList, Event));
507-
}
508-
509506
// Close the command list and have it ready for dispatch.
510507
ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList));
511508
return UR_RESULT_SUCCESS;
@@ -899,14 +896,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
899896
// Create command-list to execute before `CommandListPtr` and will signal
900897
// when `EventWaitList` dependencies are complete.
901898
ur_command_list_ptr_t WaitCommandList{};
899+
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false,
900+
false));
901+
902+
// Create a list of events of all the events that compose the command buffer
903+
// workload.
904+
// This loop also resets the L0 events we use for command-buffer internal
905+
// sync-points to the non-signalled state.
906+
// This is required for multiple submissions.
907+
const size_t NumEvents = CommandBuffer->SyncPoints.size();
908+
std::vector<ze_event_handle_t> WaitEventList{NumEvents};
909+
for (size_t i = 0; i < NumEvents; i++) {
910+
auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent;
911+
WaitEventList[i] = ZeEvent;
912+
ZE2UR_CALL(zeCommandListAppendEventReset,
913+
(WaitCommandList->first, ZeEvent));
914+
}
915+
902916
if (NumEventsInWaitList) {
903917
_ur_ze_event_list_t TmpWaitList;
904918
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
905919
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
906920

907-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
908-
false, false))
909-
910921
// Update the WaitList of the Wait Event
911922
// Events are appended to the WaitList if the WaitList is not empty
912923
if (CommandBuffer->WaitEvent->WaitList.isEmpty())
@@ -919,9 +930,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
919930
CommandBuffer->WaitEvent->WaitList.Length,
920931
CommandBuffer->WaitEvent->WaitList.ZeEventList));
921932
} else {
922-
UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
923-
false, false));
924-
925933
ZE2UR_CALL(zeCommandListAppendSignalEvent,
926934
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent));
927935
}
@@ -943,22 +951,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
943951
&(CommandBuffer->SignalEvent->ZeEvent)));
944952

945953
if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
946-
// We create an additional signal specific to the current execution of the
947-
// CommandBuffer. This signal is needed for profiling the execution time
948-
// of the CommandBuffer. It waits for the WaitEvent to be signaled
949-
// which indicates the start of the CommandBuffer actual execution.
950-
// This event is embedded into the Event return to the user to allow
951-
// the profiling engine to retrieve it.
952-
ur_event_handle_t StartEvent{};
953-
UR_CALL(createEventAndAssociateQueue(
954-
Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
955-
WaitCommandList, false));
956-
957-
ZE2UR_CALL(zeCommandListAppendBarrier,
958-
(WaitCommandList->first, StartEvent->ZeEvent, 1,
959-
&(CommandBuffer->WaitEvent->ZeEvent)));
960-
961-
RetEvent->CommandData = StartEvent;
954+
// Multiple submissions of a command buffer implies that we need to save
955+
// the event timestamps before resubmiting the command buffer. We
956+
// therefore copy the these timestamps in a dedicated USM memory section
957+
// before completing the command buffer execution, and then attach this
958+
// memory to the event returned to users to allow to allow the profiling
959+
// engine to recover these timestamps.
960+
ur_usm_desc_t USMDesc{};
961+
ur_usm_device_desc_t UsmDeviceDesc{};
962+
UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC;
963+
ur_usm_host_desc_t UsmHostDesc{};
964+
UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC;
965+
UsmDeviceDesc.pNext = &UsmHostDesc;
966+
USMDesc.pNext = &UsmDeviceDesc;
967+
USMDesc.align = 4; // 4byte-aligned
968+
969+
size_t Size = WaitEventList.size() * sizeof(ze_kernel_timestamp_result_t);
970+
971+
struct command_buffer_profiling_t *Profiling =
972+
new command_buffer_profiling_t();
973+
974+
Profiling->NumEvents = WaitEventList.size();
975+
976+
urUSMSharedAlloc(RetEvent->Context, CommandBuffer->Device, &USMDesc,
977+
nullptr, Size, (void **)&Profiling->Timestamps);
978+
979+
ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps,
980+
(SignalCommandList->first, WaitEventList.size(),
981+
WaitEventList.data(), Profiling->Timestamps, 0,
982+
RetEvent->ZeEvent, 1,
983+
&(CommandBuffer->SignalEvent->ZeEvent)));
984+
985+
RetEvent->CommandData = static_cast<void *>(Profiling);
962986
}
963987
}
964988

source/adapters/level_zero/command_buffer.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@
1919
#include "context.hpp"
2020
#include "queue.hpp"
2121

22+
struct command_buffer_profiling_t {
23+
ur_exp_command_buffer_sync_point_t NumEvents;
24+
ze_kernel_timestamp_result_t *Timestamps;
25+
};
26+
2227
struct ur_exp_command_buffer_handle_t_ : public _ur_object {
2328
ur_exp_command_buffer_handle_t_(ur_context_handle_t Context,
2429
ur_device_handle_t Device,

source/adapters/level_zero/event.cpp

Lines changed: 68 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -456,16 +456,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
456456
) {
457457
std::shared_lock<ur_shared_mutex> EventLock(Event->Mutex);
458458

459-
// A Command-buffer consists of three command-lists.
460-
// The start time should therefore be taken from an event associated
461-
// to the first command-list.
462-
if ((Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) &&
463-
(PropName == UR_PROFILING_INFO_COMMAND_START) && (Event->CommandData)) {
464-
auto StartEvent = static_cast<ur_event_handle_t>(Event->CommandData);
465-
return urEventGetProfilingInfo(StartEvent, UR_PROFILING_INFO_COMMAND_END,
466-
PropValueSize, PropValue, PropValueSizeRet);
467-
}
468-
469459
if (Event->UrQueue &&
470460
(Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
471461
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
@@ -482,6 +472,70 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
482472

483473
ze_kernel_timestamp_result_t tsResult;
484474

475+
// A Command-buffer consists of three command-lists for which only a single
476+
// event is returned to users. The actual profiling information related to the
477+
// command-buffer should therefore be extrated from graph events themsleves.
478+
// The timestamps of these events are saved in a memory region attached to
479+
// event usning CommandData field. The timings must therefore be recovered
480+
// from this memory.
481+
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP) {
482+
if (Event->CommandData) {
483+
struct command_buffer_profiling_t *ProfilingsPtr;
484+
switch (PropName) {
485+
case UR_PROFILING_INFO_COMMAND_START: {
486+
ProfilingsPtr = static_cast<struct command_buffer_profiling_t *>(
487+
Event->CommandData);
488+
// Sync-point order does not necessarily match to the order of
489+
// execution. We therefore look for the first command executed.
490+
uint64_t MinStart = ProfilingsPtr->Timestamps->global.kernelStart;
491+
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
492+
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelStart;
493+
if (Timestamp < MinStart) {
494+
MinStart = Timestamp;
495+
}
496+
}
497+
uint64_t ContextStartTime =
498+
(MinStart & TimestampMaxValue) * ZeTimerResolution;
499+
return ReturnValue(ContextStartTime);
500+
}
501+
case UR_PROFILING_INFO_COMMAND_END: {
502+
ProfilingsPtr = static_cast<struct command_buffer_profiling_t *>(
503+
Event->CommandData);
504+
// Sync-point order does not necessarily match to the order of
505+
// execution. We therefore look for the last command executed.
506+
uint64_t MaxEnd = ProfilingsPtr->Timestamps->global.kernelEnd;
507+
uint64_t LastStart = ProfilingsPtr->Timestamps->global.kernelStart;
508+
for (uint64_t i = 1; i < ProfilingsPtr->NumEvents; i++) {
509+
uint64_t Timestamp = ProfilingsPtr->Timestamps[i].global.kernelEnd;
510+
if (Timestamp > MaxEnd) {
511+
MaxEnd = Timestamp;
512+
LastStart = ProfilingsPtr->Timestamps[i].global.kernelStart;
513+
}
514+
}
515+
uint64_t ContextStartTime = (LastStart & TimestampMaxValue);
516+
uint64_t ContextEndTime = (MaxEnd & TimestampMaxValue);
517+
518+
//
519+
// Handle a possible wrap-around (the underlying HW counter is <
520+
// 64-bit). Note, it will not report correct time if there were multiple
521+
// wrap arounds, and the longer term plan is to enlarge the capacity of
522+
// the HW timestamps.
523+
//
524+
if (ContextEndTime <= ContextStartTime) {
525+
ContextEndTime += TimestampMaxValue;
526+
}
527+
ContextEndTime *= ZeTimerResolution;
528+
return ReturnValue(ContextEndTime);
529+
}
530+
default:
531+
urPrint("urEventGetProfilingInfo: not supported ParamName\n");
532+
return UR_RESULT_ERROR_INVALID_VALUE;
533+
}
534+
} else {
535+
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
536+
}
537+
}
538+
485539
switch (PropName) {
486540
case UR_PROFILING_INFO_COMMAND_START: {
487541
ZE2UR_CALL(zeEventQueryKernelTimestamp, (Event->ZeEvent, &tsResult));
@@ -778,8 +832,10 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
778832
if (Event->CommandType == UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP &&
779833
Event->CommandData) {
780834
// Free the memory extra event allocated for profiling purposed.
781-
auto AssociateEvent = static_cast<ur_event_handle_t>(Event->CommandData);
782-
urEventRelease(AssociateEvent);
835+
struct command_buffer_profiling_t *ProfilingPtr =
836+
static_cast<struct command_buffer_profiling_t *>(Event->CommandData);
837+
urUSMFree(Event->Context, (void *)ProfilingPtr->Timestamps);
838+
delete ProfilingPtr;
783839
Event->CommandData = nullptr;
784840
}
785841
if (Event->OwnNativeHandle) {

0 commit comments

Comments
 (0)