45
45
│ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
46
46
└──────────┴────────────────────────────────────────────────┴─────────┘
47
47
48
- ┌───────────────────┬──────────────────────────────┐
49
- Prefix │Reset signal event │ Barrier waiting on wait event│
50
- └───────────────────┴──────────────────────────────┘
48
+ ┌───────────────────┬──────────────┐──────────────────────────────┐
49
+ Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50
+ └───────────────────┴──────────────┘──────────────────────────────┘
51
51
52
52
┌─────────────────────────────────────────────┐──────────────┐
53
- Suffix │Barrier waiting on sync-point event, │ Reset events │
54
- │signalling the UR command-buffer signal event│ │
53
+ Suffix │Barrier waiting on sync-point event, │ Query CMD │
54
+ │signalling the UR command-buffer signal event│ Timestamps │
55
55
└─────────────────────────────────────────────┘──────────────┘
56
56
57
57
For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
433
433
434
434
ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
435
435
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
436
+ // Dependencies between commands are explicitly enforced by sync points when
437
+ // enqueuing. Consequently, relaxing the command ordering in the command list
438
+ // can enable the backend to further optimize the workload.
439
+ ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
436
440
437
441
ze_command_list_handle_t ZeCommandList;
438
442
// TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -499,13 +503,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
499
503
ZE2UR_CALL (zeCommandListAppendEventReset,
500
504
(CommandBuffer->ZeCommandList , CommandBuffer->WaitEvent ->ZeEvent ));
501
505
502
- // Reset the L0 events we use for command-buffer internal sync-points to the
503
- // non-signalled state
504
- for (auto Event : WaitEventList) {
505
- ZE2UR_CALL (zeCommandListAppendEventReset,
506
- (CommandBuffer->ZeCommandList , Event));
507
- }
508
-
509
506
// Close the command list and have it ready for dispatch.
510
507
ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
511
508
return UR_RESULT_SUCCESS;
@@ -899,14 +896,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
899
896
// Create a command-list that executes before `CommandListPtr` and signals
900
897
// when the `EventWaitList` dependencies are complete.
901
898
ur_command_list_ptr_t WaitCommandList{};
899
+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
900
+ false ));
901
+
902
+ // Create a list of all the events that compose the command buffer
903
+ // workload.
904
+ // This loop also resets the L0 events we use for command-buffer internal
905
+ // sync-points to the non-signalled state.
906
+ // This is required for multiple submissions.
907
+ const size_t NumEvents = CommandBuffer->SyncPoints .size ();
908
+ std::vector<ze_event_handle_t > WaitEventList{NumEvents};
909
+ for (size_t i = 0 ; i < NumEvents; i++) {
910
+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
911
+ WaitEventList[i] = ZeEvent;
912
+ ZE2UR_CALL (zeCommandListAppendEventReset,
913
+ (WaitCommandList->first , ZeEvent));
914
+ }
915
+
902
916
if (NumEventsInWaitList) {
903
917
_ur_ze_event_list_t TmpWaitList;
904
918
UR_CALL (TmpWaitList.createAndRetainUrZeEventList (
905
919
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
906
920
907
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
908
- false , false ))
909
-
910
921
// Update the WaitList of the Wait Event
911
922
// Events are appended to the WaitList if the WaitList is not empty
912
923
if (CommandBuffer->WaitEvent ->WaitList .isEmpty ())
@@ -919,9 +930,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
919
930
CommandBuffer->WaitEvent ->WaitList .Length ,
920
931
CommandBuffer->WaitEvent ->WaitList .ZeEventList ));
921
932
} else {
922
- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
923
- false , false ));
924
-
925
933
ZE2UR_CALL (zeCommandListAppendSignalEvent,
926
934
(WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
927
935
}
@@ -943,22 +951,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
943
951
&(CommandBuffer->SignalEvent ->ZeEvent )));
944
952
945
953
if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
946
- // We create an additional signal specific to the current execution of the
947
- // CommandBuffer. This signal is needed for profiling the execution time
948
- // of the CommandBuffer. It waits for the WaitEvent to be signaled
949
- // which indicates the start of the CommandBuffer actual execution.
950
- // This event is embedded into the Event return to the user to allow
951
- // the profiling engine to retrieve it.
952
- ur_event_handle_t StartEvent{};
953
- UR_CALL (createEventAndAssociateQueue (
954
- Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
955
- WaitCommandList, false ));
956
-
957
- ZE2UR_CALL (zeCommandListAppendBarrier,
958
- (WaitCommandList->first , StartEvent->ZeEvent , 1 ,
959
- &(CommandBuffer->WaitEvent ->ZeEvent )));
960
-
961
- RetEvent->CommandData = StartEvent;
954
+ // Multiple submissions of a command buffer imply that we need to save
955
+ // the event timestamps before resubmitting the command buffer. We
956
+ // therefore copy these timestamps into a dedicated USM memory section
957
+ // before completing the command buffer execution, and then attach this
958
+ // memory to the event returned to users to allow the profiling
959
+ // engine to recover these timestamps.
960
+ ur_usm_desc_t USMDesc{};
961
+ ur_usm_device_desc_t UsmDeviceDesc{};
962
+ UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC;
963
+ ur_usm_host_desc_t UsmHostDesc{};
964
+ UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC;
965
+ UsmDeviceDesc.pNext = &UsmHostDesc;
966
+ USMDesc.pNext = &UsmDeviceDesc;
967
+ USMDesc.align = 4 ; // 4byte-aligned
968
+
969
+ size_t Size = WaitEventList.size () * sizeof (ze_kernel_timestamp_result_t );
970
+
971
+ struct command_buffer_profiling_t *Profiling =
972
+ new command_buffer_profiling_t ();
973
+
974
+ Profiling->NumEvents = WaitEventList.size ();
975
+
976
+ urUSMSharedAlloc (RetEvent->Context , CommandBuffer->Device , &USMDesc,
977
+ nullptr , Size, (void **)&Profiling->Timestamps );
978
+
979
+ ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
980
+ (SignalCommandList->first , WaitEventList.size (),
981
+ WaitEventList.data (), Profiling->Timestamps , 0 ,
982
+ RetEvent->ZeEvent , 1 ,
983
+ &(CommandBuffer->SignalEvent ->ZeEvent )));
984
+
985
+ RetEvent->CommandData = static_cast <void *>(Profiling);
962
986
}
963
987
}
964
988
0 commit comments