diff --git a/sycl/include/sycl/info/device_traits.def b/sycl/include/sycl/info/device_traits.def index 44c4192a338f3..7ca82b196bb82 100644 --- a/sycl/include/sycl/info/device_traits.def +++ b/sycl/include/sycl/info/device_traits.def @@ -235,8 +235,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_2d, id<2>, __SYCL_TR __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_3d, id<3>, UR_DEVICE_INFO_MAX_WORK_GROUPS_3D) __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_global_work_groups, size_t, __SYCL_TRAIT_HANDLED_IN_RT) -__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool, - UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP) +__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool, __SYCL_TRAIT_HANDLED_IN_RT) #ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 8e22fec339447..a4682b8d705b4 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -655,11 +655,11 @@ class device_impl : public std::enable_shared_from_this { } CASE(info::device::ext_oneapi_cuda_cluster_group) { - if (getBackend() != backend::ext_oneapi_cuda) - return false; - - return get_info_impl_nocheck() - .value_or(0); + auto SupportFlags = + get_info_impl(); + return static_cast( + SupportFlags & + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION); } // ext_codeplay_device_traits.def diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 6e5b933fa48d7..58c8c38564d91 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -371,7 +371,7 @@ kernel_impl::queryMaxNumWorkGroups(queue Queue, uint32_t GroupCount{0}; if (auto Result = Adapter->call_nocheck< - UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>( + UrApiKind::urKernelSuggestMaxCooperativeGroupCount>( Handle, DeviceHandleRef, Dimensions, WG, 
DynamicLocalMemorySize, &GroupCount); Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE && diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 783ce3b1412bb..8c8f52249030b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2441,9 +2441,9 @@ static ur_result_t SetKernelParamsAndLaunch( if (EnforcedLocalSize) LocalSize = RequiredWGSize; } - std::vector property_list; + std::vector property_list; if (KernelUsesClusterLaunch) { - ur_exp_launch_property_value_t launch_property_value_cluster_range; + ur_kernel_launch_property_value_t launch_property_value_cluster_range; launch_property_value_cluster_range.clusterDim[0] = NDRDesc.ClusterDimensions[0]; launch_property_value_cluster_range.clusterDim[1] = @@ -2451,48 +2451,27 @@ static ur_result_t SetKernelParamsAndLaunch( launch_property_value_cluster_range.clusterDim[2] = NDRDesc.ClusterDimensions[2]; - property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION, + property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION, launch_property_value_cluster_range}); - - if (IsCooperative) { - ur_exp_launch_property_value_t launch_property_value_cooperative; - launch_property_value_cooperative.cooperative = 1; - property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE, - launch_property_value_cooperative}); - } + } + if (IsCooperative) { + ur_kernel_launch_property_value_t launch_property_value_cooperative; + launch_property_value_cooperative.cooperative = 1; + property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE, + launch_property_value_cooperative}); } // If there is no implicit arg, let the driver handle it via a property if (WorkGroupMemorySize && !ImplicitLocalArg.has_value()) { - property_list.push_back( - {UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, {{WorkGroupMemorySize}}}); - } - if (!property_list.empty()) { - ur_event_handle_t UREvent = nullptr; - ur_result_t Error = - 
Adapter->call_nocheck( - Queue->getHandleRef(), Kernel, NDRDesc.Dims, - &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], LocalSize, - property_list.size(), property_list.data(), RawEvents.size(), - RawEvents.empty() ? nullptr : &RawEvents[0], - OutEventImpl ? &UREvent : nullptr); - if ((Error == UR_RESULT_SUCCESS) && OutEventImpl) { - OutEventImpl->setHandle(UREvent); - } - return Error; + property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, + {{WorkGroupMemorySize}}}); } ur_event_handle_t UREvent = nullptr; - ur_result_t Error = - [&](auto... Args) { - if (IsCooperative) { - return Adapter - ->call_nocheck( - Args...); - } - return Adapter->call_nocheck(Args...); - }(Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], - &NDRDesc.GlobalSize[0], LocalSize, RawEvents.size(), - RawEvents.empty() ? nullptr : &RawEvents[0], - OutEventImpl ? &UREvent : nullptr); + ur_result_t Error = Adapter->call_nocheck( + Queue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], + &NDRDesc.GlobalSize[0], LocalSize, property_list.size(), + property_list.empty() ? nullptr : property_list.data(), RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], + OutEventImpl ? 
&UREvent : nullptr); if (Error == UR_RESULT_SUCCESS && OutEventImpl) { OutEventImpl->setHandle(UREvent); } diff --git a/sycl/source/detail/ur_device_info_ret_types.inc b/sycl/source/detail/ur_device_info_ret_types.inc index 69a68389a732c..7e96275518a77 100644 --- a/sycl/source/detail/ur_device_info_ret_types.inc +++ b/sycl/source/detail/ur_device_info_ret_types.inc @@ -172,7 +172,6 @@ MAP(UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_BINDLESS_SAMPLE_1D_USM_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_BINDLESS_SAMPLE_2D_USM_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_BINDLESS_UNIQUE_ADDRESSING_PER_DIM_SUPPORT_EXP, ur_bool_t) -MAP(UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP, ur_bool_t) @@ -188,4 +187,5 @@ MAP(UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP, uint32_t) MAP(UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP, ur_bool_t) MAP(UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, ur_bool_t) +MAP(UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT, ur_kernel_launch_properties_support_flags_t) // clang-format on diff --git a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp index e15d361a5ca3e..7928e5da66bac 100644 --- a/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp +++ b/sycl/test-e2e/ClusterLaunch/enqueueLaunchCustom_check_event_deps.cpp @@ -1,5 +1,5 @@ // Checks whether or not event Dependencies are honored by -// urEnqueueKernelLaunchCustomExp +// urEnqueueKernelLaunch with cluster dimensions // REQUIRES: target-nvidia, aspect-ext_oneapi_cuda_cluster_group // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out // RUN: %{run} %t.out diff --git 
a/sycl/unittests/helpers/UrMock.hpp b/sycl/unittests/helpers/UrMock.hpp index 583fb0514cb97..934dd9f5f7a84 100644 --- a/sycl/unittests/helpers/UrMock.hpp +++ b/sycl/unittests/helpers/UrMock.hpp @@ -393,10 +393,9 @@ inline ur_result_t mock_urEventGetInfo(void *pParams) { } } -inline ur_result_t -mock_urKernelSuggestMaxCooperativeGroupCountExp(void *pParams) { +inline ur_result_t mock_urKernelSuggestMaxCooperativeGroupCount(void *pParams) { auto params = reinterpret_cast< - ur_kernel_suggest_max_cooperative_group_count_exp_params_t *>(pParams); + ur_kernel_suggest_max_cooperative_group_count_params_t *>(pParams); **params->ppGroupCountRet = 1; return UR_RESULT_SUCCESS; } @@ -571,8 +570,8 @@ template class UrMock { ADD_DEFAULT_OVERRIDE(urProgramGetInfo, mock_urProgramGetInfo) ADD_DEFAULT_OVERRIDE(urKernelGetGroupInfo, mock_urKernelGetGroupInfo) ADD_DEFAULT_OVERRIDE(urEventGetInfo, mock_urEventGetInfo) - ADD_DEFAULT_OVERRIDE(urKernelSuggestMaxCooperativeGroupCountExp, - mock_urKernelSuggestMaxCooperativeGroupCountExp) + ADD_DEFAULT_OVERRIDE(urKernelSuggestMaxCooperativeGroupCount, + mock_urKernelSuggestMaxCooperativeGroupCount) ADD_DEFAULT_OVERRIDE(urDeviceSelectBinary, mock_urDeviceSelectBinary) ADD_DEFAULT_OVERRIDE(urPlatformGetBackendOption, mock_urPlatformGetBackendOption) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index c71523cdd24c3..b607d5f843afb 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -367,10 +367,6 @@ typedef enum ur_function_t { UR_FUNCTION_LOADER_INIT = 201, /// Enumerator for ::urLoaderTearDown UR_FUNCTION_LOADER_TEAR_DOWN = 202, - /// Enumerator for ::urEnqueueCooperativeKernelLaunchExp - UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP = 214, - /// Enumerator for ::urKernelSuggestMaxCooperativeGroupCountExp - UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP = 215, /// Enumerator for ::urProgramGetGlobalVariablePointer 
UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER = 216, /// Enumerator for ::urDeviceGetSelected @@ -381,8 +377,6 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, /// Enumerator for ::urEnqueueTimestampRecordingExp UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, - /// Enumerator for ::urEnqueueKernelLaunchCustomExp - UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP = 224, /// Enumerator for ::urKernelGetSuggestedLocalWorkSize UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, /// Enumerator for ::urBindlessImagesImportExternalMemoryExp @@ -469,6 +463,8 @@ typedef enum ur_function_t { UR_FUNCTION_BINDLESS_IMAGES_GET_IMAGE_MEMORY_HANDLE_TYPE_SUPPORT_EXP = 270, /// Enumerator for ::urBindlessImagesFreeMappedLinearMemoryExp UR_FUNCTION_BINDLESS_IMAGES_FREE_MAPPED_LINEAR_MEMORY_EXP = 271, + /// Enumerator for ::urKernelSuggestMaxCooperativeGroupCount + UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT = 272, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -2309,6 +2305,9 @@ typedef enum ur_device_info_t { UR_DEVICE_INFO_MAX_POWER_LIMIT = 126, /// [::ur_bool_t] support for native bfloat16 conversions UR_DEVICE_INFO_BFLOAT16_CONVERSIONS_NATIVE = 127, + /// [::ur_kernel_launch_properties_support_flags_t] Bitfield of supported + /// kernel launch properties. + UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT = 128, /// [::ur_bool_t] Returns true if the device supports the use of /// command-buffers. UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, @@ -2321,8 +2320,6 @@ typedef enum ur_device_info_t { /// [::ur_bool_t] Returns true if the device supports appending a /// command-buffer as a command inside another command-buffer. 
UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP = 0x1003, - /// [::ur_bool_t] return true if enqueue Cluster Launch is supported - UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP = 0x1111, /// [::ur_bool_t] returns true if the device supports the creation of /// bindless images UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, @@ -2414,14 +2411,9 @@ typedef enum ur_device_info_t { /// [::ur_bool_t] returns true if the device supports enqueueing of /// allocations and frees. UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP = 0x2050, - /// [::ur_bool_t] Returns true if the device supports the use of kernel - /// launch properties. - UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP = 0x3000, /// [::ur_bool_t] Returns true if the device supports the USM P2P /// experimental feature. UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP = 0x4000, - /// [::ur_bool_t] Returns true if the device supports cooperative kernels. - UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP = 0x5000, /// [::ur_bool_t] Returns true if the device supports the multi device /// compile experimental feature. 
UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP = 0x6000, @@ -2992,6 +2984,25 @@ typedef enum ur_device_throttle_reasons_flag_t { /// @brief Bit Mask for validating ur_device_throttle_reasons_flags_t #define UR_DEVICE_THROTTLE_REASONS_FLAGS_MASK 0xffffff80 +/////////////////////////////////////////////////////////////////////////////// +/// @brief Kernel launch properties support +typedef uint32_t ur_kernel_launch_properties_support_flags_t; +typedef enum ur_kernel_launch_properties_support_flag_t { + /// Supports ::UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE and + /// ::urKernelSuggestMaxCooperativeGroupCount + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE = UR_BIT(0), + /// Supports ::UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION = UR_BIT(1), + /// Supports ::UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY = UR_BIT(2), + /// @cond + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_kernel_launch_properties_support_flag_t; +/// @brief Bit Mask for validating ur_kernel_launch_properties_support_flags_t +#define UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAGS_MASK 0xfffffff8 + #if !defined(__GNUC__) #pragma endregion #endif @@ -6760,6 +6771,45 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( /// suggested local work size that will contain the result of the query size_t *pSuggestedLocalWorkSize); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query the maximum number of work groups for a cooperative kernel +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` 
+/// + `NULL == pGroupCountRet` +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If ::UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT returns a +/// value without the +/// ::UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE bit set. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `workDim < 1 || workDim > 3` +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] handle of the device object + ur_device_handle_t hDevice, + /// [in] number of dimensions, from 1 to 3, to specify the work-group + /// work-items + uint32_t workDim, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of local work-items forming a work-group that will execute the + /// kernel function. + const size_t *pLocalWorkSize, + /// [in] size of dynamic shared memory, for each work-group, in bytes, + /// that will be used when the kernel is launched + size_t dynamicSharedMemorySize, + /// [out] pointer to maximum number of groups + uint32_t *pGroupCountRet); + #if !defined(__GNUC__) #pragma endregion #endif @@ -7616,6 +7666,60 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback( #if !defined(__GNUC__) #pragma region enqueue #endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Specifies a launch property id +/// +/// @remarks +/// _Analogues_ +/// - **CUlaunchAttributeID** +typedef enum ur_kernel_launch_property_id_t { + /// The property has no effect. + UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE = 0, + /// Whether to launch a cooperative kernel. + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, + /// work-group cluster dimensions. + UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, + /// Implicit work group memory allocation. 
+ UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, + /// @cond + UR_KERNEL_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_kernel_launch_property_id_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Specifies a launch property value +/// +/// @remarks +/// _Analogues_ +/// - **CUlaunchAttributeValue** +typedef union ur_kernel_launch_property_value_t { + /// [in] dimensions of the cluster (units of work-group) (x, y, z). Each + /// value must be a divisor of the corresponding global work-size + /// dimension (in units of work-group). + uint32_t clusterDim[3]; + /// [in] non-zero value indicates a cooperative kernel + int cooperative; + /// [in] non-zero value indicates the amount of work group memory to + /// allocate in bytes + size_t workgroup_mem_size; + +} ur_kernel_launch_property_value_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Kernel launch property +/// +/// @remarks +/// _Analogues_ +/// - **cuLaunchAttribute** +typedef struct ur_kernel_launch_property_t { + /// [in] launch property id + ur_kernel_launch_property_id_t id; + /// [in][tagged_by(id)] launch property value + ur_kernel_launch_property_value_t value; + +} ur_kernel_launch_property_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command to execute a kernel /// @@ -7651,6 +7755,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback( /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( /// [in] handle of the queue object ur_queue_handle_t hQueue, @@ -7671,6 +7777,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. 
/// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -12106,105 +12217,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( /// [out] A pointer to the native handle of the command-buffer. ur_native_handle_t *phNativeCommandBuffer); -#if !defined(__GNUC__) -#pragma endregion -#endif -// Intel 'oneAPI' Unified Runtime Experimental APIs for Cooperative Kernels -#if !defined(__GNUC__) -#pragma region cooperative_kernels_(experimental) -#endif -/////////////////////////////////////////////////////////////////////////////// -/// @brief Enqueue a command to execute a cooperative kernel -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` -/// + `NULL == pGlobalWorkSize` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + `phEventWaitList == NULL && numEventsInWaitList > 0` -/// + `phEventWaitList != NULL && numEventsInWaitList == 0` -/// + If event objects in phEventWaitList are not valid events. 
-/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. 
- ur_event_handle_t *phEvent); - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Query the maximum number of work groups for a cooperative kernel -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hKernel` -/// + `NULL == hDevice` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pLocalWorkSize` -/// + `NULL == pGroupCountRet` -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] handle of the device object - ur_device_handle_t hDevice, - /// [in] number of dimensions, from 1 to 3, to specify the work-group - /// work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of local work-items forming a work-group that will execute the - /// kernel function. - const size_t *pLocalWorkSize, - /// [in] size of dynamic shared memory, for each work-group, in bytes, - /// that will be used when the kernel is launched - size_t dynamicSharedMemorySize, - /// [out] pointer to maximum number of groups - uint32_t *pGroupCountRet); - #if !defined(__GNUC__) #pragma endregion #endif @@ -12254,153 +12266,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /// array. 
ur_event_handle_t *phEvent); -#if !defined(__GNUC__) -#pragma endregion -#endif -// Intel 'oneAPI' Unified Runtime Experimental APIs for (kernel) Launch -// Properties -#if !defined(__GNUC__) -#pragma region launch_properties_(experimental) -#endif -/////////////////////////////////////////////////////////////////////////////// -/// @brief Specifies a launch property id -/// -/// @remarks -/// _Analogues_ -/// - **CUlaunchAttributeID** -typedef enum ur_exp_launch_property_id_t { - /// The property has no effect - UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, - /// Whether to launch a cooperative kernel - UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, - /// work-group cluster dimensions - UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, - /// Implicit work group memory allocation - UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, - /// @cond - UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff - /// @endcond - -} ur_exp_launch_property_id_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Specifies a launch property value -/// -/// @remarks -/// _Analogues_ -/// - **CUlaunchAttributeValue** -typedef union ur_exp_launch_property_value_t { - /// [in] dimensions of the cluster (units of work-group) (x, y, z). Each - /// value must be a divisor of the corresponding global work-size - /// dimension (in units of work-group). 
- uint32_t clusterDim[3]; - /// [in] non-zero value indicates a cooperative kernel - int cooperative; - /// [in] non-zero value indicates the amount of work group memory to - /// allocate in bytes - size_t workgroup_mem_size; - -} ur_exp_launch_property_value_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Kernel launch property -/// -/// @remarks -/// _Analogues_ -/// - **cuLaunchAttribute** -typedef struct ur_exp_launch_property_t { - /// [in] launch property id - ur_exp_launch_property_id_t id; - /// [in][tagged_by(id)] launch property value - ur_exp_launch_property_value_t value; - -} ur_exp_launch_property_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Launch kernel with custom launch properties -/// -/// @details -/// - Launches the kernel using the specified launch properties -/// - If numPropsInLaunchPropList == 0 then a regular kernel launch is used: -/// `urEnqueueKernelLaunch` -/// - Consult the appropriate adapter driver documentation for details of -/// adapter specific behavior and native error codes that may be returned. 
-/// -/// @remarks -/// _Analogues_ -/// - **cuLaunchKernelEx** -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// + NULL == hQueue -/// + NULL == hKernel -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` -/// + `NULL == pGlobalWorkSize` -/// + `NULL == launchPropList` -/// + NULL == pGlobalWorkSize -/// + numPropsInLaunchpropList != 0 && launchPropList == NULL -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + phEventWaitList == NULL && numEventsInWaitList > 0 -/// + phEventWaitList != NULL && numEventsInWaitList == 0 -/// + If event objects in phEventWaitList are not valid events. 
-/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS -/// + An event in phEventWaitList has ::UR_EVENT_STATUS_ERROR -/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. If nullptr, the runtime implementation - /// will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_exp_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If nullptr, - /// the numEventsInWaitList must be 0, indicating that no wait event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. 
If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent); - #if !defined(__GNUC__) #pragma endregion #endif @@ -13596,17 +13461,17 @@ typedef struct ur_kernel_set_specialization_constants_params_t { } ur_kernel_set_specialization_constants_params_t; /////////////////////////////////////////////////////////////////////////////// -/// @brief Function parameters for urKernelSuggestMaxCooperativeGroupCountExp +/// @brief Function parameters for urKernelSuggestMaxCooperativeGroupCount /// @details Each entry is a pointer to the parameter passed to the function; /// allowing the callback the ability to modify the parameter's value -typedef struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t { +typedef struct ur_kernel_suggest_max_cooperative_group_count_params_t { ur_kernel_handle_t *phKernel; ur_device_handle_t *phDevice; uint32_t *pworkDim; const size_t **ppLocalWorkSize; size_t *pdynamicSharedMemorySize; uint32_t **ppGroupCountRet; -} ur_kernel_suggest_max_cooperative_group_count_exp_params_t; +} ur_kernel_suggest_max_cooperative_group_count_params_t; /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urQueueGetInfo @@ -13953,6 +13818,8 @@ typedef struct ur_enqueue_kernel_launch_params_t { const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; + uint32_t *pnumPropsInLaunchPropList; + const ur_kernel_launch_property_t **plaunchPropList; uint32_t *pnumEventsInWaitList; const ur_event_handle_t **pphEventWaitList; ur_event_handle_t **pphEvent; @@ -14345,24 +14212,6 @@ typedef struct ur_enqueue_write_host_pipe_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_write_host_pipe_params_t; -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function parameters for urEnqueueKernelLaunchCustomExp -/// @details Each 
entry is a pointer to the parameter passed to the function; -/// allowing the callback the ability to modify the parameter's value -typedef struct ur_enqueue_kernel_launch_custom_exp_params_t { - ur_queue_handle_t *phQueue; - ur_kernel_handle_t *phKernel; - uint32_t *pworkDim; - const size_t **ppGlobalWorkOffset; - const size_t **ppGlobalWorkSize; - const size_t **ppLocalWorkSize; - uint32_t *pnumPropsInLaunchPropList; - const ur_exp_launch_property_t **plaunchPropList; - uint32_t *pnumEventsInWaitList; - const ur_event_handle_t **pphEventWaitList; - ur_event_handle_t **pphEvent; -} ur_enqueue_kernel_launch_custom_exp_params_t; - /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urEnqueueEventsWaitWithBarrierExt /// @details Each entry is a pointer to the parameter passed to the function; @@ -14445,22 +14294,6 @@ typedef struct ur_enqueue_command_buffer_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_command_buffer_exp_params_t; -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function parameters for urEnqueueCooperativeKernelLaunchExp -/// @details Each entry is a pointer to the parameter passed to the function; -/// allowing the callback the ability to modify the parameter's value -typedef struct ur_enqueue_cooperative_kernel_launch_exp_params_t { - ur_queue_handle_t *phQueue; - ur_kernel_handle_t *phKernel; - uint32_t *pworkDim; - const size_t **ppGlobalWorkOffset; - const size_t **ppGlobalWorkSize; - const size_t **ppLocalWorkSize; - uint32_t *pnumEventsInWaitList; - const ur_event_handle_t **pphEventWaitList; - ur_event_handle_t **pphEvent; -} ur_enqueue_cooperative_kernel_launch_exp_params_t; - /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urEnqueueTimestampRecordingExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git 
a/unified-runtime/include/ur_api_funcs.def b/unified-runtime/include/ur_api_funcs.def index af2c09896e18a..dccd02176d290 100644 --- a/unified-runtime/include/ur_api_funcs.def +++ b/unified-runtime/include/ur_api_funcs.def @@ -73,7 +73,7 @@ _UR_API(urKernelSetExecInfo) _UR_API(urKernelSetArgSampler) _UR_API(urKernelSetArgMemObj) _UR_API(urKernelSetSpecializationConstants) -_UR_API(urKernelSuggestMaxCooperativeGroupCountExp) +_UR_API(urKernelSuggestMaxCooperativeGroupCount) _UR_API(urQueueGetInfo) _UR_API(urQueueCreate) _UR_API(urQueueRetain) @@ -133,13 +133,11 @@ _UR_API(urEnqueueDeviceGlobalVariableRead) _UR_API(urEnqueueReadHostPipe) _UR_API(urEnqueueWriteHostPipe) _UR_API(urEnqueueEventsWaitWithBarrierExt) -_UR_API(urEnqueueKernelLaunchCustomExp) _UR_API(urEnqueueUSMDeviceAllocExp) _UR_API(urEnqueueUSMSharedAllocExp) _UR_API(urEnqueueUSMHostAllocExp) _UR_API(urEnqueueUSMFreeExp) _UR_API(urEnqueueCommandBufferExp) -_UR_API(urEnqueueCooperativeKernelLaunchExp) _UR_API(urEnqueueTimestampRecordingExp) _UR_API(urEnqueueNativeCommandExp) _UR_API(urUSMHostAlloc) diff --git a/unified-runtime/include/ur_ddi.h b/unified-runtime/include/ur_ddi.h index 6aefa464b78a4..8f8ea3523bd36 100644 --- a/unified-runtime/include/ur_ddi.h +++ b/unified-runtime/include/ur_ddi.h @@ -525,6 +525,12 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgMemObj_t)( typedef ur_result_t(UR_APICALL *ur_pfnKernelSetSpecializationConstants_t)( ur_kernel_handle_t, uint32_t, const ur_specialization_constant_info_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCount +typedef ur_result_t(UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCount_t)( + ur_kernel_handle_t, ur_device_handle_t, uint32_t, const size_t *, size_t, + uint32_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of Kernel functions pointers typedef struct ur_kernel_dditable_t { 
@@ -544,6 +550,8 @@ typedef struct ur_kernel_dditable_t { ur_pfnKernelSetArgSampler_t pfnSetArgSampler; ur_pfnKernelSetArgMemObj_t pfnSetArgMemObj; ur_pfnKernelSetSpecializationConstants_t pfnSetSpecializationConstants; + ur_pfnKernelSuggestMaxCooperativeGroupCount_t + pfnSuggestMaxCooperativeGroupCount; } ur_kernel_dditable_t; /////////////////////////////////////////////////////////////////////////////// @@ -566,40 +574,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( typedef ur_result_t(UR_APICALL *ur_pfnGetKernelProcAddrTable_t)( ur_api_version_t, ur_kernel_dditable_t *); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function-pointer for urKernelSuggestMaxCooperativeGroupCountExp -typedef ur_result_t( - UR_APICALL *ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t)( - ur_kernel_handle_t, ur_device_handle_t, uint32_t, const size_t *, size_t, - uint32_t *); - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Table of KernelExp functions pointers -typedef struct ur_kernel_exp_dditable_t { - ur_pfnKernelSuggestMaxCooperativeGroupCountExp_t - pfnSuggestMaxCooperativeGroupCountExp; -} ur_kernel_exp_dditable_t; - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's KernelExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - /// [in] API version requested - ur_api_version_t version, - /// [in,out] pointer to table of DDI function pointers - ur_kernel_exp_dditable_t *pDdiTable); - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function-pointer for urGetKernelExpProcAddrTable -typedef 
ur_result_t(UR_APICALL *ur_pfnGetKernelExpProcAddrTable_t)( - ur_api_version_t, ur_kernel_exp_dditable_t *); - /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urQueueGetInfo typedef ur_result_t(UR_APICALL *ur_pfnQueueGetInfo_t)(ur_queue_handle_t, @@ -936,7 +910,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnGetGlobalProcAddrTable_t)( /// @brief Function-pointer for urEnqueueKernelLaunch typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunch_t)( ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, - const size_t *, const size_t *, uint32_t, const ur_event_handle_t *, + const size_t *, const size_t *, uint32_t, + const ur_kernel_launch_property_t *, uint32_t, const ur_event_handle_t *, ur_event_handle_t *); /////////////////////////////////////////////////////////////////////////////// @@ -1147,13 +1122,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( typedef ur_result_t(UR_APICALL *ur_pfnGetEnqueueProcAddrTable_t)( ur_api_version_t, ur_enqueue_dditable_t *); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Function-pointer for urEnqueueKernelLaunchCustomExp -typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)( - ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, - const size_t *, const size_t *, uint32_t, const ur_exp_launch_property_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *); - /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEnqueueUSMDeviceAllocExp typedef ur_result_t(UR_APICALL *ur_pfnEnqueueUSMDeviceAllocExp_t)( @@ -1187,13 +1155,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCommandBufferExp_t)( ur_queue_handle_t, ur_exp_command_buffer_handle_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *); -/////////////////////////////////////////////////////////////////////////////// -/// @brief 
Function-pointer for urEnqueueCooperativeKernelLaunchExp -typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)( - ur_queue_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, - const size_t *, const size_t *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *); - /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urEnqueueTimestampRecordingExp typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( @@ -1211,13 +1172,11 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueNativeCommandExp_t)( /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { - ur_pfnEnqueueKernelLaunchCustomExp_t pfnKernelLaunchCustomExp; ur_pfnEnqueueUSMDeviceAllocExp_t pfnUSMDeviceAllocExp; ur_pfnEnqueueUSMSharedAllocExp_t pfnUSMSharedAllocExp; ur_pfnEnqueueUSMHostAllocExp_t pfnUSMHostAllocExp; ur_pfnEnqueueUSMFreeExp_t pfnUSMFreeExp; ur_pfnEnqueueCommandBufferExp_t pfnCommandBufferExp; - ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; ur_pfnEnqueueNativeCommandExp_t pfnNativeCommandExp; } ur_enqueue_exp_dditable_t; @@ -2046,7 +2005,6 @@ typedef struct ur_dditable_t { ur_program_dditable_t Program; ur_program_exp_dditable_t ProgramExp; ur_kernel_dditable_t Kernel; - ur_kernel_exp_dditable_t KernelExp; ur_queue_dditable_t Queue; ur_sampler_dditable_t Sampler; ur_mem_dditable_t Mem; diff --git a/unified-runtime/include/ur_print.h b/unified-runtime/include/ur_print.h index 0d3fa7f5b636c..fc2461c8e46bd 100644 --- a/unified-runtime/include/ur_print.h +++ b/unified-runtime/include/ur_print.h @@ -343,6 +343,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintDeviceThrottleReasonsFlags( enum ur_device_throttle_reasons_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_launch_properties_support_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchPropertiesSupportFlags( + enum ur_kernel_launch_properties_support_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_context_flag_t enum /// @returns @@ -1101,6 +1111,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExecutionInfo(enum ur_execution_info_t value, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_launch_property_id_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchPropertyId( + enum ur_kernel_launch_property_id_t value, char *buffer, + const size_t buff_size, size_t *out_size); + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_launch_property_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelLaunchProperty( + const struct ur_kernel_launch_property_t params, char *buffer, + const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_map_flag_t enum /// @returns @@ -1375,26 +1405,6 @@ urPrintExpCommandBufferUpdateKernelLaunchDesc( const struct ur_exp_command_buffer_update_kernel_launch_desc_t params, char *buffer, const size_t buff_size, size_t *out_size); 
-/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_exp_launch_property_id_t enum -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL -urPrintExpLaunchPropertyId(enum ur_exp_launch_property_id_t value, char *buffer, - const size_t buff_size, size_t *out_size); - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_exp_launch_property_t struct -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintExpLaunchProperty( - const struct ur_exp_launch_property_t params, char *buffer, - const size_t buff_size, size_t *out_size); - /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_peer_info_t enum /// @returns @@ -2081,16 +2091,14 @@ urPrintKernelSetSpecializationConstantsParams( char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_kernel_suggest_max_cooperative_group_count_exp_params_t -/// struct +/// @brief Print ur_kernel_suggest_max_cooperative_group_count_params_t struct /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_SIZE /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL -urPrintKernelSuggestMaxCooperativeGroupCountExpParams( - const struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t - *params, +urPrintKernelSuggestMaxCooperativeGroupCountParams( + const struct ur_kernel_suggest_max_cooperative_group_count_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); /////////////////////////////////////////////////////////////////////////////// @@ -2676,16 +2684,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams( const struct 
ur_enqueue_write_host_pipe_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_enqueue_kernel_launch_custom_exp_params_t struct -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueKernelLaunchCustomExpParams( - const struct ur_enqueue_kernel_launch_custom_exp_params_t *params, - char *buffer, const size_t buff_size, size_t *out_size); - /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_enqueue_events_wait_with_barrier_ext_params_t struct /// @returns @@ -2747,17 +2745,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCommandBufferExpParams( const struct ur_enqueue_command_buffer_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print ur_enqueue_cooperative_kernel_launch_exp_params_t struct -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_SIZE -/// - `buff_size < out_size` -UR_APIEXPORT ur_result_t UR_APICALL -urPrintEnqueueCooperativeKernelLaunchExpParams( - const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, - char *buffer, const size_t buff_size, size_t *out_size); - /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_enqueue_timestamp_recording_exp_params_t struct /// @returns diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index c981ebdb8a201..c687b3a81998f 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -106,6 +106,11 @@ template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); +template <> +inline ur_result_t +printFlag(std::ostream &os, + uint32_t flag); + 
template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); @@ -217,6 +222,11 @@ template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_profiling_info_t value, size_t size); +inline ur_result_t +printUnion(std::ostream &os, + const union ur_kernel_launch_property_value_t params, + const enum ur_kernel_launch_property_id_t tag); + template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); @@ -250,10 +260,6 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_command_buffer_command_info_t value, size_t size); -inline ur_result_t printUnion(std::ostream &os, - const union ur_exp_launch_property_value_t params, - const enum ur_exp_launch_property_id_t tag); - template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_peer_info_t value, size_t size); @@ -335,6 +341,9 @@ inline std::ostream & operator<<(std::ostream &os, enum ur_device_usm_access_capability_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_device_throttle_reasons_flag_t value); +inline std::ostream & +operator<<(std::ostream &os, + enum ur_kernel_launch_properties_support_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_context_flag_t value); inline std::ostream & operator<<(std::ostream &os, @@ -504,6 +513,11 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct ur_event_native_properties_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_execution_info_t value); +inline std::ostream &operator<<(std::ostream &os, + enum ur_kernel_launch_property_id_t value); +inline std::ostream & +operator<<(std::ostream &os, + [[maybe_unused]] const struct ur_kernel_launch_property_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_map_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_usm_migration_flag_t value); @@ -576,11 +590,6 @@ inline std::ostream &operator<<( inline std::ostream & 
operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_command_buffer_update_kernel_launch_desc_t params); -inline std::ostream &operator<<(std::ostream &os, - enum ur_exp_launch_property_id_t value); -inline std::ostream & -operator<<(std::ostream &os, - [[maybe_unused]] const struct ur_exp_launch_property_t params); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_peer_info_t value); inline std::ostream &operator<<(std::ostream &os, @@ -1102,12 +1111,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_LOADER_TEAR_DOWN: os << "UR_FUNCTION_LOADER_TEAR_DOWN"; break; - case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: - os << "UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP"; - break; - case UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP: - os << "UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP"; - break; case UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER: os << "UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER"; break; @@ -1123,9 +1126,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; break; - case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP: - os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP"; - break; case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE"; break; @@ -1256,6 +1256,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_BINDLESS_IMAGES_FREE_MAPPED_LINEAR_MEMORY_EXP: os << "UR_FUNCTION_BINDLESS_IMAGES_FREE_MAPPED_LINEAR_MEMORY_EXP"; break; + case UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT: + os << "UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT"; + break; default: os << "unknown enumerator"; break; @@ -2981,6 +2984,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t 
value) { case UR_DEVICE_INFO_BFLOAT16_CONVERSIONS_NATIVE: os << "UR_DEVICE_INFO_BFLOAT16_CONVERSIONS_NATIVE"; break; + case UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT: + os << "UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT"; + break; case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: os << "UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP"; break; @@ -2993,9 +2999,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP: os << "UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP"; break; - case UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP: - os << "UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP"; - break; case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: os << "UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP"; break; @@ -3089,15 +3092,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP: os << "UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP"; break; - case UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP: - os << "UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP"; - break; case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: os << "UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP"; break; - case UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP: - os << "UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP"; - break; case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: os << "UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP"; break; @@ -4726,6 +4723,21 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; + case UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT: { + const ur_kernel_launch_properties_support_flags_t *tptr = + (const ur_kernel_launch_properties_support_flags_t *)ptr; + if (sizeof(ur_kernel_launch_properties_support_flags_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" + << sizeof(ur_kernel_launch_properties_support_flags_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << 
(const void *)(tptr) << " ("; + + ur::details::printFlag(os, + *tptr); + + os << ")"; + } break; case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { @@ -4780,19 +4792,6 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; - case UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP: { - const ur_bool_t *tptr = (const ur_bool_t *)ptr; - if (sizeof(ur_bool_t) > size) { - os << "invalid size (is: " << size - << ", expected: >=" << sizeof(ur_bool_t) << ")"; - return UR_RESULT_ERROR_INVALID_SIZE; - } - os << (const void *)(tptr) << " ("; - - os << *tptr; - - os << ")"; - } break; case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { @@ -5198,19 +5197,6 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; - case UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP: { - const ur_bool_t *tptr = (const ur_bool_t *)ptr; - if (sizeof(ur_bool_t) > size) { - os << "invalid size (is: " << size - << ", expected: >=" << sizeof(ur_bool_t) << ")"; - return UR_RESULT_ERROR_INVALID_SIZE; - } - os << (const void *)(tptr) << " ("; - - os << *tptr; - - os << ")"; - } break; case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { @@ -5224,19 +5210,6 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, os << ")"; } break; - case UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP: { - const ur_bool_t *tptr = (const ur_bool_t *)ptr; - if (sizeof(ur_bool_t) > size) { - os << "invalid size (is: " << size - << ", expected: >=" << sizeof(ur_bool_t) << ")"; - return UR_RESULT_ERROR_INVALID_SIZE; - } - os << (const void *)(tptr) << " ("; - - os << *tptr; - - os << ")"; - } break; case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: { const ur_bool_t *tptr = (const ur_bool_t *)ptr; if (sizeof(ur_bool_t) > size) { @@ 
-6204,6 +6177,85 @@ printFlag(std::ostream &os, uint32_t flag) { } } // namespace ur::details /////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_kernel_launch_properties_support_flag_t +/// type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, + enum ur_kernel_launch_properties_support_flag_t value) { + switch (value) { + case UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE: + os << "UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE"; + break; + case UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION: + os << "UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION"; + break; + case UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY: + os << "UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_launch_properties_support_flag_t flag +template <> +inline ur_result_t +printFlag(std::ostream &os, + uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE) == + (uint32_t)UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE) { + val ^= (uint32_t)UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE; + } + + if ((val & UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION) == + (uint32_t)UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION) { + val ^= (uint32_t)UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION; + } + + if ((val & 
UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY) == + (uint32_t)UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY) { + val ^= (uint32_t)UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_context_flag_t type /// @returns /// std::ostream & @@ -10705,6 +10757,96 @@ inline std::ostream &operator<<(std::ostream &os, return os; } /////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_kernel_launch_property_id_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, + enum ur_kernel_launch_property_id_t value) { + switch (value) { + case UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE: + os << "UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE"; + break; + case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: + os << "UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE"; + break; + case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: + os << "UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; + break; + case UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + os << "UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} +namespace ur::details { + +/////////////////////////////////////////////////////////////////////////////// +// @brief Print ur_kernel_launch_property_value_t union +inline ur_result_t +printUnion(std::ostream &os, + const union ur_kernel_launch_property_value_t params, + const enum ur_kernel_launch_property_id_t tag) { + os << "(union 
ur_kernel_launch_property_value_t){"; + + switch (tag) { + case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: + + os << ".clusterDim = {"; + for (auto i = 0; i < 3; i++) { + if (i != 0) { + os << ", "; + } + + os << (params.clusterDim[i]); + } + os << "}"; + + break; + case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: + + os << ".cooperative = "; + + os << (params.cooperative); + + break; + case UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + + os << ".workgroup_mem_size = "; + + os << (params.workgroup_mem_size); + + break; + default: + os << ""; + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + os << "}"; + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_kernel_launch_property_t type +/// @returns +/// std::ostream & +inline std::ostream & +operator<<(std::ostream &os, const struct ur_kernel_launch_property_t params) { + os << "(struct ur_kernel_launch_property_t){"; + + os << ".id = "; + + os << (params.id); + + os << ", "; + os << ".value = "; + ur::details::printUnion(os, (params.value), params.id); + + os << "}"; + return os; +} +/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_map_flag_t type /// @returns /// std::ostream & @@ -11953,95 +12095,6 @@ inline std::ostream &operator<<( return os; } /////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_exp_launch_property_id_t type -/// @returns -/// std::ostream & -inline std::ostream &operator<<(std::ostream &os, - enum ur_exp_launch_property_id_t value) { - switch (value) { - case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: - os << "UR_EXP_LAUNCH_PROPERTY_ID_IGNORE"; - break; - case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: - os << "UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE"; - break; - case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: - os << 
"UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; - break; - case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: - os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY"; - break; - default: - os << "unknown enumerator"; - break; - } - return os; -} -namespace ur::details { - -/////////////////////////////////////////////////////////////////////////////// -// @brief Print ur_exp_launch_property_value_t union -inline ur_result_t printUnion(std::ostream &os, - const union ur_exp_launch_property_value_t params, - const enum ur_exp_launch_property_id_t tag) { - os << "(union ur_exp_launch_property_value_t){"; - - switch (tag) { - case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: - - os << ".clusterDim = {"; - for (auto i = 0; i < 3; i++) { - if (i != 0) { - os << ", "; - } - - os << (params.clusterDim[i]); - } - os << "}"; - - break; - case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: - - os << ".cooperative = "; - - os << (params.cooperative); - - break; - case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: - - os << ".workgroup_mem_size = "; - - os << (params.workgroup_mem_size); - - break; - default: - os << ""; - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - os << "}"; - return UR_RESULT_SUCCESS; -} -} // namespace ur::details -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_exp_launch_property_t type -/// @returns -/// std::ostream & -inline std::ostream &operator<<(std::ostream &os, - const struct ur_exp_launch_property_t params) { - os << "(struct ur_exp_launch_property_t){"; - - os << ".id = "; - - os << (params.id); - - os << ", "; - os << ".value = "; - ur::details::printUnion(os, (params.value), params.id); - - os << "}"; - return os; -} -/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_peer_info_t type /// @returns /// std::ostream & @@ -14168,12 +14221,12 @@ operator<<(std::ostream &os, [[maybe_unused]] const struct 
/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the -/// ur_kernel_suggest_max_cooperative_group_count_exp_params_t type +/// ur_kernel_suggest_max_cooperative_group_count_params_t type /// @returns /// std::ostream & inline std::ostream & operator<<(std::ostream &os, [[maybe_unused]] const struct - ur_kernel_suggest_max_cooperative_group_count_exp_params_t *params) { + ur_kernel_suggest_max_cooperative_group_count_params_t *params) { os << ".hKernel = "; @@ -15108,6 +15161,27 @@ inline std::ostream &operator<<( ur::details::printPtr(os, *(params->ppLocalWorkSize)); + os << ", "; + os << ".numPropsInLaunchPropList = "; + + os << *(params->pnumPropsInLaunchPropList); + + os << ", "; + os << ".launchPropList = "; + ur::details::printPtr( + os, reinterpret_cast(*(params->plaunchPropList))); + if (*(params->plaunchPropList) != NULL) { + os << " {"; + for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) { + if (i != 0) { + os << ", "; + } + + os << (*(params->plaunchPropList))[i]; + } + os << "}"; + } + os << ", "; os << ".numEventsInWaitList = "; @@ -16733,95 +16807,6 @@ inline std::ostream &operator<<( return os; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the ur_enqueue_kernel_launch_custom_exp_params_t -/// type -/// @returns -/// std::ostream & -inline std::ostream &operator<<( - std::ostream &os, - [[maybe_unused]] const struct ur_enqueue_kernel_launch_custom_exp_params_t - *params) { - - os << ".hQueue = "; - - ur::details::printPtr(os, *(params->phQueue)); - - os << ", "; - os << ".hKernel = "; - - ur::details::printPtr(os, *(params->phKernel)); - - os << ", "; - os << ".workDim = "; - - os << *(params->pworkDim); - - os << ", "; - os << ".pGlobalWorkOffset = "; - - ur::details::printPtr(os, *(params->ppGlobalWorkOffset)); - - os << ", "; - os << ".pGlobalWorkSize = "; - - ur::details::printPtr(os, 
*(params->ppGlobalWorkSize)); - - os << ", "; - os << ".pLocalWorkSize = "; - - ur::details::printPtr(os, *(params->ppLocalWorkSize)); - - os << ", "; - os << ".numPropsInLaunchPropList = "; - - os << *(params->pnumPropsInLaunchPropList); - - os << ", "; - os << ".launchPropList = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->plaunchPropList))); - if (*(params->plaunchPropList) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumPropsInLaunchPropList; ++i) { - if (i != 0) { - os << ", "; - } - - os << (*(params->plaunchPropList))[i]; - } - os << "}"; - } - - os << ", "; - os << ".numEventsInWaitList = "; - - os << *(params->pnumEventsInWaitList); - - os << ", "; - os << ".phEventWaitList = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->pphEventWaitList))); - if (*(params->pphEventWaitList) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumEventsInWaitList; ++i) { - if (i != 0) { - os << ", "; - } - - ur::details::printPtr(os, (*(params->pphEventWaitList))[i]); - } - os << "}"; - } - - os << ", "; - os << ".phEvent = "; - - ur::details::printPtr(os, *(params->pphEvent)); - - return os; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the /// ur_enqueue_events_wait_with_barrier_ext_params_t type @@ -17153,73 +17138,6 @@ operator<<(std::ostream &os, return os; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Print operator for the -/// ur_enqueue_cooperative_kernel_launch_exp_params_t type -/// @returns -/// std::ostream & -inline std::ostream & -operator<<(std::ostream &os, [[maybe_unused]] const struct - ur_enqueue_cooperative_kernel_launch_exp_params_t *params) { - - os << ".hQueue = "; - - ur::details::printPtr(os, *(params->phQueue)); - - os << ", "; - os << ".hKernel = "; - - ur::details::printPtr(os, *(params->phKernel)); - - os << ", "; - os << ".workDim = "; - - os << *(params->pworkDim); - - os 
<< ", "; - os << ".pGlobalWorkOffset = "; - - ur::details::printPtr(os, *(params->ppGlobalWorkOffset)); - - os << ", "; - os << ".pGlobalWorkSize = "; - - ur::details::printPtr(os, *(params->ppGlobalWorkSize)); - - os << ", "; - os << ".pLocalWorkSize = "; - - ur::details::printPtr(os, *(params->ppLocalWorkSize)); - - os << ", "; - os << ".numEventsInWaitList = "; - - os << *(params->pnumEventsInWaitList); - - os << ", "; - os << ".phEventWaitList = "; - ur::details::printPtr( - os, reinterpret_cast(*(params->pphEventWaitList))); - if (*(params->pphEventWaitList) != NULL) { - os << " {"; - for (size_t i = 0; i < *params->pnumEventsInWaitList; ++i) { - if (i != 0) { - os << ", "; - } - - ur::details::printPtr(os, (*(params->pphEventWaitList))[i]); - } - os << "}"; - } - - os << ", "; - os << ".phEvent = "; - - ur::details::printPtr(os, *(params->pphEvent)); - - return os; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_enqueue_timestamp_recording_exp_params_t /// type @@ -21065,9 +20983,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, os << (const struct ur_kernel_set_specialization_constants_params_t *) params; } break; - case UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP: { - os << (const struct - ur_kernel_suggest_max_cooperative_group_count_exp_params_t *)params; + case UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT: { + os << (const struct ur_kernel_suggest_max_cooperative_group_count_params_t + *)params; } break; case UR_FUNCTION_QUEUE_GET_INFO: { os << (const struct ur_queue_get_info_params_t *)params; @@ -21247,9 +21165,6 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE: { os << (const struct ur_enqueue_write_host_pipe_params_t *)params; } break; - case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP: { - os << (const struct ur_enqueue_kernel_launch_custom_exp_params_t *)params; - } 
break; case UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER_EXT: { os << (const struct ur_enqueue_events_wait_with_barrier_ext_params_t *) params; @@ -21269,10 +21184,6 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, case UR_FUNCTION_ENQUEUE_COMMAND_BUFFER_EXP: { os << (const struct ur_enqueue_command_buffer_exp_params_t *)params; } break; - case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: { - os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *) - params; - } break; case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: { os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params; } break; diff --git a/unified-runtime/scripts/core/EXP-COOPERATIVE-KERNELS.rst b/unified-runtime/scripts/core/EXP-COOPERATIVE-KERNELS.rst deleted file mode 100644 index ba055c48ff7a9..0000000000000 --- a/unified-runtime/scripts/core/EXP-COOPERATIVE-KERNELS.rst +++ /dev/null @@ -1,72 +0,0 @@ -<% - OneApi=tags['$OneApi'] - x=tags['$x'] - X=x.upper() -%> - -.. _experimental-cooperative-kernels: - -================================================================================ -Cooperative Kernels -================================================================================ - -.. warning:: - - Experimental features: - - * May be replaced, updated, or removed at any time. - * Do not require maintaining API/ABI stability of their own additions over - time. - * Do not require conformance testing of their own additions. - - -Motivation --------------------------------------------------------------------------------- -Cooperative kernels are kernels that use cross-workgroup synchronization -features. All enqueued workgroups must run concurrently for cooperative kernels -to execute without hanging. This experimental feature provides an API for -querying the maximum number of workgroups and launching cooperative kernels. - -Any device can support cooperative kernels by restricting the maximum number of -workgroups to 1. 
Devices that support cross-workgroup synchronization can -specify a larger maximum for a given cooperative kernel. - -The functions defined here align with those specified in Level Zero. - -API --------------------------------------------------------------------------------- - -Enums -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* ${x}_device_info_t - * ${X}_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP - -Functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* ${x}EnqueueCooperativeKernelLaunchExp -* ${x}KernelSuggestMaxCooperativeGroupCountExp - -Changelog --------------------------------------------------------------------------------- -+-----------+---------------------------------------------+ -| Revision | Changes | -+===========+=============================================+ -| 1.0 | Initial Draft | -+-----------+---------------------------------------------+ -| 1.1 | Switch from extension string macro to | -| | device info enum for reporting support. | -+-----------+---------------------------------------------+ - -Support --------------------------------------------------------------------------------- - -Adapters which support this experimental feature *must* return ``true`` when -queried for ${X}_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP via -${x}DeviceGetInfo. Conversely, before using any of the functionality defined in -this experimental feature the user *must* use the device query to determine if -the adapter supports this feature. 
- -Contributors --------------------------------------------------------------------------------- -* Michael Aziz `michael.aziz@intel.com `_ -* Aaron Greig `aaron.greig@codeplay.com `_ diff --git a/unified-runtime/scripts/core/EXP-LAUNCH-PROPERTIES.rst b/unified-runtime/scripts/core/EXP-LAUNCH-PROPERTIES.rst deleted file mode 100644 index 81703aaca503b..0000000000000 --- a/unified-runtime/scripts/core/EXP-LAUNCH-PROPERTIES.rst +++ /dev/null @@ -1,107 +0,0 @@ -<% - OneApi=tags['$OneApi'] - x=tags['$x'] - X=x.upper() -%> - -.. _experimental-launch-properties: - -================================================================================ -LAUNCH Properties -================================================================================ - -.. warning:: - - Experimental features: - - * May be replaced, updated, or removed at any time. - * Do not require maintaining API/ABI stability of their own additions over - time. - * Do not require conformance testing of their own additions. - - -Terminology --------------------------------------------------------------------------------- -"Launch Properties" is used to indicate optional kernel launch properties that -can be specified at the time of a kernel launch. Such properties can be used to -enable hardware specific kernel launch features. - -Motivation --------------------------------------------------------------------------------- -Advances in hardware sometimes require new kernel properties. One example is -distributed shared memory as used by Nvidia Hopper GPUs. Launching a kernel -that supports distributed shared memory requires specifying a set of "cluster" -dimensions, in units of work-groups, over which the shared memory is -"distributed". Additionally some applications require specification of kernel -properties at launch-time. - -This extension is a future-proof and portable solution that supports these two -requirements. 
Instead of using a fixed set of kernel enqueue arguments, the -approach is to introduce the ${x}_exp_launch_property_t type that enables a -more extendable API. - -Each ${x}_exp_launch_property_t instance corresponds to a specific kernel -launch property. -Only one new function is introduced: ${x}EnqueueKernelLaunchCustomExp. -${x}EnqueueKernelLaunchCustomExp takes an array of ${x}_exp_launch_property_t -as an argument, and launches a kernel using these properties. -${x}EnqueueKernelLaunchCustomExp corresponds closely to the CUDA Driver API -``cuLaunchKernelEx``. - -Many kernel lauch properties can be supported, such as cooperative kernel -launches. As such, eventually this extension should be able to replace the -cooperative kernels Unified-Runtime extension. - -API --------------------------------------------------------------------------------- - -Enums -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -* ${x}_device_info_t - * ${X}_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP - -* ${x}_exp_launch_property_id_t - -Unions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -* ${x}_exp_launch_property_value_t - -Structs -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -* ${x}_exp_launch_property_t - -Functions -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -* ${x}EnqueueKernelLaunchCustomExp - -Support --------------------------------------------------------------------------------- - -Adapters which support this experimental feature *must* return ``true`` when -queried for ${X}_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP via -${x}DeviceGetInfo. Conversely, before using any of the functionality defined -in this experimental feature the user *must* use the device query to determine -if the adapter supports this feature. 
- - -Changelog --------------------------------------------------------------------------------- - -+-----------+---------------------------------------------+ -| Revision | Changes | -+===========+=============================================+ -| 1.0 | Initial Draft | -+-----------+---------------------------------------------+ -| 1.1 | Switch from extension string macro to | -| | device info enum for reporting support. | -+-----------+---------------------------------------------+ - -Contributors --------------------------------------------------------------------------------- - -* JackAKirk `jack.kirk@codeplay.com `_ -* Aaron Greig `aaron.greig@codeplay.com `_ diff --git a/unified-runtime/scripts/core/PROG.rst b/unified-runtime/scripts/core/PROG.rst index 23f37327a5ade..28d8397927b5e 100644 --- a/unified-runtime/scripts/core/PROG.rst +++ b/unified-runtime/scripts/core/PROG.rst @@ -293,7 +293,7 @@ event dependencies that are passed to each Enqueue command. const size_t gWorkSize = {128, 128, 128}; const size_t lWorkSize = {1, 8, 8}; ${x}EnqueueKernelLaunch(hQueue, hKernel, nDim, gWorkOffset, gWorkSize, - lWorkSize, 0, nullptr, nullptr); + lWorkSize, 0, nullptr, 0, nullptr, nullptr); Queue object lifetime --------------------- diff --git a/unified-runtime/scripts/core/device.yml b/unified-runtime/scripts/core/device.yml index 12281af495163..4d2932ffd4be1 100644 --- a/unified-runtime/scripts/core/device.yml +++ b/unified-runtime/scripts/core/device.yml @@ -462,6 +462,8 @@ etors: desc: "[int32_t][optional-query] return max power limit in milliwatts." - name: BFLOAT16_CONVERSIONS_NATIVE desc: "[$x_bool_t] support for native bfloat16 conversions" + - name: KERNEL_LAUNCH_PROPERTIES_SUPPORT + desc: "[$x_kernel_launch_properties_support_flags_t] Bitfield of supported kernel launch properties." 
--- #-------------------------------------------------------------------------- type: function desc: "Retrieves various information about device" @@ -971,3 +973,18 @@ etors: - name: OTHER desc: "The clock frequency is throttled due to other reason." value: "$X_BIT(6)" +--- #-------------------------------------------------------------------------- +type: enum +desc: "Kernel launch properties support" +class: $xDevice +name: $x_kernel_launch_properties_support_flags_t +etors: + - name: COOPERATIVE + desc: "Supports $X_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE and $xKernelSuggestMaxCooperativeGroupCount" + value: "$X_BIT(0)" + - name: CLUSTER_DIMENSION + desc: "Supports $X_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION" + value: "$X_BIT(1)" + - name: WORK_GROUP_MEMORY + desc: "Supports $X_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY" + value: "$X_BIT(2)" diff --git a/unified-runtime/scripts/core/enqueue.yml b/unified-runtime/scripts/core/enqueue.yml index e44f22f58c3ce..1aef60149bce8 100644 --- a/unified-runtime/scripts/core/enqueue.yml +++ b/unified-runtime/scripts/core/enqueue.yml @@ -12,6 +12,56 @@ type: header desc: "Intel $OneApi Unified Runtime APIs" ordinal: "10" --- #-------------------------------------------------------------------------- +type: enum +desc: "Specifies a launch property id" +name: $x_kernel_launch_property_id_t +analogue: + - "**CUlaunchAttributeID**" +etors: + - name: IGNORE + desc: "The property has no effect." + - name: COOPERATIVE + desc: "Whether to launch a cooperative kernel." + - name: CLUSTER_DIMENSION + desc: "work-group cluster dimensions." + - name: WORK_GROUP_MEMORY + desc: "Implicit work group memory allocation." 
+--- #-------------------------------------------------------------------------- +type: union +desc: "Specifies a launch property value" +name: $x_kernel_launch_property_value_t +tag: $x_kernel_launch_property_id_t +analogue: + - "**CUlaunchAttributeValue**" +members: + - type: uint32_t[3] + name: clusterDim + desc: "[in] dimensions of the cluster (units of work-group) (x, y, z). Each value must be a divisor of the corresponding global work-size dimension (in units of work-group)." + tag: $X_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION + - type: int + name: cooperative + desc: "[in] non-zero value indicates a cooperative kernel" + tag: $X_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE + - type: size_t + name: workgroup_mem_size + desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes" + tag: $X_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY +--- #-------------------------------------------------------------------------- +type: struct +desc: "Kernel launch property" +name: $x_kernel_launch_property_t +analogue: + - "**cuLaunchAttribute**" +members: + - type: $x_kernel_launch_property_id_t + name: id + desc: "[in] launch property id" + init: $X_KERNEL_LAUNCH_PROPERTY_ID_IGNORE + - type: $x_kernel_launch_property_value_t + name: value + desc: "[in][tagged_by(id)] launch property value" + init: nullptr +--- #-------------------------------------------------------------------------- type: function desc: "Enqueue a command to execute a kernel" class: $xEnqueue @@ -42,6 +92,12 @@ params: desc: | [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. If nullptr, the runtime implementation will choose the work-group size. 
+ - type: uint32_t + name: numPropsInLaunchPropList + desc: "[in] size of the launch prop list" + - type: const $x_kernel_launch_property_t* + name: launchPropList + desc: "[in][optional][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" @@ -69,6 +125,8 @@ returns: - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES + - $X_RESULT_ERROR_INVALID_OPERATION: + - "If any property in `launchPropList` isn't supported by the device." --- #-------------------------------------------------------------------------- type: function desc: "Enqueue a command which waits a list of events to complete before it completes" diff --git a/unified-runtime/scripts/core/exp-cooperative-kernels.yml b/unified-runtime/scripts/core/exp-cooperative-kernels.yml deleted file mode 100644 index 9fbe0d8f8bb35..0000000000000 --- a/unified-runtime/scripts/core/exp-cooperative-kernels.yml +++ /dev/null @@ -1,101 +0,0 @@ -# -# Copyright (C) 2023 Intel Corporation -# -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# See YaML.md for syntax definition -# ---- #-------------------------------------------------------------------------- -type: header -desc: "Intel $OneApi Unified Runtime Experimental APIs for Cooperative Kernels" -ordinal: "99" ---- #-------------------------------------------------------------------------- -type: enum -extend: true -typed_etors: true -desc: "Extension enums for $x_device_info_t to support cooperative kernels." -name: $x_device_info_t -etors: - - name: COOPERATIVE_KERNEL_SUPPORT_EXP - value: "0x5000" - desc: "[$x_bool_t] Returns true if the device supports cooperative kernels." 
---- #-------------------------------------------------------------------------- -type: function -desc: "Enqueue a command to execute a cooperative kernel" -class: $xEnqueue -name: CooperativeKernelLaunchExp -params: - - type: $x_queue_handle_t - name: hQueue - desc: "[in] handle of the queue object" - - type: $x_kernel_handle_t - name: hKernel - desc: "[in] handle of the kernel object" - - type: uint32_t - name: workDim - desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" - - type: "const size_t*" - name: pGlobalWorkOffset - desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - - type: "const size_t*" - name: pGlobalWorkSize - desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" - - type: "const size_t*" - name: pLocalWorkSize - desc: | - [in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. - If nullptr, the runtime implementation will choose the work-group size. - - type: uint32_t - name: numEventsInWaitList - desc: "[in] size of the event wait list" - - type: "const $x_event_handle_t*" - name: phEventWaitList - desc: | - [in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. - If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. - - type: $x_event_handle_t* - name: phEvent - desc: | - [out][optional][alloc] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array. 
-returns: - - $X_RESULT_ERROR_INVALID_QUEUE - - $X_RESULT_ERROR_INVALID_KERNEL - - $X_RESULT_ERROR_INVALID_EVENT - - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - - "`phEventWaitList == NULL && numEventsInWaitList > 0`" - - "`phEventWaitList != NULL && numEventsInWaitList == 0`" - - "If event objects in phEventWaitList are not valid events." - - $X_RESULT_ERROR_INVALID_WORK_DIMENSION - - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE - - $X_RESULT_ERROR_INVALID_VALUE - - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - - $X_RESULT_ERROR_OUT_OF_RESOURCES ---- #-------------------------------------------------------------------------- -type: function -desc: "Query the maximum number of work groups for a cooperative kernel" -class: $xKernel -name: SuggestMaxCooperativeGroupCountExp -params: - - type: $x_kernel_handle_t - name: hKernel - desc: "[in] handle of the kernel object" - - type: $x_device_handle_t - name: hDevice - desc: "[in] handle of the device object" - - type: uint32_t - name: workDim - desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items" - - type: "const size_t*" - name: pLocalWorkSize - desc: | - [in] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. 
- - type: size_t - name: dynamicSharedMemorySize - desc: "[in] size of dynamic shared memory, for each work-group, in bytes, that will be used when the kernel is launched" - - type: "uint32_t*" - name: "pGroupCountRet" - desc: "[out] pointer to maximum number of groups" -returns: - - $X_RESULT_ERROR_INVALID_KERNEL diff --git a/unified-runtime/scripts/core/exp-launch-properties.yml b/unified-runtime/scripts/core/exp-launch-properties.yml deleted file mode 100644 index 558dd46cc8e6d..0000000000000 --- a/unified-runtime/scripts/core/exp-launch-properties.yml +++ /dev/null @@ -1,154 +0,0 @@ -# -# Copyright (C) 2024 Intel Corporation -# -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# See YaML.md for syntax definition -# ---- #-------------------------------------------------------------------------- -type: header -desc: "Intel $OneApi Unified Runtime Experimental APIs for (kernel) Launch Properties" -ordinal: "99" ---- #-------------------------------------------------------------------------- -type: enum -extend: true -typed_etors: true -desc: "Extension enums for $x_device_info_t to support launch properties." -name: $x_device_info_t -etors: - - name: LAUNCH_PROPERTIES_SUPPORT_EXP - value: "0x3000" - desc: "[$x_bool_t] Returns true if the device supports the use of kernel launch properties." 
---- #-------------------------------------------------------------------------- -type: enum -desc: "Specifies a launch property id" -name: $x_exp_launch_property_id_t -analogue: - - "**CUlaunchAttributeID**" -etors: - - name: IGNORE - desc: "The property has no effect" - - name: COOPERATIVE - desc: "Whether to launch a cooperative kernel" - - name: CLUSTER_DIMENSION - desc: "work-group cluster dimensions" - - name: WORK_GROUP_MEMORY - desc: "Implicit work group memory allocation" ---- #-------------------------------------------------------------------------- -type: union -desc: "Specifies a launch property value" -name: $x_exp_launch_property_value_t -tag: $x_exp_launch_property_id_t -analogue: - - "**CUlaunchAttributeValue**" -members: - - type: uint32_t[3] - name: clusterDim - desc: "[in] dimensions of the cluster (units of work-group) (x, y, z). Each value must be a divisor of the corresponding global work-size dimension (in units of work-group)." - tag: $X_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION - - type: int - name: cooperative - desc: "[in] non-zero value indicates a cooperative kernel" - tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE - - type: size_t - name: workgroup_mem_size - desc: "[in] non-zero value indicates the amount of work group memory to allocate in bytes" - tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY ---- #-------------------------------------------------------------------------- -type: struct -desc: "Kernel launch property" -name: $x_exp_launch_property_t -analogue: - - "**cuLaunchAttribute**" -members: - - type: $x_exp_launch_property_id_t - name: id - desc: "[in] launch property id" - init: $X_EXP_LAUNCH_PROPERTY_ID_IGNORE - - type: $x_exp_launch_property_value_t - name: value - desc: "[in][tagged_by(id)] launch property value" - init: nullptr ---- #-------------------------------------------------------------------------- -type: function -desc: "Launch kernel with custom launch properties" -class: $xEnqueue -name: KernelLaunchCustomExp 
-ordinal: "0" -analogue: - - "**cuLaunchKernelEx**" -details: - - "Launches the kernel using the specified launch properties" - - "If numPropsInLaunchPropList == 0 then a regular kernel launch is used: `urEnqueueKernelLaunch`" - - "Consult the appropriate adapter driver documentation for details of adapter specific behavior and native error codes that may be returned." -params: - - type: $x_queue_handle_t - name: hQueue - desc: "[in] handle of the queue object" - - type: $x_kernel_handle_t - name: hKernel - desc: "[in] handle of the kernel object" - - type: uint32_t - name: workDim - desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" - - type: "const size_t*" - name: pGlobalWorkOffset - desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - - type: const size_t* - name: pGlobalWorkSize - desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" - - type: const size_t* - name: pLocalWorkSize - desc: "[in][optional] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. If nullptr, the runtime implementation will choose the work-group size." - - type: uint32_t - name: numPropsInLaunchPropList - desc: "[in] size of the launch prop list" - - type: const $x_exp_launch_property_t* - name: launchPropList - desc: "[in][range(0, numPropsInLaunchPropList)] pointer to a list of launch properties" - - type: uint32_t - name: numEventsInWaitList - desc: "[in] size of the event wait list" - - type: const $x_event_handle_t* - name: phEventWaitList - desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. 
" - - type: $x_event_handle_t* - name: phEvent - desc: "[out][optional][alloc] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array." -returns: - - $X_RESULT_SUCCESS - - $X_RESULT_ERROR_UNINITIALIZED - - $X_RESULT_ERROR_DEVICE_LOST - - $X_RESULT_ERROR_ADAPTER_SPECIFIC - - $X_RESULT_ERROR_INVALID_NULL_HANDLE: - - "NULL == hQueue" - - "NULL == hKernel" - - $X_RESULT_ERROR_INVALID_NULL_POINTER: - - "NULL == pGlobalWorkSize" - - "numPropsInLaunchpropList != 0 && launchPropList == NULL" - - $X_RESULT_ERROR_INVALID_QUEUE - - $X_RESULT_ERROR_INVALID_KERNEL - - $X_RESULT_ERROR_INVALID_EVENT - - $X_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - - "phEventWaitList == NULL && numEventsInWaitList > 0" - - "phEventWaitList != NULL && numEventsInWaitList == 0" - - "If event objects in phEventWaitList are not valid events." - - $X_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS: - - "An event in phEventWaitList has $X_EVENT_STATUS_ERROR" - - $X_RESULT_ERROR_INVALID_WORK_DIMENSION - - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE - - $X_RESULT_ERROR_INVALID_VALUE - - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - - $X_RESULT_ERROR_OUT_OF_RESOURCES ---- #-------------------------------------------------------------------------- -type: enum -extend: true -typed_etors: true -desc: "Extension enums to $x_device_info_t to support arch specific launch properties." 
-name: $x_device_info_t -etors: - - name: CLUSTER_LAUNCH_SUPPORT_EXP - value: "0x1111" - desc: "[$x_bool_t] return true if enqueue Cluster Launch is supported" diff --git a/unified-runtime/scripts/core/kernel.yml b/unified-runtime/scripts/core/kernel.yml index ff2bf0b3d0343..957a27c32d548 100644 --- a/unified-runtime/scripts/core/kernel.yml +++ b/unified-runtime/scripts/core/kernel.yml @@ -598,3 +598,34 @@ params: suggested local work size that will contain the result of the query returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE +--- #-------------------------------------------------------------------------- +type: function +desc: "Query the maximum number of work groups for a cooperative kernel" +class: $xKernel +name: SuggestMaxCooperativeGroupCount +params: + - type: $x_kernel_handle_t + name: hKernel + desc: "[in] handle of the kernel object" + - type: $x_device_handle_t + name: hDevice + desc: "[in] handle of the device object" + - type: uint32_t + name: workDim + desc: "[in] number of dimensions, from 1 to 3, to specify the work-group work-items" + - type: "const size_t*" + name: pLocalWorkSize + desc: | + [in] pointer to an array of workDim unsigned values that specify the number of local work-items forming a work-group that will execute the kernel function. + - type: size_t + name: dynamicSharedMemorySize + desc: "[in] size of dynamic shared memory, for each work-group, in bytes, that will be used when the kernel is launched" + - type: "uint32_t*" + name: "pGroupCountRet" + desc: "[out] pointer to maximum number of groups" +returns: + - $X_RESULT_ERROR_INVALID_KERNEL + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: + - "If $X_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT returns a value without the $X_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE bit set." 
+ - $X_RESULT_ERROR_INVALID_WORK_DIMENSION: + - "`workDim < 1 || workDim > 3`" diff --git a/unified-runtime/scripts/core/registry.yml b/unified-runtime/scripts/core/registry.yml index ee1cdefcc13d2..cd3ad5f0905d5 100644 --- a/unified-runtime/scripts/core/registry.yml +++ b/unified-runtime/scripts/core/registry.yml @@ -508,12 +508,6 @@ etors: - name: LOADER_TEAR_DOWN desc: Enumerator for $xLoaderTearDown value: '202' -- name: ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP - desc: Enumerator for $xEnqueueCooperativeKernelLaunchExp - value: '214' -- name: KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP - desc: Enumerator for $xKernelSuggestMaxCooperativeGroupCountExp - value: '215' - name: PROGRAM_GET_GLOBAL_VARIABLE_POINTER desc: Enumerator for $xProgramGetGlobalVariablePointer value: '216' @@ -529,9 +523,6 @@ etors: - name: ENQUEUE_TIMESTAMP_RECORDING_EXP desc: Enumerator for $xEnqueueTimestampRecordingExp value: '223' -- name: ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP - desc: Enumerator for $xEnqueueKernelLaunchCustomExp - value: '224' - name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE desc: Enumerator for $xKernelGetSuggestedLocalWorkSize value: '225' @@ -661,6 +652,9 @@ etors: - name: BINDLESS_IMAGES_FREE_MAPPED_LINEAR_MEMORY_EXP desc: Enumerator for $xBindlessImagesFreeMappedLinearMemoryExp value: '271' +- name: KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT + desc: Enumerator for $xKernelSuggestMaxCooperativeGroupCount + value: '272' --- type: enum desc: Defines structure types diff --git a/unified-runtime/source/adapters/adapter.def.in b/unified-runtime/source/adapters/adapter.def.in index 944952685791c..e41134651c09a 100644 --- a/unified-runtime/source/adapters/adapter.def.in +++ b/unified-runtime/source/adapters/adapter.def.in @@ -9,7 +9,6 @@ EXPORTS urGetEnqueueExpProcAddrTable urGetEventProcAddrTable urGetKernelProcAddrTable - urGetKernelExpProcAddrTable urGetMemProcAddrTable urGetPhysicalMemProcAddrTable urGetPlatformProcAddrTable diff --git 
a/unified-runtime/source/adapters/adapter.map.in b/unified-runtime/source/adapters/adapter.map.in index 5fe52a579de11..c4b71d68d8079 100644 --- a/unified-runtime/source/adapters/adapter.map.in +++ b/unified-runtime/source/adapters/adapter.map.in @@ -9,7 +9,6 @@ urGetEnqueueExpProcAddrTable; urGetEventProcAddrTable; urGetKernelProcAddrTable; - urGetKernelExpProcAddrTable; urGetMemProcAddrTable; urGetPhysicalMemProcAddrTable; urGetPlatformProcAddrTable; diff --git a/unified-runtime/source/adapters/cuda/device.cpp b/unified-runtime/source/adapters/cuda/device.cpp index dc4cb164a6ed4..d692c0341453f 100644 --- a/unified-runtime/source/adapters/cuda/device.cpp +++ b/unified-runtime/source/adapters/cuda/device.cpp @@ -1164,25 +1164,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP: { - int Value = getAttribute(hDevice, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 9; - return ReturnValue(static_cast(Value)); - } case UR_DEVICE_INFO_LOW_POWER_EVENTS_SUPPORT_EXP: return ReturnValue(false); case UR_DEVICE_INFO_USE_NATIVE_ASSERT: return ReturnValue(true); case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP: - return ReturnValue(true); - case UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP: - return ReturnValue(true); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(false); case UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP: return ReturnValue(true); + case UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT: { + auto LaunchPropsSupport = + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE | + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_WORK_GROUP_MEMORY; + if (getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= + 9) { + LaunchPropsSupport |= +
UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION; + } + + return ReturnValue(LaunchPropsSupport); + } default: break; } diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp index bc8d81ae44312..8a07ac5f21cf0 100644 --- a/unified-runtime/source/adapters/cuda/enqueue.cpp +++ b/unified-runtime/source/adapters/cuda/enqueue.cpp @@ -477,48 +477,18 @@ enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent, - /*WorkGroupMemory=*/0); -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - if (pGlobalWorkOffset == nullptr || *pGlobalWorkOffset == 0) { - ur_exp_launch_property_t coop_prop; - coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE; - coop_prop.value.cooperative = 1; - return urEnqueueKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList, - phEvent); - } - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); -} - -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue,
ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_exp_launch_property_t *launchPropList, + const ur_kernel_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { size_t WorkGroupMemory = [&]() -> size_t { - const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if( + const ur_kernel_launch_property_t *WorkGroupMemoryProp = std::find_if( launchPropList, launchPropList + numPropsInLaunchPropList, - [](const ur_exp_launch_property_t &Prop) { - return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY; + [](const ur_kernel_launch_property_t &Prop) { + return Prop.id == UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY; }); if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList) return WorkGroupMemoryProp->value.workgroup_mem_size; @@ -565,12 +535,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { switch (launchPropList[i].id) { - case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: { + case UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE: { auto &attr = launch_attribute.emplace_back(); attr.id = CU_LAUNCH_ATTRIBUTE_IGNORE; break; } - case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { + case UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { auto &attr = launch_attribute.emplace_back(); attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. 
@@ -593,13 +563,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( break; } - case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { + case UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE: { auto &attr = launch_attribute.emplace_back(); attr.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; attr.value.cooperative = launchPropList[i].value.cooperative; break; } - case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: { + case UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: { break; } default: { diff --git a/unified-runtime/source/adapters/cuda/kernel.cpp b/unified-runtime/source/adapters/cuda/kernel.cpp index dd1825e49df53..19ad234b10e45 100644 --- a/unified-runtime/source/adapters/cuda/kernel.cpp +++ b/unified-runtime/source/adapters/cuda/kernel.cpp @@ -187,7 +187,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( ur_kernel_handle_t hKernel, ur_device_handle_t /*hDevice*/, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 5d5ebc37f834d..fa3ae15f1e59a 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -127,6 +127,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; } @@ -429,10 +431,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL 
urGetEnqueueExpProcAddrTable( return result; } - pDdiTable->pfnCooperativeKernelLaunchExp = - urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; - pDdiTable->pfnKernelLaunchCustomExp = urEnqueueKernelLaunchCustomExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnUSMDeviceAllocExp = urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = urEnqueueUSMSharedAllocExp; @@ -443,19 +442,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return UR_RESULT_SUCCESS; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - urKernelSuggestMaxCooperativeGroupCountExp; - - return UR_RESULT_SUCCESS; -} - UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); @@ -492,7 +478,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetEnqueueExpProcAddrTable(version, &pDdiTable->EnqueueExp); urGetEventProcAddrTable(version, &pDdiTable->Event); urGetKernelProcAddrTable(version, &pDdiTable->Kernel); - urGetKernelExpProcAddrTable(version, &pDdiTable->KernelExp); urGetMemProcAddrTable(version, &pDdiTable->Mem); urGetPhysicalMemProcAddrTable(version, &pDdiTable->PhysicalMem); urGetPlatformProcAddrTable(version, &pDdiTable->Platform); diff --git a/unified-runtime/source/adapters/hip/device.cpp b/unified-runtime/source/adapters/hip/device.cpp index 151f4579a165a..457e599397075 100644 --- a/unified-runtime/source/adapters/hip/device.cpp +++ b/unified-runtime/source/adapters/hip/device.cpp @@ -992,7 +992,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t 
hDevice, case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU: case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH: case UR_DEVICE_INFO_IP_VERSION: - case UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP: case UR_DEVICE_INFO_CURRENT_CLOCK_THROTTLE_REASONS: case UR_DEVICE_INFO_FAN_SPEED: case UR_DEVICE_INFO_MIN_POWER_LIMIT: @@ -1040,12 +1039,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(true); case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP: - return ReturnValue(false); - case UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP: - return ReturnValue(true); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT: + return ReturnValue(0); default: break; } diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp index fd5c756608964..04336dd4ee132 100644 --- a/unified-runtime/source/adapters/hip/enqueue.cpp +++ b/unified-runtime/source/adapters/hip/enqueue.cpp @@ -251,13 +251,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { UR_ASSERT(hQueue->getContext() == hKernel->getContext(), UR_RESULT_ERROR_INVALID_QUEUE); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + for (uint32_t propIndex = 0; propIndex < 
numPropsInLaunchPropList; + propIndex++) { + // Adapters that don't support cooperative kernels are currently expected + // to ignore COOPERATIVE launch properties. Ideally we should avoid passing + // these at the SYCL RT level instead, see + // https://github.com/intel/llvm/issues/18421 + if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || + launchPropList[propIndex].id == + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + continue; + } + return UR_RESULT_ERROR_INVALID_OPERATION; + } + // Early exit for zero size range kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -327,16 +343,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); -} - /// Enqueues a wait on the given queue for all events. 
/// See \ref enqueueEventWait /// diff --git a/unified-runtime/source/adapters/hip/kernel.cpp b/unified-runtime/source/adapters/hip/kernel.cpp index f7313df733647..9f9e0d37e4aa7 100644 --- a/unified-runtime/source/adapters/hip/kernel.cpp +++ b/unified-runtime/source/adapters/hip/kernel.cpp @@ -166,7 +166,7 @@ urKernelGetNativeHandle(ur_kernel_handle_t, ur_native_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( ur_kernel_handle_t /*hKernel*/, ur_device_handle_t /*hDevice*/, uint32_t /*workDim*/, const size_t * /*pLocalWorkSize*/, size_t /*dynamicSharedMemorySize*/, uint32_t * /*pGroupCountRet*/) { diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index 5e1a9a1c737c3..8b9543680caa1 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -127,6 +127,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; } @@ -426,8 +428,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return result; } - pDdiTable->pfnCooperativeKernelLaunchExp = - urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; @@ -435,19 +435,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return UR_RESULT_SUCCESS; } -UR_DLLEXPORT 
ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - urKernelSuggestMaxCooperativeGroupCountExp; - - return UR_RESULT_SUCCESS; -} - UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); @@ -484,7 +471,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetEnqueueExpProcAddrTable(version, &pDdiTable->EnqueueExp); urGetEventProcAddrTable(version, &pDdiTable->Event); urGetKernelProcAddrTable(version, &pDdiTable->Kernel); - urGetKernelExpProcAddrTable(version, &pDdiTable->KernelExp); urGetMemProcAddrTable(version, &pDdiTable->Mem); urGetPhysicalMemProcAddrTable(version, &pDdiTable->PhysicalMem); urGetPlatformProcAddrTable(version, &pDdiTable->Platform); diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index da7de39f0bc07..933bc8192d9de 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -1221,10 +1221,6 @@ ur_result_t urDeviceGetInfo( return ReturnValue(false); case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP: - return ReturnValue(false); - case UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP: - return ReturnValue(true); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(true); case UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP: @@ -1337,6 +1333,8 @@ ur_result_t urDeviceGetInfo( return ReturnValue(int32_t{PowerProperties.maxLimit}); } } + case UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT: + return 
ReturnValue(UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE); default: UR_LOG(ERR, "Unsupported ParamName in urGetDeviceInfo"); UR_LOG(ERR, "ParamNameParamName={}(0x{})", ParamName, diff --git a/unified-runtime/source/adapters/level_zero/kernel.cpp b/unified-runtime/source/adapters/level_zero/kernel.cpp index b5bc40c25be19..1acbcf1676b80 100644 --- a/unified-runtime/source/adapters/level_zero/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/kernel.cpp @@ -56,7 +56,7 @@ ur_result_t urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } -ur_result_t urEnqueueKernelLaunch( +inline ur_result_t EnqueueCooperativeKernelLaunchHelper( /// [in] handle of the queue object ur_queue_handle_t Queue, /// [in] handle of the kernel object @@ -89,9 +89,19 @@ ur_result_t urEnqueueKernelLaunch( UR_ASSERT(WorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(WorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - ze_kernel_handle_t ZeKernel{}; - UR_CALL(getZeKernel(Queue->Device->ZeDevice, Kernel, &ZeKernel)); + auto ZeDevice = Queue->Device->ZeDevice; + ze_kernel_handle_t ZeKernel{}; + if (Kernel->ZeKernelMap.empty()) { + ZeKernel = Kernel->ZeKernel; + } else { + auto It = Kernel->ZeKernelMap.find(ZeDevice); + if (It == Kernel->ZeKernelMap.end()) { + /* kernel and queue don't match */ + return UR_RESULT_ERROR_INVALID_QUEUE; + } + ZeKernel = It->second; + } // Lock automatically releases when this goes out of scope. 
std::scoped_lock Lock( Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); @@ -118,9 +128,110 @@ ur_result_t urEnqueueKernelLaunch( ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(Kernel->ZeKernel, Queue->Device, - ZeThreadGroupDimensions, WG, WorkDim, - GlobalWorkSize, LocalWorkSize)); + // New variable needed because GlobalWorkSize parameter might not be of size 3 + size_t GlobalWorkSize3D[3]{1, 1, 1}; + std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); + + if (LocalWorkSize) { + // L0 + for (uint32_t I = 0; I < WorkDim; I++) { + UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits::max)(), + UR_RESULT_ERROR_INVALID_VALUE); + WG[I] = static_cast(LocalWorkSize[I]); + } + } else { + // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize + // values do not fit to 32-bit that the API only supports currently. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize3D[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], + GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find a I-dimension WG size that the GlobalWorkSize[I] is + // fully divisable with. Start with the max possible size in + // each dimension. 
+ uint32_t GroupSize[] = { + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); + while (GlobalWorkSize3D[I] % GroupSize[I]) { + --GroupSize[I]; + } + + if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { + UR_LOG(ERR, + "urEnqueueCooperativeKernelLaunchExp: can't find a WG size " + "suitable for global work size > UINT32_MAX"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + UR_LOG(DEBUG, + "urEnqueueCooperativeKernelLaunchExp: using computed WG " + "size = {{{}, {}, {}}}", + WG[0], WG[1], WG[2]); + } + } + + // TODO: assert if sizes do not fit into 32-bit? + + switch (WorkDim) { + case 3: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize3D[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize3D[1] / WG[1]); + ZeThreadGroupDimensions.groupCountZ = + static_cast(GlobalWorkSize3D[2] / WG[2]); + break; + case 2: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize3D[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize3D[1] / WG[1]); + WG[2] = 1; + break; + case 1: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize3D[0] / WG[0]); + WG[1] = WG[2] = 1; + break; + + default: + UR_LOG(ERR, "urEnqueueCooperativeKernelLaunchExp: unsupported work_dim"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Error handling for non-uniform group size case + if (GlobalWorkSize3D[0] != + size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { + UR_LOG(ERR, + "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. 
The " + "range is not a multiple of the group size in the 1st dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize3D[1] != + size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { + UR_LOG(ERR, + "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The " + "range is not a multiple of the group size in the 2nd dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize3D[2] != + size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { + UR_LOG(DEBUG, + "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The " + "range is not a multiple of the group size in the 3rd dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); @@ -176,7 +287,7 @@ ur_result_t urEnqueueKernelLaunch( ContextsLock.lock(); Queue->CaptureIndirectAccesses(); // Add the command to the command list, which implies submission. - ZE2UR_CALL(zeCommandListAppendLaunchKernel, + ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); } else { @@ -184,12 +295,13 @@ ur_result_t urEnqueueKernelLaunch( // No lock is needed here, unlike the immediate commandlist case above, // because the kernels are not actually submitted yet. Kernels will be // submitted only when the comamndlist is closed. Then, a lock is held. 
- ZE2UR_CALL(zeCommandListAppendLaunchKernel, + ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); } - UR_LOG(DEBUG, "calling zeCommandListAppendLaunchKernel() with ZeEvent {}", + UR_LOG(DEBUG, + "calling zeCommandListAppendLaunchCooperativeKernel() with ZeEvent {}", ur_cast(ZeEvent)); printZeEventList((*Event)->WaitList); @@ -201,7 +313,7 @@ ur_result_t urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } -ur_result_t urEnqueueCooperativeKernelLaunchExp( +ur_result_t urEnqueueKernelLaunch( /// [in] handle of the queue object ur_queue_handle_t Queue, /// [in] handle of the kernel object @@ -221,6 +333,11 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( /// will execute the kernel function. If nullptr, the runtime /// implementation will choose the work-group size. const size_t *LocalWorkSize, + /// [in] size of the launch prop list + uint32_t NumPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *LaunchPropList, /// [in] size of the event wait list uint32_t NumEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -231,22 +348,28 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( /// [in,out][optional] return an event object that identifies this /// particular kernel execution instance. 
ur_event_handle_t *OutEvent) { + for (uint32_t PropIndex = 0; PropIndex < NumPropsInLaunchPropList; + PropIndex++) { + if (LaunchPropList[PropIndex].id == + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && + LaunchPropList[PropIndex].value.cooperative) { + return EnqueueCooperativeKernelLaunchHelper( + Queue, Kernel, WorkDim, GlobalWorkOffset, GlobalWorkSize, + LocalWorkSize, NumEventsInWaitList, EventWaitList, OutEvent); + } + if (LaunchPropList[PropIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && + LaunchPropList[PropIndex].id != + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + // We don't support any other properties. + return UR_RESULT_ERROR_INVALID_OPERATION; + } + } UR_ASSERT(WorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(WorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - auto ZeDevice = Queue->Device->ZeDevice; - ze_kernel_handle_t ZeKernel{}; - if (Kernel->ZeKernelMap.empty()) { - ZeKernel = Kernel->ZeKernel; - } else { - auto It = Kernel->ZeKernelMap.find(ZeDevice); - if (It == Kernel->ZeKernelMap.end()) { - /* kernel and queue don't match */ - return UR_RESULT_ERROR_INVALID_QUEUE; - } - ZeKernel = It->second; - } + UR_CALL(getZeKernel(Queue->Device->ZeDevice, Kernel, &ZeKernel)); + // Lock automatically releases when this goes out of scope. 
std::scoped_lock Lock( Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); @@ -273,110 +396,9 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]{}; - // New variable needed because GlobalWorkSize parameter might not be of size 3 - size_t GlobalWorkSize3D[3]{1, 1, 1}; - std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); - - if (LocalWorkSize) { - // L0 - for (uint32_t I = 0; I < WorkDim; I++) { - UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - WG[I] = static_cast(LocalWorkSize[I]); - } - } else { - // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize - // values do not fit to 32-bit that the API only supports currently. - bool SuggestGroupSize = true; - for (int I : {0, 1, 2}) { - if (GlobalWorkSize3D[I] > UINT32_MAX) { - SuggestGroupSize = false; - } - } - if (SuggestGroupSize) { - ZE2UR_CALL(zeKernelSuggestGroupSize, - (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], - GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); - } else { - for (int I : {0, 1, 2}) { - // Try to find a I-dimension WG size that the GlobalWorkSize[I] is - // fully divisable with. Start with the max possible size in - // each dimension. 
- uint32_t GroupSize[] = { - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; - GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); - while (GlobalWorkSize3D[I] % GroupSize[I]) { - --GroupSize[I]; - } - - if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { - UR_LOG(ERR, - "urEnqueueCooperativeKernelLaunchExp: can't find a WG size " - "suitable for global work size > UINT32_MAX"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - WG[I] = GroupSize[I]; - } - UR_LOG(DEBUG, - "urEnqueueCooperativeKernelLaunchExp: using computed WG " - "size = {{{}, {}, {}}}", - WG[0], WG[1], WG[2]); - } - } - - // TODO: assert if sizes do not fit into 32-bit? - - switch (WorkDim) { - case 3: - ZeThreadGroupDimensions.groupCountX = - static_cast(GlobalWorkSize3D[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - static_cast(GlobalWorkSize3D[1] / WG[1]); - ZeThreadGroupDimensions.groupCountZ = - static_cast(GlobalWorkSize3D[2] / WG[2]); - break; - case 2: - ZeThreadGroupDimensions.groupCountX = - static_cast(GlobalWorkSize3D[0] / WG[0]); - ZeThreadGroupDimensions.groupCountY = - static_cast(GlobalWorkSize3D[1] / WG[1]); - WG[2] = 1; - break; - case 1: - ZeThreadGroupDimensions.groupCountX = - static_cast(GlobalWorkSize3D[0] / WG[0]); - WG[1] = WG[2] = 1; - break; - - default: - UR_LOG(ERR, "urEnqueueCooperativeKernelLaunchExp: unsupported work_dim"); - return UR_RESULT_ERROR_INVALID_VALUE; - } - - // Error handling for non-uniform group size case - if (GlobalWorkSize3D[0] != - size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { - UR_LOG(ERR, - "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. 
The " - "range is not a multiple of the group size in the 1st dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize3D[1] != - size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { - UR_LOG(ERR, - "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The " - "range is not a multiple of the group size in the 2nd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - if (GlobalWorkSize3D[2] != - size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { - UR_LOG(DEBUG, - "urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The " - "range is not a multiple of the group size in the 3rd dimension"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } + UR_CALL(calculateKernelWorkDimensions(Kernel->ZeKernel, Queue->Device, + ZeThreadGroupDimensions, WG, WorkDim, + GlobalWorkSize, LocalWorkSize)); ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); @@ -432,7 +454,7 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( ContextsLock.lock(); Queue->CaptureIndirectAccesses(); // Add the command to the command list, which implies submission. - ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, + ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); } else { @@ -440,13 +462,12 @@ ur_result_t urEnqueueCooperativeKernelLaunchExp( // No lock is needed here, unlike the immediate commandlist case above, // because the kernels are not actually submitted yet. Kernels will be // submitted only when the comamndlist is closed. Then, a lock is held. 
- ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, + ZE2UR_CALL(zeCommandListAppendLaunchKernel, (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); } - UR_LOG(DEBUG, - "calling zeCommandListAppendLaunchCooperativeKernel() with ZeEvent {}", + UR_LOG(DEBUG, "calling zeCommandListAppendLaunchKernel() with ZeEvent {}", ur_cast(ZeEvent)); printZeEventList((*Event)->WaitList); @@ -1094,7 +1115,7 @@ ur_result_t urKernelGetNativeHandle( return UR_RESULT_SUCCESS; } -ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( +ur_result_t urKernelSuggestMaxCooperativeGroupCount( ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { diff --git a/unified-runtime/source/adapters/level_zero/queue.cpp b/unified-runtime/source/adapters/level_zero/queue.cpp index 35bf1a63d84eb..67749029e1caa 100644 --- a/unified-runtime/source/adapters/level_zero/queue.cpp +++ b/unified-runtime/source/adapters/level_zero/queue.cpp @@ -927,20 +927,6 @@ ur_result_t urQueueFlush( return Queue->executeAllOpenCommandLists(); } -ur_result_t urEnqueueKernelLaunchCustomExp( - ur_queue_handle_t /*hQueue*/, ur_kernel_handle_t /*hKernel*/, - uint32_t /*workDim*/, const size_t * /*pGlobalWorkOffset*/, - const size_t * /*pGlobalWorkSize*/, const size_t * /*pLocalWorkSize*/, - uint32_t /*numPropsInLaunchPropList*/, - const ur_exp_launch_property_t * /*launchPropList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - UR_LOG(ERR, "[UR][L0] {} function not implemented!", - "{} function not implemented!", __FUNCTION__); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - } // namespace ur::level_zero // Configuration of the command-list batching. 
diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index 5ccf212db60aa..1cbcbed231eaa 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -234,15 +234,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return result; } - pDdiTable->pfnKernelLaunchCustomExp = - ur::level_zero::urEnqueueKernelLaunchCustomExp; pDdiTable->pfnUSMDeviceAllocExp = ur::level_zero::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur::level_zero::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur::level_zero::urEnqueueUSMHostAllocExp; pDdiTable->pfnUSMFreeExp = ur::level_zero::urEnqueueUSMFreeExp; pDdiTable->pfnCommandBufferExp = ur::level_zero::urEnqueueCommandBufferExp; - pDdiTable->pfnCooperativeKernelLaunchExp = - ur::level_zero::urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = ur::level_zero::urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = ur::level_zero::urEnqueueNativeCommandExp; @@ -296,19 +292,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgMemObj = ur::level_zero::urKernelSetArgMemObj; pDdiTable->pfnSetSpecializationConstants = ur::level_zero::urKernelSetSpecializationConstants; - - return result; -} - -UR_APIEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - ur::level_zero::urKernelSuggestMaxCooperativeGroupCountExp; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + ur::level_zero::urKernelSuggestMaxCooperativeGroupCount; return result; } @@ -609,10 +594,6 @@ ur_result_t populateDdiTable(ur_dditable_t *ddi) { 
&ddi->Kernel); if (result != UR_RESULT_SUCCESS) return result; - result = NAMESPACE_::urGetKernelExpProcAddrTable(UR_API_VERSION_CURRENT, - &ddi->KernelExp); - if (result != UR_RESULT_SUCCESS) - return result; result = NAMESPACE_::urGetMemProcAddrTable(UR_API_VERSION_CURRENT, &ddi->Mem); if (result != UR_RESULT_SUCCESS) return result; diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index f2499c567b71c..d731dea5a8c43 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -296,6 +296,10 @@ ur_result_t urKernelGetSuggestedLocalWorkSize(ur_kernel_handle_t hKernel, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize); +ur_result_t urKernelSuggestMaxCooperativeGroupCount( + ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, + const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, + uint32_t *pGroupCountRet); ur_result_t urQueueGetInfo(ur_queue_handle_t hQueue, ur_queue_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet); @@ -338,8 +342,10 @@ ur_result_t urEventSetCallback(ur_event_handle_t hEvent, ur_result_t urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, @@ -755,25 +761,9 @@ 
urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_result_t urCommandBufferGetNativeHandleExp(ur_exp_command_buffer_handle_t hCommandBuffer, ur_native_handle_t *phNativeCommandBuffer); -ur_result_t urEnqueueCooperativeKernelLaunchExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); -ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( - ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, - const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, - uint32_t *pGroupCountRet); ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); -ur_result_t urEnqueueKernelLaunchCustomExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_exp_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); ur_result_t urProgramBuildExp(ur_program_handle_t hProgram, uint32_t numDevices, ur_device_handle_t *phDevices, const char *pOptions); diff --git a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp index 4fcdadf64ad8d..45768ad45032e 100644 --- a/unified-runtime/source/adapters/level_zero/v2/kernel.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/kernel.cpp @@ -674,7 +674,7 @@ ur_result_t urKernelGetSuggestedLocalWorkSize( return UR_RESULT_SUCCESS; } -ur_result_t urKernelSuggestMaxCooperativeGroupCountExp( +ur_result_t urKernelSuggestMaxCooperativeGroupCount( 
ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, uint32_t workDim, const size_t *pLocalWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index faa09eee3eaa5..d043a68dcaec7 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -59,11 +59,14 @@ ur_result_t urQueueFlush(ur_queue_handle_t hQueue) try { ur_result_t urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) try { return hQueue->get().enqueueKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); + numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + phEventWaitList, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -429,17 +432,6 @@ ur_result_t urEnqueueCommandBufferExp( } catch (...) 
{ return exceptionToResult(std::current_exception()); } -ur_result_t urEnqueueCooperativeKernelLaunchExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { - return hQueue->get().enqueueCooperativeKernelLaunchExp( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); -} catch (...) { - return exceptionToResult(std::current_exception()); -} ur_result_t urEnqueueTimestampRecordingExp( ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { @@ -448,20 +440,6 @@ ur_result_t urEnqueueTimestampRecordingExp( } catch (...) { return exceptionToResult(std::current_exception()); } -ur_result_t urEnqueueKernelLaunchCustomExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_exp_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) try { - return hQueue->get().enqueueKernelLaunchCustomExp( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); -} catch (...) 
{ - return exceptionToResult(std::current_exception()); -} ur_result_t urEnqueueEventsWaitWithBarrierExt( ur_queue_handle_t hQueue, const ur_exp_enqueue_ext_properties_t *pProperties, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 004b51f822c7f..b710f9d56b50d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -30,7 +30,8 @@ struct ur_queue_t_ { virtual ur_result_t enqueueKernelLaunch(ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, const size_t *, uint32_t, - const ur_event_handle_t *, + const ur_kernel_launch_property_t *, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWait(uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; @@ -158,17 +159,9 @@ struct ur_queue_t_ { uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; - virtual ur_result_t enqueueCooperativeKernelLaunchExp( - ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, - const size_t *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) = 0; virtual ur_result_t enqueueTimestampRecordingExp(bool, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) = 0; - virtual ur_result_t enqueueKernelLaunchCustomExp( - ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, - const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, - const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, uint32_t, const ur_event_handle_t *, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 33c05a1402012..385394bd538c2 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ 
b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -189,10 +189,29 @@ ur_queue_immediate_in_order_t::~ur_queue_immediate_in_order_t() { ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch"); + for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; + propIndex++) { + if (launchPropList[propIndex].id == + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && + launchPropList[propIndex].value.cooperative) { + return enqueueCooperativeKernelLaunchHelper( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent); + } + if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && + launchPropList[propIndex].id != + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + // We don't support any other properties. 
+ return UR_RESULT_ERROR_INVALID_OPERATION; + } + } + auto commandListLocked = commandListManager.lock(); UR_CALL(commandListLocked->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, @@ -773,14 +792,11 @@ ur_queue_immediate_in_order_t::bindlessImagesSignalExternalSemaphoreExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( +ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchHelper( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp"); - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -918,17 +934,6 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBufferExp( return UR_RESULT_SUCCESS; } -ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( - ur_kernel_handle_t /*hKernel*/, uint32_t /*workDim*/, - const size_t * /*pGlobalWorkOffset*/, const size_t * /*pGlobalWorkSize*/, - const size_t * /*pLocalWorkSize*/, uint32_t /*numPropsInLaunchPropList*/, - const ur_exp_launch_property_t * /*launchPropList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - ur_result_t ur_queue_immediate_in_order_t::enqueueNativeCommandExp( ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp 
index ef024dd65bd54..fb297369e29de 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -64,6 +64,12 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { void recordSubmittedKernel(ur_kernel_handle_t hKernel); + inline ur_result_t enqueueCooperativeKernelLaunchHelper( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, const ur_queue_properties_t *); @@ -79,13 +85,13 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { ur_native_handle_t *phNativeQueue) override; ur_result_t queueFinish() override; ur_result_t queueFlush() override; - ur_result_t enqueueKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_result_t enqueueKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; @@ -256,12 +262,6 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, 
ur_event_handle_t *phEvent) override; - ur_result_t enqueueCooperativeKernelLaunchExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; ur_result_t enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, @@ -271,13 +271,6 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; - ur_result_t enqueueKernelLaunchCustomExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, - const ur_exp_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, diff --git a/unified-runtime/source/adapters/mock/ur_mock.cpp b/unified-runtime/source/adapters/mock/ur_mock.cpp index 7f85398abcc47..d24861e0989bf 100644 --- a/unified-runtime/source/adapters/mock/ur_mock.cpp +++ b/unified-runtime/source/adapters/mock/ur_mock.cpp @@ -106,7 +106,6 @@ context_t::context_t() { urGetEnqueueExpProcAddrTable(version, &urDdiTable.EnqueueExp); urGetEventProcAddrTable(version, &urDdiTable.Event); urGetKernelProcAddrTable(version, &urDdiTable.Kernel); - urGetKernelExpProcAddrTable(version, &urDdiTable.KernelExp); urGetMemProcAddrTable(version, &urDdiTable.Mem); urGetPhysicalMemProcAddrTable(version, &urDdiTable.PhysicalMem); urGetPlatformProcAddrTable(version, &urDdiTable.Platform); diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp 
b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index a7977e775a38d..b4022a92bffa3 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -4815,6 +4815,71 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount +__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] handle of the device object + ur_device_handle_t hDevice, + /// [in] number of dimensions, from 1 to 3, to specify the work-group + /// work-items + uint32_t workDim, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of local work-items forming a work-group that will execute the + /// kernel function. 
+ const size_t *pLocalWorkSize, + /// [in] size of dynamic shared memory, for each work-group, in bytes, + /// that will be used when the kernel is launched + size_t dynamicSharedMemorySize, + /// [out] pointer to maximum number of groups + uint32_t *pGroupCountRet) try { + ur_result_t result = UR_RESULT_SUCCESS; + + ur_kernel_suggest_max_cooperative_group_count_params_t params = { + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, + &pGroupCountRet}; + + auto beforeCallback = reinterpret_cast( + mock::getCallbacks().get_before_callback( + "urKernelSuggestMaxCooperativeGroupCount")); + if (beforeCallback) { + result = beforeCallback(¶ms); + if (result != UR_RESULT_SUCCESS) { + return result; + } + } + + auto replaceCallback = reinterpret_cast( + mock::getCallbacks().get_replace_callback( + "urKernelSuggestMaxCooperativeGroupCount")); + if (replaceCallback) { + result = replaceCallback(¶ms); + } else { + + result = UR_RESULT_SUCCESS; + } + + if (result != UR_RESULT_SUCCESS) { + return result; + } + + auto afterCallback = reinterpret_cast( + mock::getCallbacks().get_after_callback( + "urKernelSuggestMaxCooperativeGroupCount")); + if (afterCallback) { + return afterCallback(¶ms); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -5595,6 +5660,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. 
const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -5615,6 +5685,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( &pGlobalWorkOffset, &pGlobalWorkSize, &pLocalWorkSize, + &numPropsInLaunchPropList, + &launchPropList, &numEventsInWaitList, &phEventWaitList, &phEvent}; @@ -11107,158 +11179,6 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return exceptionToResult(std::current_exception()); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueCooperativeKernelLaunchExp -__urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. 
- const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) try { - ur_result_t result = UR_RESULT_SUCCESS; - - ur_enqueue_cooperative_kernel_launch_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; - - auto beforeCallback = reinterpret_cast( - mock::getCallbacks().get_before_callback( - "urEnqueueCooperativeKernelLaunchExp")); - if (beforeCallback) { - result = beforeCallback(¶ms); - if (result != UR_RESULT_SUCCESS) { - return result; - } - } - - auto replaceCallback = reinterpret_cast( - mock::getCallbacks().get_replace_callback( - "urEnqueueCooperativeKernelLaunchExp")); - if (replaceCallback) { - result = replaceCallback(¶ms); - } else { - - // optional output handle - if (phEvent) { - *phEvent = mock::createDummyHandle(); - } - result = UR_RESULT_SUCCESS; - } - - if (result != UR_RESULT_SUCCESS) { - return result; - } - - auto afterCallback = reinterpret_cast( - mock::getCallbacks().get_after_callback( - "urEnqueueCooperativeKernelLaunchExp")); - if (afterCallback) { - return afterCallback(¶ms); - } - - return result; -} catch (...) 
{ - return exceptionToResult(std::current_exception()); -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp -__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] handle of the device object - ur_device_handle_t hDevice, - /// [in] number of dimensions, from 1 to 3, to specify the work-group - /// work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of local work-items forming a work-group that will execute the - /// kernel function. - const size_t *pLocalWorkSize, - /// [in] size of dynamic shared memory, for each work-group, in bytes, - /// that will be used when the kernel is launched - size_t dynamicSharedMemorySize, - /// [out] pointer to maximum number of groups - uint32_t *pGroupCountRet) try { - ur_result_t result = UR_RESULT_SUCCESS; - - ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, - &hDevice, - &workDim, - &pLocalWorkSize, - &dynamicSharedMemorySize, - &pGroupCountRet}; - - auto beforeCallback = reinterpret_cast( - mock::getCallbacks().get_before_callback( - "urKernelSuggestMaxCooperativeGroupCountExp")); - if (beforeCallback) { - result = beforeCallback(¶ms); - if (result != UR_RESULT_SUCCESS) { - return result; - } - } - - auto replaceCallback = reinterpret_cast( - mock::getCallbacks().get_replace_callback( - "urKernelSuggestMaxCooperativeGroupCountExp")); - if (replaceCallback) { - result = replaceCallback(¶ms); - } else { - - result = UR_RESULT_SUCCESS; - } - - if (result != UR_RESULT_SUCCESS) { - return result; - } - - auto afterCallback = reinterpret_cast( - mock::getCallbacks().get_after_callback( - "urKernelSuggestMaxCooperativeGroupCountExp")); - if (afterCallback) { - return afterCallback(¶ms); - } - - return result; -} catch 
(...) { - return exceptionToResult(std::current_exception()); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -11328,99 +11248,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( return exceptionToResult(std::current_exception()); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchCustomExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. If nullptr, the runtime implementation - /// will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_exp_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. 
If nullptr, - /// the numEventsInWaitList must be 0, indicating that no wait event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) try { - ur_result_t result = UR_RESULT_SUCCESS; - - ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; - - auto beforeCallback = reinterpret_cast( - mock::getCallbacks().get_before_callback( - "urEnqueueKernelLaunchCustomExp")); - if (beforeCallback) { - result = beforeCallback(¶ms); - if (result != UR_RESULT_SUCCESS) { - return result; - } - } - - auto replaceCallback = reinterpret_cast( - mock::getCallbacks().get_replace_callback( - "urEnqueueKernelLaunchCustomExp")); - if (replaceCallback) { - result = replaceCallback(¶ms); - } else { - - // optional output handle - if (phEvent) { - *phEvent = mock::createDummyHandle(); - } - result = UR_RESULT_SUCCESS; - } - - if (result != UR_RESULT_SUCCESS) { - return result; - } - - auto afterCallback = reinterpret_cast( - mock::getCallbacks().get_after_callback( - "urEnqueueKernelLaunchCustomExp")); - if (afterCallback) { - return afterCallback(¶ms); - } - - return result; -} catch (...) 
{ - return exceptionToResult(std::current_exception()); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -12351,8 +12178,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - pDdiTable->pfnKernelLaunchCustomExp = driver::urEnqueueKernelLaunchCustomExp; - pDdiTable->pfnUSMDeviceAllocExp = driver::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = driver::urEnqueueUSMSharedAllocExp; @@ -12363,9 +12188,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCommandBufferExp = driver::urEnqueueCommandBufferExp; - pDdiTable->pfnCooperativeKernelLaunchExp = - driver::urEnqueueCooperativeKernelLaunchExp; - pDdiTable->pfnTimestampRecordingExp = driver::urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = driver::urEnqueueNativeCommandExp; @@ -12472,34 +12294,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetSpecializationConstants = driver::urKernelSetSpecializationConstants; - return result; -} catch (...) 
{ - return exceptionToResult(std::current_exception()); -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's KernelExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - /// [in] API version requested - ur_api_version_t version, - /// [in,out] pointer to table of DDI function pointers - ur_kernel_exp_dditable_t *pDdiTable) try { - if (nullptr == pDdiTable) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (driver::d_context.version < version) - return UR_RESULT_ERROR_UNSUPPORTED_VERSION; - - ur_result_t result = UR_RESULT_SUCCESS; - - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - driver::urKernelSuggestMaxCooperativeGroupCountExp; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + driver::urKernelSuggestMaxCooperativeGroupCount; return result; } catch (...) 
{ diff --git a/unified-runtime/source/adapters/native_cpu/device.cpp b/unified-runtime/source/adapters/native_cpu/device.cpp index b220b227c6e34..e1f390ff709cf 100644 --- a/unified-runtime/source/adapters/native_cpu/device.cpp +++ b/unified-runtime/source/adapters/native_cpu/device.cpp @@ -444,18 +444,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: return ReturnValue(false); - case UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP: - return ReturnValue(false); - - case UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP: - return ReturnValue(false); - case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: return ReturnValue(true); case UR_DEVICE_INFO_GLOBAL_VARIABLE_SUPPORT: return ReturnValue(false); + case UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT: + return ReturnValue(0); + default: DIE_NO_IMPLEMENTATION; } diff --git a/unified-runtime/source/adapters/native_cpu/enqueue.cpp b/unified-runtime/source/adapters/native_cpu/enqueue.cpp index bff3c0eef7ac9..e70ce53b672da 100644 --- a/unified-runtime/source/adapters/native_cpu/enqueue.cpp +++ b/unified-runtime/source/adapters/native_cpu/enqueue.cpp @@ -66,8 +66,17 @@ static native_cpu::state getResizedState(const native_cpu::NDRDescT &ndr, UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // We don't support any launch properties. 
+ for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; + propIndex++) { + if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + } urEventWait(numEventsInWaitList, phEventWaitList); UR_ASSERT(hQueue, UR_RESULT_ERROR_INVALID_NULL_HANDLE); diff --git a/unified-runtime/source/adapters/native_cpu/kernel.cpp b/unified-runtime/source/adapters/native_cpu/kernel.cpp index 970b184ccb126..500b2c6bcd8a5 100644 --- a/unified-runtime/source/adapters/native_cpu/kernel.cpp +++ b/unified-runtime/source/adapters/native_cpu/kernel.cpp @@ -285,3 +285,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( [[maybe_unused]] size_t *pSuggestedLocalWorkSize) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + [[maybe_unused]] ur_kernel_handle_t hKernel, + [[maybe_unused]] ur_device_handle_t hDevice, + [[maybe_unused]] uint32_t workDim, + [[maybe_unused]] const size_t *pLocalWorkSize, + [[maybe_unused]] size_t dynamicSharedMemorySize, + [[maybe_unused]] uint32_t *pGroupCountRet) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index f06684b814603..1e2417952e661 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -125,6 +125,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; } @@ -409,7 +411,6 @@ UR_DLLEXPORT ur_result_t 
UR_APICALL urGetEnqueueExpProcAddrTable( return result; } - pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; @@ -417,18 +418,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return UR_RESULT_SUCCESS; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr; - - return UR_RESULT_SUCCESS; -} - UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); @@ -465,7 +454,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetEnqueueExpProcAddrTable(version, &pDdiTable->EnqueueExp); urGetEventProcAddrTable(version, &pDdiTable->Event); urGetKernelProcAddrTable(version, &pDdiTable->Kernel); - urGetKernelExpProcAddrTable(version, &pDdiTable->KernelExp); urGetMemProcAddrTable(version, &pDdiTable->Mem); urGetPhysicalMemProcAddrTable(version, &pDdiTable->PhysicalMem); urGetPlatformProcAddrTable(version, &pDdiTable->Platform); diff --git a/unified-runtime/source/adapters/opencl/device.cpp b/unified-runtime/source/adapters/opencl/device.cpp index 169f6e9875f94..063c3f57ad813 100644 --- a/unified-runtime/source/adapters/opencl/device.cpp +++ b/unified-runtime/source/adapters/opencl/device.cpp @@ -1421,12 +1421,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP: return ReturnValue(false); - case UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP: - return ReturnValue(false); - case 
UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP: - return ReturnValue(true); case UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP: - return ReturnValue(false); + return ReturnValue(true); + case UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT: + return ReturnValue(0); // TODO: We can't query to check if these are supported, they will need to be // manually updated if support is ever implemented. case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: @@ -1435,7 +1433,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMMAND_BUFFER_EVENT_SUPPORT_EXP: case UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP: case UR_DEVICE_INFO_LOW_POWER_EVENTS_SUPPORT_EXP: - case UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP: case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: case UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP: case UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP: diff --git a/unified-runtime/source/adapters/opencl/enqueue.cpp b/unified-runtime/source/adapters/opencl/enqueue.cpp index 9314c34c5b2d7..2580ac05346cf 100644 --- a/unified-runtime/source/adapters/opencl/enqueue.cpp +++ b/unified-runtime/source/adapters/opencl/enqueue.cpp @@ -42,8 +42,23 @@ void MapUREventsToCL(uint32_t numEvents, const ur_event_handle_t *UREvents, UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; + propIndex++) { + // Adapters that don't support cooperative kernels are 
currently expected + // to ignore COOPERATIVE launch properties. Ideally we should avoid passing + // these at the SYCL RT level instead, see + // https://github.com/intel/llvm/issues/18421 + if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE || + launchPropList[propIndex].id == + UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE) { + continue; + } + } + std::vector compiledLocalWorksize; if (!pLocalWorkSize) { cl_device_id device = nullptr; @@ -81,16 +96,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); -} - UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { diff --git a/unified-runtime/source/adapters/opencl/kernel.cpp b/unified-runtime/source/adapters/opencl/kernel.cpp index 0be1574d8c084..f0c22a99749a9 100644 --- a/unified-runtime/source/adapters/opencl/kernel.cpp +++ b/unified-runtime/source/adapters/opencl/kernel.cpp @@ -454,7 +454,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( +UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( [[maybe_unused]] ur_kernel_handle_t hKernel, [[maybe_unused]] ur_device_handle_t hDevice, [[maybe_unused]] uint32_t workDim, diff --git 
a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index 93bc854c4affd..af78ce5941e2f 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -127,6 +127,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + urKernelSuggestMaxCooperativeGroupCount; return UR_RESULT_SUCCESS; } @@ -414,8 +416,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return result; } - pDdiTable->pfnCooperativeKernelLaunchExp = - urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = urEnqueueNativeCommandExp; pDdiTable->pfnCommandBufferExp = urEnqueueCommandBufferExp; @@ -423,19 +423,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( return UR_RESULT_SUCCESS; } -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { - auto result = validateProcInputs(version, pDdiTable); - if (UR_RESULT_SUCCESS != result) { - return result; - } - - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - urKernelSuggestMaxCooperativeGroupCountExp; - - return UR_RESULT_SUCCESS; -} - UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { auto result = validateProcInputs(version, pDdiTable); @@ -472,7 +459,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urAllAddrTable(ur_api_version_t version, urGetEnqueueExpProcAddrTable(version, &pDdiTable->EnqueueExp); urGetEventProcAddrTable(version, &pDdiTable->Event); 
urGetKernelProcAddrTable(version, &pDdiTable->Kernel); - urGetKernelExpProcAddrTable(version, &pDdiTable->KernelExp); urGetMemProcAddrTable(version, &pDdiTable->Mem); urGetPhysicalMemProcAddrTable(version, &pDdiTable->PhysicalMem); urGetPlatformProcAddrTable(version, &pDdiTable->Platform); diff --git a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp index cf41e3a1a0a64..8b8bcaeb2847c 100644 --- a/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -484,6 +484,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. If nullptr, the runtime implementation will /// choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -517,8 +522,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numEventsInWaitList, phEventWaitList, - phEvent)); + LaunchInfo.LocalWorkSize.data(), numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent)); UR_CALL(getAsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -1404,58 +1409,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( return UR_RESULT_SUCCESS; } -ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// 
[in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. 
- ur_event_handle_t *phEvent) { - - UR_LOG_L(getContext()->logger, DEBUG, - "==== urEnqueueCooperativeKernelLaunchExp"); - - LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, - pLocalWorkSize, pGlobalWorkOffset, workDim); - UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); - - UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); - - UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numEventsInWaitList, phEventWaitList, - phEvent)); - - UR_CALL(getAsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); - - return UR_RESULT_SUCCESS; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelRetain __urdlllocal ur_result_t UR_APICALL urKernelRetain( @@ -1999,25 +1952,6 @@ __urdlllocal ur_result_t UR_APICALL urGetDeviceProcAddrTable( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's EnqueueExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -__urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( - /// [in,out] pointer to table of DDI function pointers - ur_enqueue_exp_dditable_t *pDdiTable) { - if (nullptr == pDdiTable) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - - pDdiTable->pfnCooperativeKernelLaunchExp = - ur_sanitizer_layer::asan::urEnqueueCooperativeKernelLaunchExp; - return UR_RESULT_SUCCESS; -} - template struct NotSupportedApi; template @@ -2213,11 +2147,6 @@ ur_result_t initAsanDDITable(ur_dditable_t *dditable) { UR_API_VERSION_CURRENT, &dditable->VirtualMem); } - if (UR_RESULT_SUCCESS == result) { - result = ur_sanitizer_layer::asan::urGetEnqueueExpProcAddrTable( - &dditable->EnqueueExp); - } - if 
(result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize ASAN DDI table failed: {}", result); diff --git a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp index 67a5c1276b138..bfb9e1632e7b4 100644 --- a/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/msan/msan_ddi.cpp @@ -411,6 +411,11 @@ ur_result_t urEnqueueKernelLaunch( /// execute the kernel function. If nullptr, the runtime implementation will /// choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -432,8 +437,8 @@ ur_result_t urEnqueueKernelLaunch( UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numEventsInWaitList, phEventWaitList, - phEvent)); + LaunchInfo.LocalWorkSize.data(), numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent)); UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -1314,59 +1319,6 @@ ur_result_t urEnqueueMemUnmap( return UR_RESULT_SUCCESS; } -ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a 
work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. 
- ur_event_handle_t *phEvent) { - - UR_LOG_L(getContext()->logger, DEBUG, - "==== urEnqueueCooperativeKernelLaunchExp"); - - USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), - pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, - workDim); - UR_CALL(LaunchInfo.initialize()); - - UR_CALL(getMsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); - - UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - LaunchInfo.LocalWorkSize.data(), numEventsInWaitList, phEventWaitList, - phEvent)); - - UR_CALL(getMsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); - - return UR_RESULT_SUCCESS; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelRetain ur_result_t urKernelRetain( @@ -1957,25 +1909,6 @@ ur_result_t urCheckVersion(ur_api_version_t version) { return UR_RESULT_SUCCESS; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's EnqueueExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -__urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( - /// [in,out] pointer to table of DDI function pointers - ur_enqueue_exp_dditable_t *pDdiTable) { - if (nullptr == pDdiTable) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - - pDdiTable->pfnCooperativeKernelLaunchExp = - ur_sanitizer_layer::msan::urEnqueueCooperativeKernelLaunchExp; - return UR_RESULT_SUCCESS; -} - } // namespace msan ur_result_t initMsanDDITable(ur_dditable_t *dditable) { @@ -2030,11 +1963,6 @@ ur_result_t initMsanDDITable(ur_dditable_t *dditable) { result = ur_sanitizer_layer::msan::urGetUSMProcAddrTable(&dditable->USM); } - if (UR_RESULT_SUCCESS == result) { - result = ur_sanitizer_layer::msan::urGetEnqueueExpProcAddrTable( - 
&dditable->EnqueueExp); - } - if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize MSAN DDI table failed: {}", result); diff --git a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp index 46c7142688959..f3802f652d614 100644 --- a/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp +++ b/unified-runtime/source/loader/layers/sanitizer/tsan/tsan_ddi.cpp @@ -1127,6 +1127,11 @@ ur_result_t urEnqueueKernelLaunch( /// execute the kernel function. If nullptr, the runtime implementation will /// choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch + /// properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -1145,55 +1150,8 @@ ur_result_t urEnqueueKernelLaunch( UR_CALL(getContext()->urDdiTable.Enqueue.pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent)); - - UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in 
workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - UR_LOG_L(getContext()->logger, DEBUG, - "==== urEnqueueCooperativeKernelLaunchExp"); - - LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue)); - - UR_CALL(getTsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); - - UR_CALL(getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent)); + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent)); UR_CALL(getTsanInterceptor()->postLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -1378,25 +1336,6 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable( return UR_RESULT_SUCCESS; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's EnqueueExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - 
::UR_RESULT_ERROR_INVALID_NULL_POINTER -__urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( - /// [in,out] pointer to table of DDI function pointers - ur_enqueue_exp_dditable_t *pDdiTable) { - if (nullptr == pDdiTable) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - - pDdiTable->pfnCooperativeKernelLaunchExp = - ur_sanitizer_layer::tsan::urEnqueueCooperativeKernelLaunchExp; - return UR_RESULT_SUCCESS; -} - } // namespace tsan ur_result_t initTsanDDITable(ur_dditable_t *dditable) { @@ -1441,11 +1380,6 @@ ur_result_t initTsanDDITable(ur_dditable_t *dditable) { ur_sanitizer_layer::tsan::urGetEnqueueProcAddrTable(&dditable->Enqueue); } - if (UR_RESULT_SUCCESS == result) { - result = ur_sanitizer_layer::tsan::urGetEnqueueExpProcAddrTable( - &dditable->EnqueueExp); - } - if (result != UR_RESULT_SUCCESS) { UR_LOG_L(getContext()->logger, ERR, "Initialize TSAN DDI table failed: {}", result); diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index c7f7f26f1ece2..3a2e22aa40710 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -3983,6 +3983,66 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount +__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] handle of the device object + ur_device_handle_t hDevice, + /// [in] number of dimensions, from 1 to 3, to specify the work-group + /// work-items + uint32_t workDim, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of local work-items forming a work-group that will execute the + /// kernel function. 
+ const size_t *pLocalWorkSize, + /// [in] size of dynamic shared memory, for each work-group, in bytes, + /// that will be used when the kernel is launched + size_t dynamicSharedMemorySize, + /// [out] pointer to maximum number of groups + uint32_t *pGroupCountRet) { + auto pfnSuggestMaxCooperativeGroupCount = + getContext()->urDdiTable.Kernel.pfnSuggestMaxCooperativeGroupCount; + + if (nullptr == pfnSuggestMaxCooperativeGroupCount) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + ur_kernel_suggest_max_cooperative_group_count_params_t params = { + &hKernel, + &hDevice, + &workDim, + &pLocalWorkSize, + &dynamicSharedMemorySize, + &pGroupCountRet}; + uint64_t instance = getContext()->notify_begin( + UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT, + "urKernelSuggestMaxCooperativeGroupCount", ¶ms); + + auto &logger = getContext()->logger; + UR_LOG_L(logger, INFO, " ---> urKernelSuggestMaxCooperativeGroupCount\n"); + + ur_result_t result = pfnSuggestMaxCooperativeGroupCount( + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); + + getContext()->notify_end( + UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT, + "urKernelSuggestMaxCooperativeGroupCount", ¶ms, &result, instance); + + if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT, + ¶ms); + UR_LOG_L(logger, INFO, + " <--- urKernelSuggestMaxCooperativeGroupCount({}) -> {};\n", + args_str.str(), result); + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -4616,6 +4676,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. 
const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -4639,6 +4704,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( &pGlobalWorkOffset, &pGlobalWorkSize, &pLocalWorkSize, + &numPropsInLaunchPropList, + &launchPropList, &numEventsInWaitList, &phEventWaitList, &phEvent}; @@ -4650,7 +4717,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_result_t result = pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, "urEnqueueKernelLaunch", ¶ms, &result, instance); @@ -9390,144 +9458,6 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueCooperativeKernelLaunchExp -__urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will 
execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - auto pfnCooperativeKernelLaunchExp = - getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp; - - if (nullptr == pfnCooperativeKernelLaunchExp) - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - - ur_enqueue_cooperative_kernel_launch_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; - uint64_t instance = getContext()->notify_begin( - UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, - "urEnqueueCooperativeKernelLaunchExp", ¶ms); - - auto &logger = getContext()->logger; - UR_LOG_L(logger, INFO, " ---> urEnqueueCooperativeKernelLaunchExp\n"); - - ur_result_t result = pfnCooperativeKernelLaunchExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); - - getContext()->notify_end(UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, - "urEnqueueCooperativeKernelLaunchExp", ¶ms, - &result, instance); - - 
if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, ¶ms); - UR_LOG_L(logger, INFO, - " <--- urEnqueueCooperativeKernelLaunchExp({}) -> {};\n", - args_str.str(), result); - } - - return result; -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp -__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] handle of the device object - ur_device_handle_t hDevice, - /// [in] number of dimensions, from 1 to 3, to specify the work-group - /// work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of local work-items forming a work-group that will execute the - /// kernel function. - const size_t *pLocalWorkSize, - /// [in] size of dynamic shared memory, for each work-group, in bytes, - /// that will be used when the kernel is launched - size_t dynamicSharedMemorySize, - /// [out] pointer to maximum number of groups - uint32_t *pGroupCountRet) { - auto pfnSuggestMaxCooperativeGroupCountExp = - getContext()->urDdiTable.KernelExp.pfnSuggestMaxCooperativeGroupCountExp; - - if (nullptr == pfnSuggestMaxCooperativeGroupCountExp) - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - - ur_kernel_suggest_max_cooperative_group_count_exp_params_t params = { - &hKernel, - &hDevice, - &workDim, - &pLocalWorkSize, - &dynamicSharedMemorySize, - &pGroupCountRet}; - uint64_t instance = getContext()->notify_begin( - UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, - "urKernelSuggestMaxCooperativeGroupCountExp", ¶ms); - - auto &logger = getContext()->logger; - UR_LOG_L(logger, INFO, - " ---> urKernelSuggestMaxCooperativeGroupCountExp\n"); - - ur_result_t result = 
pfnSuggestMaxCooperativeGroupCountExp( - hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, - pGroupCountRet); - - getContext()->notify_end( - UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, - "urKernelSuggestMaxCooperativeGroupCountExp", ¶ms, &result, instance); - - if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, - ¶ms); - UR_LOG_L(logger, INFO, - " <--- urKernelSuggestMaxCooperativeGroupCountExp({}) -> {};\n", - args_str.str(), result); - } - - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -9589,90 +9519,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchCustomExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. 
If nullptr, the runtime implementation - /// will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_exp_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If nullptr, - /// the numEventsInWaitList must be 0, indicating that no wait event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - auto pfnKernelLaunchCustomExp = - getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp; - - if (nullptr == pfnKernelLaunchCustomExp) - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - - ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, - &hKernel, - &workDim, - &pGlobalWorkOffset, - &pGlobalWorkSize, - &pLocalWorkSize, - &numPropsInLaunchPropList, - &launchPropList, - &numEventsInWaitList, - &phEventWaitList, - &phEvent}; - uint64_t instance = - getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, - "urEnqueueKernelLaunchCustomExp", ¶ms); - - auto &logger = getContext()->logger; - UR_LOG_L(logger, INFO, " ---> urEnqueueKernelLaunchCustomExp\n"); - - ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); - - getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, - "urEnqueueKernelLaunchCustomExp", ¶ms, &result, - instance); 
- - if (logger.getLevel() <= UR_LOGGER_LEVEL_INFO) { - std::ostringstream args_str; - ur::extras::printFunctionParams( - args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, ¶ms); - UR_LOG_L(logger, INFO, - " <--- urEnqueueKernelLaunchCustomExp({}) -> {};\n", - args_str.str(), result); - } - - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -10642,10 +10488,6 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - dditable.pfnKernelLaunchCustomExp = pDdiTable->pfnKernelLaunchCustomExp; - pDdiTable->pfnKernelLaunchCustomExp = - ur_tracing_layer::urEnqueueKernelLaunchCustomExp; - dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_tracing_layer::urEnqueueUSMDeviceAllocExp; @@ -10663,11 +10505,6 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( dditable.pfnCommandBufferExp = pDdiTable->pfnCommandBufferExp; pDdiTable->pfnCommandBufferExp = ur_tracing_layer::urEnqueueCommandBufferExp; - dditable.pfnCooperativeKernelLaunchExp = - pDdiTable->pfnCooperativeKernelLaunchExp; - pDdiTable->pfnCooperativeKernelLaunchExp = - ur_tracing_layer::urEnqueueCooperativeKernelLaunchExp; - dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; pDdiTable->pfnTimestampRecordingExp = ur_tracing_layer::urEnqueueTimestampRecordingExp; @@ -10809,38 +10646,10 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetSpecializationConstants = ur_tracing_layer::urKernelSetSpecializationConstants; - return result; -} -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's KernelExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - 
::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION -__urdlllocal ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - /// [in] API version requested - ur_api_version_t version, - /// [in,out] pointer to table of DDI function pointers - ur_kernel_exp_dditable_t *pDdiTable) { - auto &dditable = ur_tracing_layer::getContext()->urDdiTable.KernelExp; - - if (nullptr == pDdiTable) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (UR_MAJOR_VERSION(ur_tracing_layer::getContext()->version) != - UR_MAJOR_VERSION(version) || - UR_MINOR_VERSION(ur_tracing_layer::getContext()->version) > - UR_MINOR_VERSION(version)) - return UR_RESULT_ERROR_UNSUPPORTED_VERSION; - - ur_result_t result = UR_RESULT_SUCCESS; - - dditable.pfnSuggestMaxCooperativeGroupCountExp = - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp; - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - ur_tracing_layer::urKernelSuggestMaxCooperativeGroupCountExp; + dditable.pfnSuggestMaxCooperativeGroupCount = + pDdiTable->pfnSuggestMaxCooperativeGroupCount; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + ur_tracing_layer::urKernelSuggestMaxCooperativeGroupCount; return result; } @@ -11533,11 +11342,6 @@ ur_result_t context_t::init(ur_dditable_t *dditable, &dditable->Kernel); } - if (UR_RESULT_SUCCESS == result) { - result = ur_tracing_layer::urGetKernelExpProcAddrTable( - UR_API_VERSION_CURRENT, &dditable->KernelExp); - } - if (UR_RESULT_SUCCESS == result) { result = ur_tracing_layer::urGetMemProcAddrTable(UR_API_VERSION_CURRENT, &dditable->Mem); diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index d0c29b0987d77..36a3afb753fe2 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -3894,6 +3894,66 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; } 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount +__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] handle of the device object + ur_device_handle_t hDevice, + /// [in] number of dimensions, from 1 to 3, to specify the work-group + /// work-items + uint32_t workDim, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of local work-items forming a work-group that will execute the + /// kernel function. + const size_t *pLocalWorkSize, + /// [in] size of dynamic shared memory, for each work-group, in bytes, + /// that will be used when the kernel is launched + size_t dynamicSharedMemorySize, + /// [out] pointer to maximum number of groups + uint32_t *pGroupCountRet) { + auto pfnSuggestMaxCooperativeGroupCount = + getContext()->urDdiTable.Kernel.pfnSuggestMaxCooperativeGroupCount; + + if (nullptr == pfnSuggestMaxCooperativeGroupCount) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (getContext()->enableParameterValidation) { + if (NULL == pLocalWorkSize) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == pGroupCountRet) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + + if (NULL == hKernel) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (NULL == hDevice) + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + + if (workDim < 1 || workDim > 3) + return UR_RESULT_ERROR_INVALID_WORK_DIMENSION; + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hKernel)) { + URLOG_CTX_INVALID_REFERENCE(hKernel); + } + + if (getContext()->enableLifetimeValidation && + !getContext()->refCountContext->isReferenceValid(hDevice)) { + URLOG_CTX_INVALID_REFERENCE(hDevice); + } + + ur_result_t result = pfnSuggestMaxCooperativeGroupCount( + hKernel, hDevice, workDim, pLocalWorkSize, 
dynamicSharedMemorySize, + pGroupCountRet); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -4489,6 +4549,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -4547,7 +4612,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_result_t result = pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && phEvent) { @@ -10153,154 +10219,6 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueCooperativeKernelLaunchExp -__urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a 
work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. 
- ur_event_handle_t *phEvent) { - auto pfnCooperativeKernelLaunchExp = - getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp; - - if (nullptr == pfnCooperativeKernelLaunchExp) { - return UR_RESULT_ERROR_UNINITIALIZED; - } - - if (getContext()->enableParameterValidation) { - if (NULL == pGlobalWorkOffset) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == pGlobalWorkSize) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == hQueue) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - - if (NULL == hKernel) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - - if (phEventWaitList == NULL && numEventsInWaitList > 0) - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - - if (phEventWaitList != NULL && numEventsInWaitList == 0) - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - - if (phEventWaitList != NULL && numEventsInWaitList > 0) { - for (uint32_t i = 0; i < numEventsInWaitList; ++i) { - if (phEventWaitList[i] == NULL) { - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - } - } - } - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hQueue)) { - URLOG_CTX_INVALID_REFERENCE(hQueue); - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hKernel)) { - URLOG_CTX_INVALID_REFERENCE(hKernel); - } - - ur_result_t result = pfnCooperativeKernelLaunchExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && - phEvent) { - getContext()->refCountContext->createRefCount(*phEvent); - } - - return result; -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp -__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - /// [in] handle of the kernel object - 
ur_kernel_handle_t hKernel, - /// [in] handle of the device object - ur_device_handle_t hDevice, - /// [in] number of dimensions, from 1 to 3, to specify the work-group - /// work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of local work-items forming a work-group that will execute the - /// kernel function. - const size_t *pLocalWorkSize, - /// [in] size of dynamic shared memory, for each work-group, in bytes, - /// that will be used when the kernel is launched - size_t dynamicSharedMemorySize, - /// [out] pointer to maximum number of groups - uint32_t *pGroupCountRet) { - auto pfnSuggestMaxCooperativeGroupCountExp = - getContext()->urDdiTable.KernelExp.pfnSuggestMaxCooperativeGroupCountExp; - - if (nullptr == pfnSuggestMaxCooperativeGroupCountExp) { - return UR_RESULT_ERROR_UNINITIALIZED; - } - - if (getContext()->enableParameterValidation) { - if (NULL == pLocalWorkSize) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == pGroupCountRet) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == hKernel) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - - if (NULL == hDevice) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hKernel)) { - URLOG_CTX_INVALID_REFERENCE(hKernel); - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hDevice)) { - URLOG_CTX_INVALID_REFERENCE(hDevice); - } - - ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( - hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, - pGroupCountRet); - - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -10373,99 +10291,6 @@ __urdlllocal ur_result_t UR_APICALL 
urEnqueueTimestampRecordingExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchCustomExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. If nullptr, the runtime implementation - /// will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_exp_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If nullptr, - /// the numEventsInWaitList must be 0, indicating that no wait event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. 
- ur_event_handle_t *phEvent) { - auto pfnKernelLaunchCustomExp = - getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp; - - if (nullptr == pfnKernelLaunchCustomExp) { - return UR_RESULT_ERROR_UNINITIALIZED; - } - - if (getContext()->enableParameterValidation) { - if (NULL == pGlobalWorkOffset) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == pGlobalWorkSize) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == launchPropList) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (NULL == hQueue) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - - if (NULL == hKernel) - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - - if (phEventWaitList != NULL && numEventsInWaitList > 0) { - for (uint32_t i = 0; i < numEventsInWaitList; ++i) { - if (phEventWaitList[i] == NULL) { - return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; - } - } - } - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hQueue)) { - URLOG_CTX_INVALID_REFERENCE(hQueue); - } - - if (getContext()->enableLifetimeValidation && - !getContext()->refCountContext->isReferenceValid(hKernel)) { - URLOG_CTX_INVALID_REFERENCE(hKernel); - } - - ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); - - if (getContext()->enableLeakChecking && result == UR_RESULT_SUCCESS && - phEvent) { - getContext()->refCountContext->createRefCount(*phEvent); - } - - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -11452,10 +11277,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - dditable.pfnKernelLaunchCustomExp = pDdiTable->pfnKernelLaunchCustomExp; - 
pDdiTable->pfnKernelLaunchCustomExp = - ur_validation_layer::urEnqueueKernelLaunchCustomExp; - dditable.pfnUSMDeviceAllocExp = pDdiTable->pfnUSMDeviceAllocExp; pDdiTable->pfnUSMDeviceAllocExp = ur_validation_layer::urEnqueueUSMDeviceAllocExp; @@ -11474,11 +11295,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCommandBufferExp = ur_validation_layer::urEnqueueCommandBufferExp; - dditable.pfnCooperativeKernelLaunchExp = - pDdiTable->pfnCooperativeKernelLaunchExp; - pDdiTable->pfnCooperativeKernelLaunchExp = - ur_validation_layer::urEnqueueCooperativeKernelLaunchExp; - dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; pDdiTable->pfnTimestampRecordingExp = ur_validation_layer::urEnqueueTimestampRecordingExp; @@ -11623,39 +11439,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetSpecializationConstants = ur_validation_layer::urKernelSetSpecializationConstants; - return result; -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's KernelExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - /// [in] API version requested - ur_api_version_t version, - /// [in,out] pointer to table of DDI function pointers - ur_kernel_exp_dditable_t *pDdiTable) { - auto &dditable = ur_validation_layer::getContext()->urDdiTable.KernelExp; - - if (nullptr == pDdiTable) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (UR_MAJOR_VERSION(ur_validation_layer::getContext()->version) != - UR_MAJOR_VERSION(version) || - UR_MINOR_VERSION(ur_validation_layer::getContext()->version) > - UR_MINOR_VERSION(version)) - return UR_RESULT_ERROR_UNSUPPORTED_VERSION; - - ur_result_t result = UR_RESULT_SUCCESS; - - 
dditable.pfnSuggestMaxCooperativeGroupCountExp = - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp; - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - ur_validation_layer::urKernelSuggestMaxCooperativeGroupCountExp; + dditable.pfnSuggestMaxCooperativeGroupCount = + pDdiTable->pfnSuggestMaxCooperativeGroupCount; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + ur_validation_layer::urKernelSuggestMaxCooperativeGroupCount; return result; } @@ -12380,11 +12167,6 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Kernel); } - if (UR_RESULT_SUCCESS == result) { - result = ur_validation_layer::urGetKernelExpProcAddrTable( - UR_API_VERSION_CURRENT, &dditable->KernelExp); - } - if (UR_RESULT_SUCCESS == result) { result = ur_validation_layer::urGetMemProcAddrTable(UR_API_VERSION_CURRENT, &dditable->Mem); diff --git a/unified-runtime/source/loader/loader.def.in b/unified-runtime/source/loader/loader.def.in index d3d0704f7ca17..365790fd51fe1 100644 --- a/unified-runtime/source/loader/loader.def.in +++ b/unified-runtime/source/loader/loader.def.in @@ -69,14 +69,12 @@ EXPORTS urDeviceRetain urDeviceSelectBinary urEnqueueCommandBufferExp - urEnqueueCooperativeKernelLaunchExp urEnqueueDeviceGlobalVariableRead urEnqueueDeviceGlobalVariableWrite urEnqueueEventsWait urEnqueueEventsWaitWithBarrier urEnqueueEventsWaitWithBarrierExt urEnqueueKernelLaunch - urEnqueueKernelLaunchCustomExp urEnqueueMemBufferCopy urEnqueueMemBufferCopyRect urEnqueueMemBufferFill @@ -120,7 +118,6 @@ EXPORTS urGetEnqueueProcAddrTable urGetEventProcAddrTable urGetGlobalProcAddrTable - urGetKernelExpProcAddrTable urGetKernelProcAddrTable urGetMemProcAddrTable urGetPhysicalMemProcAddrTable @@ -149,7 +146,7 @@ EXPORTS urKernelSetArgValue urKernelSetExecInfo urKernelSetSpecializationConstants - urKernelSuggestMaxCooperativeGroupCountExp + urKernelSuggestMaxCooperativeGroupCount urLoaderConfigCreate urLoaderConfigEnableLayer urLoaderConfigGetInfo @@ -280,13 +277,11 @@ 
EXPORTS urPrintDeviceType urPrintDeviceUsmAccessCapabilityFlags urPrintEnqueueCommandBufferExpParams - urPrintEnqueueCooperativeKernelLaunchExpParams urPrintEnqueueDeviceGlobalVariableReadParams urPrintEnqueueDeviceGlobalVariableWriteParams urPrintEnqueueEventsWaitParams urPrintEnqueueEventsWaitWithBarrierExtParams urPrintEnqueueEventsWaitWithBarrierParams - urPrintEnqueueKernelLaunchCustomExpParams urPrintEnqueueKernelLaunchParams urPrintEnqueueMemBufferCopyParams urPrintEnqueueMemBufferCopyRectParams @@ -348,8 +343,6 @@ EXPORTS urPrintExpImageCopyFlags urPrintExpImageCopyRegion urPrintExpImageMemType - urPrintExpLaunchProperty - urPrintExpLaunchPropertyId urPrintExpPeerInfo urPrintExpSamplerAddrModes urPrintExpSamplerCubemapFilterMode @@ -380,6 +373,9 @@ EXPORTS urPrintKernelGetSuggestedLocalWorkSizeParams urPrintKernelGroupInfo urPrintKernelInfo + urPrintKernelLaunchPropertiesSupportFlags + urPrintKernelLaunchProperty + urPrintKernelLaunchPropertyId urPrintKernelNativeProperties urPrintKernelReleaseParams urPrintKernelRetainParams @@ -391,7 +387,7 @@ EXPORTS urPrintKernelSetExecInfoParams urPrintKernelSetSpecializationConstantsParams urPrintKernelSubGroupInfo - urPrintKernelSuggestMaxCooperativeGroupCountExpParams + urPrintKernelSuggestMaxCooperativeGroupCountParams urPrintLoaderConfigCreateParams urPrintLoaderConfigEnableLayerParams urPrintLoaderConfigGetInfoParams diff --git a/unified-runtime/source/loader/loader.map.in b/unified-runtime/source/loader/loader.map.in index 771188a80bbbf..0d44f8caabcb7 100644 --- a/unified-runtime/source/loader/loader.map.in +++ b/unified-runtime/source/loader/loader.map.in @@ -69,14 +69,12 @@ urDeviceRetain; urDeviceSelectBinary; urEnqueueCommandBufferExp; - urEnqueueCooperativeKernelLaunchExp; urEnqueueDeviceGlobalVariableRead; urEnqueueDeviceGlobalVariableWrite; urEnqueueEventsWait; urEnqueueEventsWaitWithBarrier; urEnqueueEventsWaitWithBarrierExt; urEnqueueKernelLaunch; - urEnqueueKernelLaunchCustomExp; 
urEnqueueMemBufferCopy; urEnqueueMemBufferCopyRect; urEnqueueMemBufferFill; @@ -120,7 +118,6 @@ urGetEnqueueProcAddrTable; urGetEventProcAddrTable; urGetGlobalProcAddrTable; - urGetKernelExpProcAddrTable; urGetKernelProcAddrTable; urGetMemProcAddrTable; urGetPhysicalMemProcAddrTable; @@ -149,7 +146,7 @@ urKernelSetArgValue; urKernelSetExecInfo; urKernelSetSpecializationConstants; - urKernelSuggestMaxCooperativeGroupCountExp; + urKernelSuggestMaxCooperativeGroupCount; urLoaderConfigCreate; urLoaderConfigEnableLayer; urLoaderConfigGetInfo; @@ -280,13 +277,11 @@ urPrintDeviceType; urPrintDeviceUsmAccessCapabilityFlags; urPrintEnqueueCommandBufferExpParams; - urPrintEnqueueCooperativeKernelLaunchExpParams; urPrintEnqueueDeviceGlobalVariableReadParams; urPrintEnqueueDeviceGlobalVariableWriteParams; urPrintEnqueueEventsWaitParams; urPrintEnqueueEventsWaitWithBarrierExtParams; urPrintEnqueueEventsWaitWithBarrierParams; - urPrintEnqueueKernelLaunchCustomExpParams; urPrintEnqueueKernelLaunchParams; urPrintEnqueueMemBufferCopyParams; urPrintEnqueueMemBufferCopyRectParams; @@ -348,8 +343,6 @@ urPrintExpImageCopyFlags; urPrintExpImageCopyRegion; urPrintExpImageMemType; - urPrintExpLaunchProperty; - urPrintExpLaunchPropertyId; urPrintExpPeerInfo; urPrintExpSamplerAddrModes; urPrintExpSamplerCubemapFilterMode; @@ -380,6 +373,9 @@ urPrintKernelGetSuggestedLocalWorkSizeParams; urPrintKernelGroupInfo; urPrintKernelInfo; + urPrintKernelLaunchPropertiesSupportFlags; + urPrintKernelLaunchProperty; + urPrintKernelLaunchPropertyId; urPrintKernelNativeProperties; urPrintKernelReleaseParams; urPrintKernelRetainParams; @@ -391,7 +387,7 @@ urPrintKernelSetExecInfoParams; urPrintKernelSetSpecializationConstantsParams; urPrintKernelSubGroupInfo; - urPrintKernelSuggestMaxCooperativeGroupCountExpParams; + urPrintKernelSuggestMaxCooperativeGroupCountParams; urPrintLoaderConfigCreateParams; urPrintLoaderConfigEnableLayerParams; urPrintLoaderConfigGetInfoParams; diff --git 
a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 3cf10f747db6b..8645ec16532b1 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -2206,6 +2206,39 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( pSuggestedLocalWorkSize); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCount +__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] handle of the device object + ur_device_handle_t hDevice, + /// [in] number of dimensions, from 1 to 3, to specify the work-group + /// work-items + uint32_t workDim, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of local work-items forming a work-group that will execute the + /// kernel function. + const size_t *pLocalWorkSize, + /// [in] size of dynamic shared memory, for each work-group, in bytes, + /// that will be used when the kernel is launched + size_t dynamicSharedMemorySize, + /// [out] pointer to maximum number of groups + uint32_t *pGroupCountRet) { + + auto *dditable = *reinterpret_cast(hKernel); + + auto *pfnSuggestMaxCooperativeGroupCount = + dditable->Kernel.pfnSuggestMaxCooperativeGroupCount; + if (nullptr == pfnSuggestMaxCooperativeGroupCount) + return UR_RESULT_ERROR_UNINITIALIZED; + + // forward to device-platform + return pfnSuggestMaxCooperativeGroupCount( + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -2550,6 +2583,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. 
/// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -2571,8 +2609,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( // forward to device-platform return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, - phEventWaitList, phEvent); + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); } /////////////////////////////////////////////////////////////////////////////// @@ -5348,87 +5387,6 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return pfnGetNativeHandleExp(hCommandBuffer, phNativeCommandBuffer); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueCooperativeKernelLaunchExp -__urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an 
array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - - auto *dditable = *reinterpret_cast(hQueue); - - auto *pfnCooperativeKernelLaunchExp = - dditable->EnqueueExp.pfnCooperativeKernelLaunchExp; - if (nullptr == pfnCooperativeKernelLaunchExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - // forward to device-platform - return pfnCooperativeKernelLaunchExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urKernelSuggestMaxCooperativeGroupCountExp -__urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] handle of the device object - ur_device_handle_t hDevice, - /// [in] number of dimensions, from 1 to 3, to specify the work-group - /// work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of local work-items forming a work-group that will execute the - /// kernel function. 
- const size_t *pLocalWorkSize, - /// [in] size of dynamic shared memory, for each work-group, in bytes, - /// that will be used when the kernel is launched - size_t dynamicSharedMemorySize, - /// [out] pointer to maximum number of groups - uint32_t *pGroupCountRet) { - - auto *dditable = *reinterpret_cast(hKernel); - - auto *pfnSuggestMaxCooperativeGroupCountExp = - dditable->KernelExp.pfnSuggestMaxCooperativeGroupCountExp; - if (nullptr == pfnSuggestMaxCooperativeGroupCountExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - // forward to device-platform - return pfnSuggestMaxCooperativeGroupCountExp( - hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, - pGroupCountRet); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urEnqueueTimestampRecordingExp __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( @@ -5469,59 +5427,6 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( phEventWaitList, phEvent); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Intercept function for urEnqueueKernelLaunchCustomExp -__urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a 
work-group that will - /// execute the kernel function. If nullptr, the runtime implementation - /// will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_exp_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If nullptr, - /// the numEventsInWaitList must be 0, indicating that no wait event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - - auto *dditable = *reinterpret_cast(hQueue); - - auto *pfnKernelLaunchCustomExp = - dditable->EnqueueExp.pfnKernelLaunchCustomExp; - if (nullptr == pfnKernelLaunchCustomExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - // forward to device-platform - return pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6265,15 +6170,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( if (ur_loader::getContext()->platforms.size() != 1 || ur_loader::getContext()->forceIntercept) { // return pointers to loader's DDIs - pDdiTable->pfnKernelLaunchCustomExp = - ur_loader::urEnqueueKernelLaunchCustomExp; pDdiTable->pfnUSMDeviceAllocExp = 
ur_loader::urEnqueueUSMDeviceAllocExp; pDdiTable->pfnUSMSharedAllocExp = ur_loader::urEnqueueUSMSharedAllocExp; pDdiTable->pfnUSMHostAllocExp = ur_loader::urEnqueueUSMHostAllocExp; pDdiTable->pfnUSMFreeExp = ur_loader::urEnqueueUSMFreeExp; pDdiTable->pfnCommandBufferExp = ur_loader::urEnqueueCommandBufferExp; - pDdiTable->pfnCooperativeKernelLaunchExp = - ur_loader::urEnqueueCooperativeKernelLaunchExp; pDdiTable->pfnTimestampRecordingExp = ur_loader::urEnqueueTimestampRecordingExp; pDdiTable->pfnNativeCommandExp = ur_loader::urEnqueueNativeCommandExp; @@ -6408,6 +6309,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgMemObj = ur_loader::urKernelSetArgMemObj; pDdiTable->pfnSetSpecializationConstants = ur_loader::urKernelSetSpecializationConstants; + pDdiTable->pfnSuggestMaxCooperativeGroupCount = + ur_loader::urKernelSuggestMaxCooperativeGroupCount; } else { // return pointers directly to platform's DDIs *pDdiTable = ur_loader::getContext()->platforms.front().dditable.Kernel; @@ -6417,60 +6320,6 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's KernelExp table -/// with current process' addresses -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION -UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( - /// [in] API version requested - ur_api_version_t version, - /// [in,out] pointer to table of DDI function pointers - ur_kernel_exp_dditable_t *pDdiTable) { - if (nullptr == pDdiTable) - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - - if (ur_loader::getContext()->version < version) - return UR_RESULT_ERROR_UNSUPPORTED_VERSION; - - ur_result_t result = UR_RESULT_SUCCESS; - - // Load the device-platform DDI tables - for (auto 
&platform : ur_loader::getContext()->platforms) { - // statically linked adapter inside of the loader - if (platform.handle == nullptr) - continue; - - if (platform.initStatus != UR_RESULT_SUCCESS) - continue; - auto getTable = reinterpret_cast( - ur_loader::LibLoader::getFunctionPtr(platform.handle.get(), - "urGetKernelExpProcAddrTable")); - if (!getTable) - continue; - platform.initStatus = getTable(version, &platform.dditable.KernelExp); - } - - if (UR_RESULT_SUCCESS == result) { - if (ur_loader::getContext()->platforms.size() != 1 || - ur_loader::getContext()->forceIntercept) { - // return pointers to loader's DDIs - pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = - ur_loader::urKernelSuggestMaxCooperativeGroupCountExp; - } else { - // return pointers directly to platform's DDIs - *pDdiTable = - ur_loader::getContext()->platforms.front().dditable.KernelExp; - } - } - - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Mem table /// with current process' addresses diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index 3fcd7462206bd..64a982e8b3c73 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -4404,6 +4404,57 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query the maximum number of work groups for a cooperative kernel +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` +/// + `NULL == pGroupCountRet` 
+/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If ::UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT returns a +/// value without the +/// ::UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE bit set. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `workDim < 1 || workDim > 3` +ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] handle of the device object + ur_device_handle_t hDevice, + /// [in] number of dimensions, from 1 to 3, to specify the work-group + /// work-items + uint32_t workDim, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of local work-items forming a work-group that will execute the + /// kernel function. + const size_t *pLocalWorkSize, + /// [in] size of dynamic shared memory, for each work-group, in bytes, + /// that will be used when the kernel is launched + size_t dynamicSharedMemorySize, + /// [out] pointer to maximum number of groups + uint32_t *pGroupCountRet) try { + auto pfnSuggestMaxCooperativeGroupCount = + ur_lib::getContext() + ->urDdiTable.Kernel.pfnSuggestMaxCooperativeGroupCount; + if (nullptr == pfnSuggestMaxCooperativeGroupCount) + return UR_RESULT_ERROR_UNINITIALIZED; + + return pfnSuggestMaxCooperativeGroupCount( + hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, + pGroupCountRet); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query information about a command queue /// @@ -5084,6 +5135,8 @@ ur_result_t UR_APICALL urEventSetCallback( /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. 
ur_result_t UR_APICALL urEnqueueKernelLaunch( /// [in] handle of the queue object ur_queue_handle_t hQueue, @@ -5104,6 +5157,11 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -5122,8 +5180,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_ERROR_UNINITIALIZED; return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, - phEventWaitList, phEvent); + pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -9839,121 +9898,6 @@ ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return exceptionToResult(std::current_exception()); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Enqueue a command to execute a cooperative kernel -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` -/// + `NULL == pGlobalWorkSize` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + `phEventWaitList == NULL && numEventsInWaitList > 0` -/// + `phEventWaitList != NULL && numEventsInWaitList == 0` -/// + If event objects in phEventWaitList are not valid events. 
-/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. 
- ur_event_handle_t *phEvent) try { - auto pfnCooperativeKernelLaunchExp = - ur_lib::getContext()->urDdiTable.EnqueueExp.pfnCooperativeKernelLaunchExp; - if (nullptr == pfnCooperativeKernelLaunchExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - return pfnCooperativeKernelLaunchExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); -} catch (...) { - return exceptionToResult(std::current_exception()); -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Query the maximum number of work groups for a cooperative kernel -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hKernel` -/// + `NULL == hDevice` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pLocalWorkSize` -/// + `NULL == pGroupCountRet` -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] handle of the device object - ur_device_handle_t hDevice, - /// [in] number of dimensions, from 1 to 3, to specify the work-group - /// work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of local work-items forming a work-group that will execute the - /// kernel function. 
- const size_t *pLocalWorkSize, - /// [in] size of dynamic shared memory, for each work-group, in bytes, - /// that will be used when the kernel is launched - size_t dynamicSharedMemorySize, - /// [out] pointer to maximum number of groups - uint32_t *pGroupCountRet) try { - auto pfnSuggestMaxCooperativeGroupCountExp = - ur_lib::getContext() - ->urDdiTable.KernelExp.pfnSuggestMaxCooperativeGroupCountExp; - if (nullptr == pfnSuggestMaxCooperativeGroupCountExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - return pfnSuggestMaxCooperativeGroupCountExp( - hKernel, hDevice, workDim, pLocalWorkSize, dynamicSharedMemorySize, - pGroupCountRet); -} catch (...) { - return exceptionToResult(std::current_exception()); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command for recording the device timestamp /// @@ -10005,103 +9949,6 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( return exceptionToResult(std::current_exception()); } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Launch kernel with custom launch properties -/// -/// @details -/// - Launches the kernel using the specified launch properties -/// - If numPropsInLaunchPropList == 0 then a regular kernel launch is used: -/// `urEnqueueKernelLaunch` -/// - Consult the appropriate adapter driver documentation for details of -/// adapter specific behavior and native error codes that may be returned. 
-/// -/// @remarks -/// _Analogues_ -/// - **cuLaunchKernelEx** -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// + NULL == hQueue -/// + NULL == hKernel -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` -/// + `NULL == pGlobalWorkSize` -/// + `NULL == launchPropList` -/// + NULL == pGlobalWorkSize -/// + numPropsInLaunchpropList != 0 && launchPropList == NULL -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + phEventWaitList == NULL && numEventsInWaitList > 0 -/// + phEventWaitList != NULL && numEventsInWaitList == 0 -/// + If event objects in phEventWaitList are not valid events. 
-/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS -/// + An event in phEventWaitList has ::UR_EVENT_STATUS_ERROR -/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. If nullptr, the runtime implementation - /// will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_exp_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If nullptr, - /// the numEventsInWaitList must be 0, indicating that no wait event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. 
If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) try { - auto pfnKernelLaunchCustomExp = - ur_lib::getContext()->urDdiTable.EnqueueExp.pfnKernelLaunchCustomExp; - if (nullptr == pfnKernelLaunchCustomExp) - return UR_RESULT_ERROR_UNINITIALIZED; - - return pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, launchPropList, - numEventsInWaitList, phEventWaitList, phEvent); -} catch (...) { - return exceptionToResult(std::current_exception()); -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. diff --git a/unified-runtime/source/loader/ur_libddi.cpp b/unified-runtime/source/loader/ur_libddi.cpp index 085aca70c50b3..0c02960d53a94 100644 --- a/unified-runtime/source/loader/ur_libddi.cpp +++ b/unified-runtime/source/loader/ur_libddi.cpp @@ -66,11 +66,6 @@ __urdlllocal ur_result_t context_t::ddiInit() { urGetKernelProcAddrTable(UR_API_VERSION_CURRENT, &urDdiTable.Kernel); } - if (UR_RESULT_SUCCESS == result) { - result = urGetKernelExpProcAddrTable(UR_API_VERSION_CURRENT, - &urDdiTable.KernelExp); - } - if (UR_RESULT_SUCCESS == result) { result = urGetMemProcAddrTable(UR_API_VERSION_CURRENT, &urDdiTable.Mem); } diff --git a/unified-runtime/source/loader/ur_print.cpp b/unified-runtime/source/loader/ur_print.cpp index b9de89785f0be..8893a3dc05caf 100644 --- a/unified-runtime/source/loader/ur_print.cpp +++ b/unified-runtime/source/loader/ur_print.cpp @@ -292,6 +292,14 @@ urPrintDeviceThrottleReasonsFlags(enum ur_device_throttle_reasons_flag_t value, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintKernelLaunchPropertiesSupportFlags( + enum ur_kernel_launch_properties_support_flag_t value, char *buffer, + const size_t buff_size, size_t 
*out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintContextFlags(enum ur_context_flag_t value, char *buffer, const size_t buff_size, size_t *out_size) { std::stringstream ss; @@ -882,6 +890,24 @@ ur_result_t urPrintExecutionInfo(enum ur_execution_info_t value, char *buffer, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t +urPrintKernelLaunchPropertyId(enum ur_kernel_launch_property_id_t value, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + +ur_result_t +urPrintKernelLaunchProperty(const struct ur_kernel_launch_property_t params, + char *buffer, const size_t buff_size, + size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintMapFlags(enum ur_map_flag_t value, char *buffer, const size_t buff_size, size_t *out_size) { std::stringstream ss; @@ -1105,23 +1131,6 @@ ur_result_t urPrintExpCommandBufferUpdateKernelLaunchDesc( return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintExpLaunchPropertyId(enum ur_exp_launch_property_id_t value, - char *buffer, const size_t buff_size, - size_t *out_size) { - std::stringstream ss; - ss << value; - return str_copy(&ss, buffer, buff_size, out_size); -} - -ur_result_t -urPrintExpLaunchProperty(const struct ur_exp_launch_property_t params, - char *buffer, const size_t buff_size, - size_t *out_size) { - std::stringstream ss; - ss << params; - return str_copy(&ss, buffer, buff_size, out_size); -} - ur_result_t urPrintExpPeerInfo(enum ur_exp_peer_info_t value, char *buffer, const size_t buff_size, size_t *out_size) { std::stringstream ss; @@ -1851,14 +1860,6 @@ ur_result_t urPrintEnqueueWriteHostPipeParams( return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintEnqueueKernelLaunchCustomExpParams( - const struct 
ur_enqueue_kernel_launch_custom_exp_params_t *params, - char *buffer, const size_t buff_size, size_t *out_size) { - std::stringstream ss; - ss << params; - return str_copy(&ss, buffer, buff_size, out_size); -} - ur_result_t urPrintEnqueueEventsWaitWithBarrierExtParams( const struct ur_enqueue_events_wait_with_barrier_ext_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { @@ -1907,14 +1908,6 @@ ur_result_t urPrintEnqueueCommandBufferExpParams( return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintEnqueueCooperativeKernelLaunchExpParams( - const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, - char *buffer, const size_t buff_size, size_t *out_size) { - std::stringstream ss; - ss << params; - return str_copy(&ss, buffer, buff_size, out_size); -} - ur_result_t urPrintEnqueueTimestampRecordingExpParams( const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { @@ -2130,9 +2123,8 @@ ur_result_t urPrintKernelSetSpecializationConstantsParams( return str_copy(&ss, buffer, buff_size, out_size); } -ur_result_t urPrintKernelSuggestMaxCooperativeGroupCountExpParams( - const struct ur_kernel_suggest_max_cooperative_group_count_exp_params_t - *params, +ur_result_t urPrintKernelSuggestMaxCooperativeGroupCountParams( + const struct ur_kernel_suggest_max_cooperative_group_count_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { std::stringstream ss; ss << params; diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index f11d9c4b6220c..bed38e9f53d26 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -3846,6 +3846,48 @@ ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Query the maximum number of work groups for a cooperative kernel +/// +/// 
@returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hDevice` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pLocalWorkSize` +/// + `NULL == pGroupCountRet` +/// - ::UR_RESULT_ERROR_INVALID_KERNEL +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +/// + If ::UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT returns a +/// value without the +/// ::UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE bit set. +/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `workDim < 1 || workDim > 3` +ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCount( + /// [in] handle of the kernel object + ur_kernel_handle_t hKernel, + /// [in] handle of the device object + ur_device_handle_t hDevice, + /// [in] number of dimensions, from 1 to 3, to specify the work-group + /// work-items + uint32_t workDim, + /// [in] pointer to an array of workDim unsigned values that specify the + /// number of local work-items forming a work-group that will execute the + /// kernel function. + const size_t *pLocalWorkSize, + /// [in] size of dynamic shared memory, for each work-group, in bytes, + /// that will be used when the kernel is launched + size_t dynamicSharedMemorySize, + /// [out] pointer to maximum number of groups + uint32_t *pGroupCountRet) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query information about a command queue /// @@ -4438,6 +4480,8 @@ ur_result_t UR_APICALL urEventSetCallback( /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES +/// - ::UR_RESULT_ERROR_INVALID_OPERATION +/// + If any property in `launchPropList` isn't supported by the device. 
ur_result_t UR_APICALL urEnqueueKernelLaunch( /// [in] handle of the queue object ur_queue_handle_t hQueue, @@ -4458,6 +4502,11 @@ ur_result_t UR_APICALL urEnqueueKernelLaunch( /// execute the kernel function. /// If nullptr, the runtime implementation will choose the work-group size. const size_t *pLocalWorkSize, + /// [in] size of the launch prop list + uint32_t numPropsInLaunchPropList, + /// [in][optional][range(0, numPropsInLaunchPropList)] pointer to a list + /// of launch properties + const ur_kernel_launch_property_t *launchPropList, /// [in] size of the event wait list uint32_t numEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -8567,104 +8616,6 @@ ur_result_t UR_APICALL urCommandBufferGetNativeHandleExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Enqueue a command to execute a cooperative kernel -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` -/// + `NULL == pGlobalWorkSize` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + `phEventWaitList == NULL && numEventsInWaitList > 0` -/// + `phEventWaitList != NULL && numEventsInWaitList == 0` -/// + If event objects in phEventWaitList are not valid events. 
-/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. - /// If nullptr, the runtime implementation will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. - /// If nullptr, the numEventsInWaitList must be 0, indicating that no wait - /// event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. 
- ur_event_handle_t *phEvent) { - ur_result_t result = UR_RESULT_SUCCESS; - return result; -} - -/////////////////////////////////////////////////////////////////////////////// -/// @brief Query the maximum number of work groups for a cooperative kernel -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hKernel` -/// + `NULL == hDevice` -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pLocalWorkSize` -/// + `NULL == pGroupCountRet` -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] handle of the device object - ur_device_handle_t hDevice, - /// [in] number of dimensions, from 1 to 3, to specify the work-group - /// work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of local work-items forming a work-group that will execute the - /// kernel function. 
- const size_t *pLocalWorkSize, - /// [in] size of dynamic shared memory, for each work-group, in bytes, - /// that will be used when the kernel is launched - size_t dynamicSharedMemorySize, - /// [out] pointer to maximum number of groups - uint32_t *pGroupCountRet) { - ur_result_t result = UR_RESULT_SUCCESS; - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command for recording the device timestamp /// @@ -8709,94 +8660,6 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( return result; } -/////////////////////////////////////////////////////////////////////////////// -/// @brief Launch kernel with custom launch properties -/// -/// @details -/// - Launches the kernel using the specified launch properties -/// - If numPropsInLaunchPropList == 0 then a regular kernel launch is used: -/// `urEnqueueKernelLaunch` -/// - Consult the appropriate adapter driver documentation for details of -/// adapter specific behavior and native error codes that may be returned. 
-/// -/// @remarks -/// _Analogues_ -/// - **cuLaunchKernelEx** -/// -/// @returns -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE -/// + `NULL == hQueue` -/// + `NULL == hKernel` -/// + NULL == hQueue -/// + NULL == hKernel -/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER -/// + `NULL == pGlobalWorkOffset` -/// + `NULL == pGlobalWorkSize` -/// + `NULL == launchPropList` -/// + NULL == pGlobalWorkSize -/// + numPropsInLaunchpropList != 0 && launchPropList == NULL -/// - ::UR_RESULT_SUCCESS -/// - ::UR_RESULT_ERROR_UNINITIALIZED -/// - ::UR_RESULT_ERROR_DEVICE_LOST -/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC -/// - ::UR_RESULT_ERROR_INVALID_QUEUE -/// - ::UR_RESULT_ERROR_INVALID_KERNEL -/// - ::UR_RESULT_ERROR_INVALID_EVENT -/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST -/// + phEventWaitList == NULL && numEventsInWaitList > 0 -/// + phEventWaitList != NULL && numEventsInWaitList == 0 -/// + If event objects in phEventWaitList are not valid events. 
-/// - ::UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS -/// + An event in phEventWaitList has ::UR_EVENT_STATUS_ERROR -/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION -/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE -/// - ::UR_RESULT_ERROR_INVALID_VALUE -/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES -ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( - /// [in] handle of the queue object - ur_queue_handle_t hQueue, - /// [in] handle of the kernel object - ur_kernel_handle_t hKernel, - /// [in] number of dimensions, from 1 to 3, to specify the global and - /// work-group work-items - uint32_t workDim, - /// [in] pointer to an array of workDim unsigned values that specify the - /// offset used to calculate the global ID of a work-item - const size_t *pGlobalWorkOffset, - /// [in] pointer to an array of workDim unsigned values that specify the - /// number of global work-items in workDim that will execute the kernel - /// function - const size_t *pGlobalWorkSize, - /// [in][optional] pointer to an array of workDim unsigned values that - /// specify the number of local work-items forming a work-group that will - /// execute the kernel function. If nullptr, the runtime implementation - /// will choose the work-group size. - const size_t *pLocalWorkSize, - /// [in] size of the launch prop list - uint32_t numPropsInLaunchPropList, - /// [in][range(0, numPropsInLaunchPropList)] pointer to a list of launch - /// properties - const ur_exp_launch_property_t *launchPropList, - /// [in] size of the event wait list - uint32_t numEventsInWaitList, - /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of - /// events that must be complete before the kernel execution. If nullptr, - /// the numEventsInWaitList must be 0, indicating that no wait event. - const ur_event_handle_t *phEventWaitList, - /// [out][optional][alloc] return an event object that identifies this - /// particular kernel execution instance. 
If phEventWaitList and phEvent - /// are not NULL, phEvent must not refer to an element of the - /// phEventWaitList array. - ur_event_handle_t *phEvent) { - ur_result_t result = UR_RESULT_SUCCESS; - return result; -} - /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. diff --git a/unified-runtime/test/adapters/cuda/kernel_tests.cpp b/unified-runtime/test/adapters/cuda/kernel_tests.cpp index 6d1937f8873e0..c88928a2f1869 100644 --- a/unified-runtime/test/adapters/cuda/kernel_tests.cpp +++ b/unified-runtime/test/adapters/cuda/kernel_tests.cpp @@ -236,7 +236,7 @@ TEST_P(cudaKernelTest, URKernelDispatch) { const size_t localWorkSize[] = {1}; ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); } @@ -271,6 +271,6 @@ TEST_P(cudaKernelTest, URKernelDispatchTwo) { const size_t localWorkSize[] = {1}; ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, workDim, globalWorkOffset, globalWorkSize, localWorkSize, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); } diff --git a/unified-runtime/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp b/unified-runtime/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp index 616f825bb2c57..e7f577dd26cd9 100644 --- a/unified-runtime/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp +++ b/unified-runtime/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp @@ -69,8 +69,8 @@ TEST_P(urLevelZeroKernelNativeHandleTest, OwnedHandleRelease) { size_t local_size = 1; size_t global_size = 1; ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &global_offset, - &local_size, &global_size, 0, nullptr, - nullptr)); + &local_size, &global_size, 0, nullptr, 0, + nullptr, nullptr)); 
ASSERT_SUCCESS(urKernelRelease(kernel)); ASSERT_SUCCESS(urProgramRelease(program)); diff --git a/unified-runtime/test/adapters/level_zero/v2/deferred_kernel.cpp b/unified-runtime/test/adapters/level_zero/v2/deferred_kernel.cpp index c5a6b5b4fcc64..99ab66e68a63a 100644 --- a/unified-runtime/test/adapters/level_zero/v2/deferred_kernel.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/deferred_kernel.cpp @@ -42,7 +42,7 @@ TEST_P(urEnqueueKernelLaunchTest, DeferredKernelRelease) { ASSERT_SUCCESS(urEnqueueEventsWait(queue, 1, &event, nullptr)); ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urKernelRelease(kernel)); // Kernel should still be alive since kernel launch is pending @@ -134,13 +134,13 @@ TEST_P(urMultiQueueLaunchKernelDeferFreeTest, Success) { ASSERT_SUCCESS(urEnqueueEventsWait(queues[0], 1, &event1, nullptr)); ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernel, 1, &global_offset, - &global_size, nullptr, 0, nullptr, - nullptr)); + &global_size, nullptr, 0, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urEnqueueEventsWait(queues[1], 1, &event2, nullptr)); ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernel, 1, &global_offset, - &global_size, nullptr, 0, nullptr, - nullptr)); + &global_size, nullptr, 0, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urKernelRelease(kernel)); diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt index 9e29e727e2e2a..a640c370c1427 100644 --- a/unified-runtime/test/conformance/CMakeLists.txt +++ b/unified-runtime/test/conformance/CMakeLists.txt @@ -112,7 +112,6 @@ set(TEST_SUBDIRECTORIES_DPCXX "exp_command_buffer" "exp_enqueue_native" "exp_usm_p2p" - "exp_launch_properties" "memory-migrate" "usm" ) diff --git a/unified-runtime/test/conformance/device/urDeviceGetInfo.cpp 
b/unified-runtime/test/conformance/device/urDeviceGetInfo.cpp index a314d43843173..5c2bc54091a51 100644 --- a/unified-runtime/test/conformance/device/urDeviceGetInfo.cpp +++ b/unified-runtime/test/conformance/device/urDeviceGetInfo.cpp @@ -2057,23 +2057,6 @@ TEST_P(urDeviceGetInfoTest, SuccessCommandBufferEventSupport) { ASSERT_TRUE(casted_value == false || casted_value == true); } -TEST_P(urDeviceGetInfoTest, SuccessClusterLaunch) { - size_t property_size = 0; - const ur_device_info_t property_name = - UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP; - - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urDeviceGetInfo(device, property_name, 0, nullptr, &property_size)); - ASSERT_EQ(property_size, sizeof(ur_bool_t)); - - ur_bool_t property_value = false; - ASSERT_SUCCESS(urDeviceGetInfo(device, property_name, property_size, - &property_value, nullptr)); - - bool casted_value = static_cast(property_value); - ASSERT_TRUE(casted_value == false || casted_value == true); -} - TEST_P(urDeviceGetInfoTest, SuccessBindlessImagesSupport) { size_t property_size = 0; const ur_device_info_t property_name = @@ -2755,3 +2738,21 @@ TEST_P(urDeviceGetInfoComponentDevicesTest, SuccessComponentDevices) { } } } + +TEST_P(urDeviceGetInfoTest, SuccessKernelLaunchPropertiesSupport) { + size_t property_size = 0; + const ur_device_info_t property_name = + UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT; + + ASSERT_SUCCESS_OR_OPTIONAL_QUERY( + urDeviceGetInfo(device, property_name, 0, nullptr, &property_size), + property_name); + ASSERT_EQ(property_size, sizeof(ur_kernel_launch_properties_support_flags_t)); + + ur_kernel_launch_properties_support_flags_t property_value = + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_FORCE_UINT32; + ASSERT_SUCCESS(urDeviceGetInfo(device, property_name, property_size, + &property_value, nullptr)); + + ASSERT_EQ(property_value & UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAGS_MASK, 0); +} diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableRead.cpp 
b/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableRead.cpp index 96acb17d2dee7..d8499e997d5b1 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableRead.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableRead.cpp @@ -27,7 +27,7 @@ TEST_P(urEnqueueDeviceGetGlobalVariableReadWithParamTest, Success) { // execute the kernel ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); // read global var back to host diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableWrite.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableWrite.cpp index 26b74956d7b2a..4ce2c931fd9d1 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableWrite.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueDeviceGlobalVariableWrite.cpp @@ -32,7 +32,7 @@ TEST_P(urEnqueueDeviceGetGlobalVariableWriteWithParamTest, Success) { // execute the kernel ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); // read global var back to host diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp index 4cc02636285d5..20806da38acf9 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueEventsWaitWithBarrier.cpp @@ -183,10 +183,12 @@ TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, nullptr, &event); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 1, &event, nullptr)); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, &count, - nullptr, 0, 
nullptr, &event)); + nullptr, 0, nullptr, 0, nullptr, + &event)); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 1, &event, nullptr)); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, &count, - nullptr, 0, nullptr, &event)); + nullptr, 0, nullptr, 0, nullptr, + &event)); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 1, &event, nullptr)); addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); } @@ -212,10 +214,12 @@ TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, nullptr, nullptr); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &event)); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, &count, - nullptr, 1, &event, nullptr)); + nullptr, 0, nullptr, 1, &event, + nullptr)); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &event)); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, &count, - nullptr, 1, &event, nullptr)); + nullptr, 0, nullptr, 1, &event, + nullptr)); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, &event)); addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); } @@ -241,11 +245,13 @@ TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, SuccessEventDependencies) { EXPECT_SUCCESS( urEnqueueEventsWaitWithBarrier(queue, 1, &event[0], &event[1])); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, &count, - nullptr, 1, &event[1], &event[2])); + nullptr, 0, nullptr, 1, &event[1], + &event[2])); EXPECT_SUCCESS( urEnqueueEventsWaitWithBarrier(queue, 1, &event[2], &event[3])); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, &count, - nullptr, 1, &event[3], &event[4])); + nullptr, 0, nullptr, 1, &event[3], + &event[4])); EXPECT_SUCCESS( urEnqueueEventsWaitWithBarrier(queue, 1, &event[4], &event[5])); addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); @@ -271,10 +277,12 @@ TEST_P(urEnqueueEventsWaitWithBarrierOrderingTest, nullptr, nullptr); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 
0, nullptr, nullptr)); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, add_kernel, 1, &offset, &count, - nullptr, 0, nullptr, nullptr)); + nullptr, 0, nullptr, 0, nullptr, + nullptr)); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, nullptr)); EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, mul_kernel, 1, &offset, &count, - nullptr, 0, nullptr, nullptr)); + nullptr, 0, nullptr, 0, nullptr, + nullptr)); EXPECT_SUCCESS(urEnqueueEventsWaitWithBarrier(queue, 0, nullptr, nullptr)); addHelper.ValidateBuffer(buffer, sizeof(uint32_t), 4004); } diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index 8e4a28826c4b7..408b648137253 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -73,7 +73,46 @@ TEST_P(urEnqueueKernelLaunchTest, Success) { AddPodArg(val); ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ValidateBuffer(buffer, sizeof(val) * global_size, val); +} + +TEST_P(urEnqueueKernelLaunchTest, SuccessWithLaunchProperties) { + std::vector props(1); + props[0].id = UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE; + + ur_kernel_launch_properties_support_flags_t supported_properties = 0; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT, + sizeof(supported_properties), &supported_properties, nullptr)); + + if (supported_properties & + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE) { + ur_kernel_launch_property_t coop_prop; + coop_prop.id = UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE; + coop_prop.value.cooperative = 1; + props.push_back(coop_prop); + } + + if (supported_properties & + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_CLUSTER_DIMENSION) { + ur_kernel_launch_property_t 
cluster_dims_prop; + cluster_dims_prop.id = UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION; + cluster_dims_prop.value.clusterDim[0] = 16; + cluster_dims_prop.value.clusterDim[1] = 1; + cluster_dims_prop.value.clusterDim[2] = 1; + + props.push_back(cluster_dims_prop); + } + + ur_mem_handle_t buffer = nullptr; + AddBuffer1DArg(sizeof(val) * global_size, &buffer); + AddPodArg(val); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, nullptr, 1, + &props[0], 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(buffer, sizeof(val) * global_size, val); } @@ -81,33 +120,33 @@ TEST_P(urEnqueueKernelLaunchTest, Success) { TEST_P(urEnqueueKernelLaunchTest, InvalidNullHandleQueue) { ASSERT_EQ_RESULT(urEnqueueKernelLaunch(nullptr, kernel, n_dimensions, &global_offset, &global_size, nullptr, - 0, nullptr, nullptr), + 0, nullptr, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_HANDLE); } TEST_P(urEnqueueKernelLaunchTest, InvalidNullPointer) { ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, kernel, n_dimensions, nullptr, - &global_size, nullptr, 0, nullptr, - nullptr), + &global_size, nullptr, 0, nullptr, 0, + nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, nullptr, nullptr, 0, - nullptr, nullptr), + nullptr, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_POINTER); } TEST_P(urEnqueueKernelLaunchTest, InvalidNullHandleKernel) { ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, nullptr, n_dimensions, &global_offset, &global_size, nullptr, - 0, nullptr, nullptr), + 0, nullptr, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_NULL_HANDLE); } TEST_P(urEnqueueKernelLaunchTest, InvalidNullPtrEventWaitList) { ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, - 1, nullptr, nullptr), + 0, nullptr, 1, nullptr, nullptr), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); 
ur_event_handle_t validEvent; @@ -115,13 +154,13 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidNullPtrEventWaitList) { ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, - 0, &validEvent, nullptr), + 0, nullptr, 0, &validEvent, nullptr), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); ur_event_handle_t inv_evt = nullptr; ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, - 1, &inv_evt, nullptr), + 0, nullptr, 1, &inv_evt, nullptr), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); ASSERT_SUCCESS(urEventRelease(validEvent)); } @@ -134,7 +173,7 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) { ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, kernel, max_work_item_dimensions + 1, &global_offset, &global_size, nullptr, - 0, nullptr, nullptr), + 0, nullptr, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_WORK_DIMENSION); } @@ -146,9 +185,9 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkGroupSize) { ur_mem_handle_t buffer = nullptr; AddBuffer1DArg(sizeof(val) * global_size, &buffer); AddPodArg(val); - auto result = - urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr); + auto result = urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, &local_size, + 0, nullptr, 0, nullptr, nullptr); ASSERT_TRUE(result == UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE || result == UR_RESULT_SUCCESS); } @@ -156,16 +195,16 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkGroupSize) { TEST_P(urEnqueueKernelLaunchKernelWgSizeTest, Success) { UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}); - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - global_offset.data(), global_size.data(), - nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, global_offset.data(), global_size.data(), + nullptr, 0, nullptr, 0, nullptr, nullptr)); 
ASSERT_SUCCESS(urQueueFinish(queue)); } TEST_P(urEnqueueKernelLaunchKernelWgSizeTest, SuccessWithExplicitLocalSize) { - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - global_offset.data(), global_size.data(), - wg_size.data(), 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, global_offset.data(), global_size.data(), + wg_size.data(), 0, nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); } @@ -176,7 +215,7 @@ TEST_P(urEnqueueKernelLaunchKernelWgSizeTest, NonMatchingLocalSize) { ASSERT_EQ_RESULT( urEnqueueKernelLaunch(queue, kernel, n_dimensions, global_offset.data(), global_size.data(), wrong_wg_size.data(), 0, - nullptr, nullptr), + nullptr, 0, nullptr, nullptr), UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); } @@ -185,9 +224,9 @@ TEST_P(urEnqueueKernelLaunchKernelSubGroupTest, Success) { ur_mem_handle_t buffer = nullptr; AddBuffer1DArg(sizeof(size_t), &buffer); - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - global_offset.data(), global_size.data(), - nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, global_offset.data(), global_size.data(), + nullptr, 0, nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); // We specify this subgroup size in the kernel source, and then the kernel // queries for its subgroup size at runtime and writes it to the buffer. 
@@ -210,8 +249,8 @@ TEST_P(urEnqueueKernelLaunchKernelStandardTest, Success) { AddPodArg(11.0); ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &offset, - &global_size, nullptr, 0, nullptr, - nullptr)); + &global_size, nullptr, 0, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(output, sizeof(uint32_t), expected_result); } @@ -301,7 +340,7 @@ TEST_P(urEnqueueKernelLaunchTestWithParam, Success) { AddPodArg(val); ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, global_offset, global_range, nullptr, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(buffer, buffer_size, val); } @@ -357,8 +396,8 @@ TEST_P(urEnqueueKernelLaunchWithUSM, Success) { ur_event_handle_t kernel_evt; ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, work_dim, &global_offset, - &global_size, nullptr, 0, nullptr, - &kernel_evt)); + &global_size, nullptr, 0, nullptr, 0, + nullptr, &kernel_evt)); ASSERT_SUCCESS(urQueueFinish(queue)); @@ -388,8 +427,8 @@ TEST_P(urEnqueueKernelLaunchWithUSM, WithMemcpy) { ur_event_handle_t kernel_evt; ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, work_dim, &global_offset, - &global_size, nullptr, 0, nullptr, - &kernel_evt)); + &global_size, nullptr, 0, nullptr, 0, + nullptr, &kernel_evt)); ur_event_handle_t memcpy_event; ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, data.data(), usmPtr, @@ -476,8 +515,8 @@ TEST_P(urEnqueueKernelLaunchWithVirtualMemory, Success) { ur_event_handle_t kernel_evt; ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, work_dim, &global_offset, - &global_size, nullptr, 0, nullptr, - &kernel_evt)); + &global_size, nullptr, 0, nullptr, 0, + nullptr, &kernel_evt)); std::vector data(global_size); ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, true, data.data(), virtual_ptr, @@ -554,7 +593,7 @@ TEST_P(urEnqueueKernelLaunchMultiDeviceTest, KernelLaunchReadDifferentQueues) { helper.AddPodArg(val); 
ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernel, n_dimensions, &global_offset, &global_size, nullptr, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); // Wait for the queue to finish executing. EXPECT_SUCCESS(urEnqueueEventsWait(queues[0], 0, nullptr, nullptr)); @@ -667,8 +706,8 @@ TEST_P(urEnqueueKernelLaunchUSMLinkedList, Success) { // Run kernel which will iterate the list and modify the values ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, list_head)); ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &global_offset, - &global_size, nullptr, 0, nullptr, - nullptr)); + &global_size, nullptr, 0, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); // Verify values diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp index 24dfb5c883468..e1576d158ee2d 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunchAndMemcpyInOrder.cpp @@ -203,8 +203,8 @@ TEST_P(urEnqueueKernelLaunchIncrementTest, Success) { // execute kernel that increments each element by 1 ASSERT_SUCCESS(urEnqueueKernelLaunch( - queue, kernels[i], n_dimensions, &global_offset, &ArraySize, nullptr, - bool(lastMemcpyEvent), lastMemcpyEvent, kernelEvent)); + queue, kernels[i], n_dimensions, &global_offset, &ArraySize, nullptr, 0, + nullptr, bool(lastMemcpyEvent), lastMemcpyEvent, kernelEvent)); // copy the memory (input for the next kernel) if (i < numOps - 1) { @@ -296,9 +296,10 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceTest, Success) { memcpyEvent = i < devices.size() - 1 ? 
memcpyEvents[i].ptr() : nullptr; // execute kernel that increments each element by 1 - ASSERT_SUCCESS(urEnqueueKernelLaunch( - queues[i], kernels[i], n_dimensions, &global_offset, &ArraySize, - nullptr, bool(lastMemcpyEvent), lastMemcpyEvent, kernelEvent)); + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[i], kernels[i], n_dimensions, + &global_offset, &ArraySize, nullptr, 0, + nullptr, bool(lastMemcpyEvent), + lastMemcpyEvent, kernelEvent)); // copy the memory to next device if (i < devices.size() - 1) { @@ -395,8 +396,8 @@ TEST_P(urEnqueueKernelLaunchIncrementMultiDeviceMultiThreadTest, Success) { // execute kernel that increments each element by 1 ASSERT_SUCCESS(urEnqueueKernelLaunch( - queue, kernel, n_dimensions, &global_offset, &ArraySize, nullptr, - waitNum, lastEvent, signalEvent)); + queue, kernel, n_dimensions, &global_offset, &ArraySize, nullptr, 0, + nullptr, waitNum, lastEvent, signalEvent)); } std::vector data(ArraySize); diff --git a/unified-runtime/test/conformance/exp_command_buffer/regression/usm_copy.cpp b/unified-runtime/test/conformance/exp_command_buffer/regression/usm_copy.cpp index cc691517d136f..21616b0b83fcf 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/regression/usm_copy.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/regression/usm_copy.cpp @@ -84,7 +84,7 @@ TEST_P(urCommandBufferUSMCopyInOrderTest, Success) { // D[0] = 44 ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 0, - nullptr, nullptr)); + nullptr, 0, nullptr, nullptr)); // command-buffer sync point used to enforce linear dependencies when // appending commands to the command-buffer. 
diff --git a/unified-runtime/test/conformance/exp_launch_properties/CMakeLists.txt b/unified-runtime/test/conformance/exp_launch_properties/CMakeLists.txt deleted file mode 100644 index db598831494d2..0000000000000 --- a/unified-runtime/test/conformance/exp_launch_properties/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. -# See LICENSE.TXT -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -add_conformance_test_with_kernels_environment(exp_launch_properties - launch_properties.cpp - ) - diff --git a/unified-runtime/test/conformance/exp_launch_properties/launch_properties.cpp b/unified-runtime/test/conformance/exp_launch_properties/launch_properties.cpp deleted file mode 100644 index 43be352da7fcd..0000000000000 --- a/unified-runtime/test/conformance/exp_launch_properties/launch_properties.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (C) 2024 Intel Corporation -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM -// Exceptions. 
See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include -#include - -struct urEnqueueKernelLaunchCustomTest : uur::urKernelExecutionTest { - void SetUp() override { - program_name = "fill"; - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); - } - - uint32_t val = 42; - size_t global_size = 32; - size_t global_offset = 0; - size_t n_dimensions = 1; -}; -UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchCustomTest); - -TEST_P(urEnqueueKernelLaunchCustomTest, Success) { - UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); - - ur_bool_t launch_properties_support = false; - ASSERT_SUCCESS(urDeviceGetInfo( - device, UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP, - sizeof(launch_properties_support), &launch_properties_support, nullptr)); - if (!launch_properties_support) { - GTEST_SKIP() << "EXP launch properties feature is not supported."; - } - - std::vector props(1); - props[0].id = UR_EXP_LAUNCH_PROPERTY_ID_IGNORE; - - size_t returned_size = 0; - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_PROFILE, 0, nullptr, - &returned_size)); - - std::unique_ptr returned_backend(new char[returned_size]); - - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_PROFILE, returned_size, - returned_backend.get(), nullptr)); - - std::string_view backend_string(returned_backend.get()); - const bool cuda_backend = backend_string.find("CUDA") != std::string::npos; - - if (cuda_backend) { - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_VERSION, 0, nullptr, - &returned_size)); - - std::unique_ptr returned_compute_capability( - new char[returned_size]); - - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_VERSION, - returned_size, - returned_compute_capability.get(), nullptr)); - - auto compute_capability = - std::stof(std::string(returned_compute_capability.get())); - - if (compute_capability >= 6.0) { - ur_exp_launch_property_t coop_prop; - coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE; - coop_prop.value.cooperative = 1; - 
props.push_back(coop_prop); - } - - ur_bool_t cluster_launch_supported = false; - ASSERT_SUCCESS( - urDeviceGetInfo(device, UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP, - sizeof(ur_bool_t), &cluster_launch_supported, nullptr)); - - if (cluster_launch_supported) { - ur_exp_launch_property_t cluster_dims_prop; - cluster_dims_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION; - cluster_dims_prop.value.clusterDim[0] = 16; - cluster_dims_prop.value.clusterDim[1] = 1; - cluster_dims_prop.value.clusterDim[2] = 1; - - props.push_back(cluster_dims_prop); - } - } - ur_mem_handle_t buffer = nullptr; - AddBuffer1DArg(sizeof(val) * global_size, &buffer); - AddPodArg(val); - - ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp( - queue, kernel, n_dimensions, &global_offset, &global_size, nullptr, 1, - &props[0], 0, nullptr, nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - ValidateBuffer(buffer, sizeof(val) * global_size, val); -} diff --git a/unified-runtime/test/conformance/integration/QueueBuffer.cpp b/unified-runtime/test/conformance/integration/QueueBuffer.cpp index 91ea52ed13617..8be98bfe6ba92 100644 --- a/unified-runtime/test/conformance/integration/QueueBuffer.cpp +++ b/unified-runtime/test/conformance/integration/QueueBuffer.cpp @@ -87,7 +87,7 @@ TEST_P(QueueBufferTestWithParam, QueueBufferTest) { ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, &GlobalOffset, &ArraySize, nullptr, 0, - nullptr, &Event)); + nullptr, 0, nullptr, &Event)); ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); CurValueMem2 = CurValueMem1 * 2; @@ -100,7 +100,7 @@ TEST_P(QueueBufferTestWithParam, QueueBufferTest) { ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, &GlobalOffset, &ArraySize, nullptr, 0, - nullptr, &Event)); + nullptr, 0, nullptr, &Event)); ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); CurValueMem1 = CurValueMem2 * 2; diff --git a/unified-runtime/test/conformance/integration/QueueEmptyStatus.cpp 
b/unified-runtime/test/conformance/integration/QueueEmptyStatus.cpp index e6308492d172f..2885bf51d0368 100644 --- a/unified-runtime/test/conformance/integration/QueueEmptyStatus.cpp +++ b/unified-runtime/test/conformance/integration/QueueEmptyStatus.cpp @@ -60,7 +60,7 @@ struct QueueEmptyStatusTestWithParam : uur::IntegrationQueueTestWithParam { for (uint32_t i = 0; i < num_iterations; ++i) { ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, n_dimensions, &global_offset, &ArraySize, nullptr, - 0, nullptr, &Event)); + 0, nullptr, 0, nullptr, &Event)); ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); } diff --git a/unified-runtime/test/conformance/integration/QueueUSM.cpp b/unified-runtime/test/conformance/integration/QueueUSM.cpp index 7f0dce10d5cc1..d9958dfb20fdc 100644 --- a/unified-runtime/test/conformance/integration/QueueUSM.cpp +++ b/unified-runtime/test/conformance/integration/QueueUSM.cpp @@ -100,7 +100,7 @@ TEST_P(QueueUSMTestWithParam, QueueUSMTest) { ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, &GlobalOffset, &ArraySize, nullptr, 0, - nullptr, &Event)); + nullptr, 0, nullptr, &Event)); ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); CurValueMem2 = CurValueMem1 * 2; @@ -111,7 +111,7 @@ TEST_P(QueueUSMTestWithParam, QueueUSMTest) { ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, &GlobalOffset, &ArraySize, nullptr, 0, - nullptr, &Event)); + nullptr, 0, nullptr, &Event)); ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); CurValueMem1 = CurValueMem2 * 2; diff --git a/unified-runtime/test/conformance/kernel/CMakeLists.txt b/unified-runtime/test/conformance/kernel/CMakeLists.txt index 73ab3f1101a05..cf54f65c14a1a 100644 --- a/unified-runtime/test/conformance/kernel/CMakeLists.txt +++ b/unified-runtime/test/conformance/kernel/CMakeLists.txt @@ -19,4 +19,5 @@ add_conformance_test_with_kernels_environment(kernel urKernelSetArgValue.cpp urKernelSetExecInfo.cpp urKernelSetSpecializationConstants.cpp + 
urKernelSuggestMaxCooperativeGroupCount.cpp urKernelGetSuggestedLocalWorkSize.cpp) diff --git a/unified-runtime/test/conformance/kernel/urKernelCreate.cpp b/unified-runtime/test/conformance/kernel/urKernelCreate.cpp index 7bd45a06bc148..651193b7e2823 100644 --- a/unified-runtime/test/conformance/kernel/urKernelCreate.cpp +++ b/unified-runtime/test/conformance/kernel/urKernelCreate.cpp @@ -84,9 +84,9 @@ TEST_P(urMultiDeviceKernelCreateTest, WithProgramBuild) { ASSERT_SUCCESS( urKernelCreate(program.get(), kernelName.data(), kernel.ptr())); - ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[i], kernel.get(), n_dimensions, - &global_offset, &local_size, - &global_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queues[i], kernel.get(), n_dimensions, &global_offset, &local_size, + &global_size, 0, nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queues[i])); } @@ -126,9 +126,9 @@ TEST_P(urMultiDeviceKernelCreateTest, WithProgramCompileAndLink) { ASSERT_SUCCESS( urKernelCreate(linked_program.get(), kernelName.data(), kernel.ptr())); - ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[i], kernel.get(), n_dimensions, - &global_offset, &local_size, - &global_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queues[i], kernel.get(), n_dimensions, &global_offset, &local_size, + &global_size, 0, nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queues[i])); } diff --git a/unified-runtime/test/conformance/kernel/urKernelSetArgLocal.cpp b/unified-runtime/test/conformance/kernel/urKernelSetArgLocal.cpp index 323ee6da97a0f..038fcd5a6b967 100644 --- a/unified-runtime/test/conformance/kernel/urKernelSetArgLocal.cpp +++ b/unified-runtime/test/conformance/kernel/urKernelSetArgLocal.cpp @@ -150,9 +150,9 @@ struct urKernelSetArgLocalMultiTest : uur::urKernelExecutionTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE(urKernelSetArgLocalMultiTest); TEST_P(urKernelSetArgLocalMultiTest, Basic) { - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, 
kernel, n_dimensions, - &global_offset, &global_size, - &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); uint32_t *output = (uint32_t *)shared_ptrs[0]; @@ -162,9 +162,9 @@ TEST_P(urKernelSetArgLocalMultiTest, Basic) { } TEST_P(urKernelSetArgLocalMultiTest, ReLaunch) { - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, - &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); uint32_t *output = (uint32_t *)shared_ptrs[0]; @@ -173,9 +173,9 @@ TEST_P(urKernelSetArgLocalMultiTest, ReLaunch) { Validate(output, X, Y, A, global_size, local_size); // Relaunch with new arguments - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, - &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); uint32_t *new_output = (uint32_t *)shared_ptrs[0]; uint32_t *new_X = (uint32_t *)shared_ptrs[3]; @@ -185,9 +185,9 @@ TEST_P(urKernelSetArgLocalMultiTest, ReLaunch) { // Overwrite local args to a larger value, then reset back to original TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, - &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); uint32_t *output = (uint32_t *)shared_ptrs[0]; @@ -230,9 +230,9 @@ 
TEST_P(urKernelSetArgLocalMultiTest, Overwrite) { &hip_local_offset)); } - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, - &new_local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, &global_offset, &global_size, + &new_local_size, 0, nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); Validate(output, X, Y, A, global_size, new_local_size); @@ -326,9 +326,9 @@ struct urKernelSetArgLocalOutOfOrder : urKernelSetArgLocalMultiTest { UUR_INSTANTIATE_DEVICE_TEST_SUITE(urKernelSetArgLocalOutOfOrder); TEST_P(urKernelSetArgLocalOutOfOrder, Success) { - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, - &global_offset, &global_size, - &local_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, &global_offset, &global_size, &local_size, 0, + nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); uint32_t *output = (uint32_t *)shared_ptrs[0]; diff --git a/unified-runtime/test/conformance/kernel/urKernelSuggestMaxCooperativeGroupCount.cpp b/unified-runtime/test/conformance/kernel/urKernelSuggestMaxCooperativeGroupCount.cpp new file mode 100644 index 0000000000000..a7dfc7b3ff515 --- /dev/null +++ b/unified-runtime/test/conformance/kernel/urKernelSuggestMaxCooperativeGroupCount.cpp @@ -0,0 +1,81 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct urKernelSuggestMaxCooperativeGroupCountTest + : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "bar"; + + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + + ur_kernel_launch_properties_support_flags_t supported_properties = 0; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT, + sizeof(supported_properties), &supported_properties, nullptr)); + if (!(supported_properties & + UR_KERNEL_LAUNCH_PROPERTIES_SUPPORT_FLAG_COOPERATIVE)) { + GTEST_SKIP() << "Cooperative launch is not supported."; + } + } + + uint32_t suggested_work_groups = 0; + const uint32_t n_dimensions = 1; + const size_t local_size = 1; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urKernelSuggestMaxCooperativeGroupCountTest); + +TEST_P(urKernelSuggestMaxCooperativeGroupCountTest, Success) { + ASSERT_SUCCESS(urKernelSuggestMaxCooperativeGroupCount( + kernel, device, n_dimensions, &local_size, 0, &suggested_work_groups)); + ASSERT_GE(suggested_work_groups, 0); +} + +TEST_P(urKernelSuggestMaxCooperativeGroupCountTest, InvalidNullHandleKernel) { + ASSERT_EQ_RESULT(urKernelSuggestMaxCooperativeGroupCount( + nullptr, device, n_dimensions, &local_size, 0, + &suggested_work_groups), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urKernelSuggestMaxCooperativeGroupCountTest, InvalidNullHandleDevice) { + ASSERT_EQ_RESULT(urKernelSuggestMaxCooperativeGroupCount( + kernel, nullptr, n_dimensions, &local_size, 0, + &suggested_work_groups), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urKernelSuggestMaxCooperativeGroupCountTest, InvalidWorkDimension) { + // Only supports 1-3 dimensions. 
+ ASSERT_EQ_RESULT( + urKernelSuggestMaxCooperativeGroupCount(kernel, device, 4, &local_size, 0, + &suggested_work_groups), + UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + ASSERT_EQ_RESULT( + urKernelSuggestMaxCooperativeGroupCount(kernel, device, 0, &local_size, 0, + &suggested_work_groups), + UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + ASSERT_EQ_RESULT( + urKernelSuggestMaxCooperativeGroupCount( + kernel, device, UINT32_MAX, &local_size, 0, &suggested_work_groups), + UR_RESULT_ERROR_INVALID_WORK_DIMENSION); +} + +TEST_P(urKernelSuggestMaxCooperativeGroupCountTest, + InvalidNullPointerLocalSize) { + ASSERT_EQ_RESULT( + urKernelSuggestMaxCooperativeGroupCount( + kernel, device, n_dimensions, nullptr, 0, &suggested_work_groups), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urKernelSuggestMaxCooperativeGroupCountTest, + InvalidNullPointerGroupCountRet) { + ASSERT_EQ_RESULT(urKernelSuggestMaxCooperativeGroupCount( + kernel, device, n_dimensions, &local_size, 0, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} diff --git a/unified-runtime/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp b/unified-runtime/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp index 8a70545371275..ff5ee3647822c 100644 --- a/unified-runtime/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp +++ b/unified-runtime/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp @@ -217,8 +217,8 @@ TEST_P(urMultiDeviceContextMemBufferTest, WriteKernelRead) { // Kernel increments the fill val by 1 ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, - offset, work_dims, nullptr, 1, &e1, - &e2)); + offset, work_dims, nullptr, 0, nullptr, + 1, &e1, &e2)); ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, false, 0, buffer_size_bytes, out_vec.data(), 1, @@ -253,13 +253,13 @@ TEST_P(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) { // Kernel increments the fill val by 1 
ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, - offset, work_dims, nullptr, 1, &e1, - &e2)); + offset, work_dims, nullptr, 0, nullptr, + 1, &e1, &e2)); // Kernel increments the fill val by 1 ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernels[0], 1 /*workDim=*/, - offset, work_dims, nullptr, 1, &e2, - &e3)); + offset, work_dims, nullptr, 0, nullptr, + 1, &e2, &e3)); ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, buffer_size_bytes, out_vec.data(), 1, diff --git a/unified-runtime/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp b/unified-runtime/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp index 3eea89322665b..5f541fc9bc98f 100644 --- a/unified-runtime/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp +++ b/unified-runtime/test/conformance/program/urMultiDeviceProgramCreateWithBinary.cpp @@ -73,9 +73,9 @@ TEST_P(urMultiDeviceProgramCreateWithBinaryTest, ASSERT_SUCCESS( urKernelCreate(binary_program, kernelName.data(), kernel.ptr())); - ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[i], kernel.get(), n_dimensions, - &global_offset, &local_size, - &global_size, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queues[i], kernel.get(), n_dimensions, &global_offset, &local_size, + &global_size, 0, nullptr, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queues[i])); } diff --git a/unified-runtime/test/conformance/testing/include/uur/fixtures.h b/unified-runtime/test/conformance/testing/include/uur/fixtures.h index f792448be26a1..47cb1f345634b 100644 --- a/unified-runtime/test/conformance/testing/include/uur/fixtures.h +++ b/unified-runtime/test/conformance/testing/include/uur/fixtures.h @@ -1454,7 +1454,7 @@ struct KernelLaunchHelper { size_t offset = 0; ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &offset, &global_size, &local_size, 0, nullptr, - nullptr)); + 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); } diff --git 
a/unified-runtime/test/conformance/usm/urUSMFree.cpp b/unified-runtime/test/conformance/usm/urUSMFree.cpp index 03d3f78867d09..49cc3e46c60bb 100644 --- a/unified-runtime/test/conformance/usm/urUSMFree.cpp +++ b/unified-runtime/test/conformance/usm/urUSMFree.cpp @@ -133,8 +133,9 @@ TEST_P(urUSMFreeDuringExecutionTest, SuccessHost) { EXPECT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, allocation)); EXPECT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(data), nullptr, &data)); - EXPECT_SUCCESS(urEnqueueKernelLaunch( - queue, kernel, 1, &wg_offset, &array_size, nullptr, 0, nullptr, nullptr)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &wg_offset, + &array_size, nullptr, 0, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urUSMFree(context, allocation)); ASSERT_SUCCESS(urQueueFinish(queue)); } @@ -153,8 +154,9 @@ TEST_P(urUSMFreeDuringExecutionTest, SuccessDevice) { EXPECT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, allocation)); EXPECT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(data), nullptr, &data)); - EXPECT_SUCCESS(urEnqueueKernelLaunch( - queue, kernel, 1, &wg_offset, &array_size, nullptr, 0, nullptr, nullptr)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &wg_offset, + &array_size, nullptr, 0, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urUSMFree(context, allocation)); ASSERT_SUCCESS(urQueueFinish(queue)); } @@ -173,8 +175,9 @@ TEST_P(urUSMFreeDuringExecutionTest, SuccessShared) { EXPECT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, allocation)); EXPECT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(data), nullptr, &data)); - EXPECT_SUCCESS(urEnqueueKernelLaunch( - queue, kernel, 1, &wg_offset, &array_size, nullptr, 0, nullptr, nullptr)); + EXPECT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, 1, &wg_offset, + &array_size, nullptr, 0, nullptr, 0, + nullptr, nullptr)); ASSERT_SUCCESS(urUSMFree(context, allocation)); ASSERT_SUCCESS(urQueueFinish(queue)); } diff --git a/unified-runtime/tools/urinfo/urinfo.hpp 
b/unified-runtime/tools/urinfo/urinfo.hpp index 9992b08b51bb8..89c9b12e16f9f 100644 --- a/unified-runtime/tools/urinfo/urinfo.hpp +++ b/unified-runtime/tools/urinfo/urinfo.hpp @@ -344,6 +344,9 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, printDeviceInfo(hDevice, UR_DEVICE_INFO_BFLOAT16_CONVERSIONS_NATIVE); std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_KERNEL_LAUNCH_PROPERTIES_SUPPORT); + std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP); std::cout << prefix; @@ -356,9 +359,6 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, printDeviceInfo( hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_SUBGRAPH_SUPPORT_EXP); std::cout << prefix; - printDeviceInfo(hDevice, - UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP); - std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP); std::cout << prefix; @@ -444,14 +444,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, printDeviceInfo(hDevice, UR_DEVICE_INFO_ASYNC_USM_ALLOCATIONS_SUPPORT_EXP); std::cout << prefix; - printDeviceInfo(hDevice, - UR_DEVICE_INFO_LAUNCH_PROPERTIES_SUPPORT_EXP); - std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP); std::cout << prefix; - printDeviceInfo(hDevice, - UR_DEVICE_INFO_COOPERATIVE_KERNEL_SUPPORT_EXP); - std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_MULTI_DEVICE_COMPILE_SUPPORT_EXP); }