From 172d5042ab4304d26e653c0ba727cc7fd1ceb6f8 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Mon, 12 May 2025 13:31:37 +0100 Subject: [PATCH 01/11] Defer arg extraction until handler::finalize Rather than extracting arguments from the lambda when the kernel is enqueued, store a pointer to the lambda alongside relevant information from the integration header or compiler builtins. Storing this information will allow us to defer the extraction of arguments until we reach handler::finalize(), at which point it may be possible to set the kernel arguments directly without populating MArgs. Signed-off-by: John Pennycook --- sycl/include/sycl/handler.hpp | 18 +++++++++++------- sycl/source/detail/handler_impl.hpp | 6 ++++++ sycl/source/handler.cpp | 21 ++++++++++++++++++++- sycl/test/abi/sycl_symbols_linux.dump | 1 + sycl/test/abi/sycl_symbols_windows.dump | 1 + 5 files changed, 39 insertions(+), 8 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index bfda64639683c..d15014323d604 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -749,8 +749,8 @@ class __SYCL_EXPORT handler { /// Stores lambda to the template-free object /// - /// Also initializes kernel name, list of arguments and requirements using - /// information from the integration header/built-ins. + /// Also initializes the kernel name and prepares for arguments to + /// be extracted from the lambda in handler::finalize(). /// /// \param KernelFunc is a SYCL kernel function /// \param ParamDescs is the vector of kernel parameter descriptors. @@ -796,11 +796,11 @@ class __SYCL_EXPORT handler { if constexpr (KernelHasName) { // TODO support ESIMD in no-integration-header case too. - clearArgs(); - extractArgsAndReqsFromLambda(MHostKernel->getPtr(), - &(detail::getKernelParamDesc), - detail::getKernelNumParams(), - detail::isKernelESIMD()); + setKernelInfo((void *)MHostKernel->getPtr(), + detail::getKernelNumParams(), + &(detail::getKernelParamDesc), + detail::isKernelESIMD()); + MKernelName = detail::getKernelName(); } else { // In case w/o the integration header it is necessary to process @@ -3761,6 +3761,10 @@ class __SYCL_EXPORT handler { sycl::range<3> LocalSize, sycl::id<3> Offset, int Dims); + void setKernelInfo(void *KernelFuncPtr, int KernelNumArgs, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + bool KernelIsESIMD); + friend class detail::HandlerAccess; #ifdef __INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 42b1991f153f5..31fc46638bce5 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -198,6 +198,12 @@ class handler_impl { // Allocation ptr to be freed asynchronously. void *MFreePtr = nullptr; + + // Store information about the kernel arguments. + void *MKernelFuncPtr = nullptr; + int MKernelNumArgs = 0; + detail::kernel_param_desc_t (*MKernelParamDescGetter)(int) = nullptr; + bool MKernelIsESIMD = false; }; } // namespace detail diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 039d07672e0bb..da11263962d54 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -411,6 +411,16 @@ event handler::finalize() { return MLastEvent; MIsFinalized = true; + // Extract arguments from the kernel lambda. + // TODO: Skip this in simple cases. + const auto &type = getType(); + if (type == detail::CGType::Kernel && impl->MKernelFuncPtr) { + clearArgs(); + extractArgsAndReqsFromLambda((char *)impl->MKernelFuncPtr, + impl->MKernelParamDescGetter, + impl->MKernelNumArgs, impl->MKernelIsESIMD); + } + // According to 4.7.6.9 of SYCL2020 spec, if a placeholder accessor is passed // to a command without being bound to a command group, an exception should // be thrown. @@ -448,7 +458,6 @@ event handler::finalize() { } } - const auto &type = getType(); if (type == detail::CGType::Kernel) { // If there were uses of set_specialization_constant build the kernel_bundle std::shared_ptr KernelBundleImpPtr = @@ -2254,6 +2263,16 @@ void handler::setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems, impl->MNDRDesc = NDRDescT{NumWorkItems, LocalSize, Offset, Dims}; } +void handler::setKernelInfo( + void *KernelFuncPtr, int KernelNumArgs, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + bool KernelIsESIMD) { + impl->MKernelFuncPtr = KernelFuncPtr; + impl->MKernelNumArgs = KernelNumArgs; + impl->MKernelParamDescGetter = KernelParamDescGetter; + impl->MKernelIsESIMD = KernelIsESIMD; +} + void handler::saveCodeLoc(detail::code_location CodeLoc, bool IsTopCodeLoc) { MCodeLoc = CodeLoc; impl->MIsTopCodeLoc = IsTopCodeLoc; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 1e1ca428eff80..3c935cb4e52c5 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3523,6 +3523,7 @@ _ZN4sycl3_V17handler12addReductionERKSt10shared_ptrIKvE _ZN4sycl3_V17handler12setArgHelperEiRNS0_3ext6oneapi12experimental6detail30dynamic_work_group_memory_baseE _ZN4sycl3_V17handler12setArgHelperEiRNS0_6detail22work_group_memory_implE _ZN4sycl3_V17handler13getKernelNameEv +_ZN4sycl3_V17handler13setKernelInfoEPviPFNS0_6detail19kernel_param_desc_tEiEb _ZN4sycl3_V17handler14addAccessorReqESt10shared_ptrINS0_6detail16AccessorImplHostEE _ZN4sycl3_V17handler14setNDRangeUsedEb _ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleENS0_5rangeILi3EEERKNS4_16image_descriptorEPvS7_S7_S7_ diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 13c10fe74d305..040c2b32acd16 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4377,6 +4377,7 @@ ?setHandlerKernelBundle@handler@_V1@sycl@@AEAAXVkernel@23@@Z ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4StableKernelCacheConfig@123@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@H@Z +?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z From d046b0a7636a6c4a1c75f4d2e799dd9831473d6b Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 11:08:00 +0200 Subject: [PATCH 02/11] Fix warnings/tests related to hasSpecialCaptures hasSpecialCaptures requires getParamDesc to be constexpr. Several tests were previously incompatible with this requirement, but it was only discovered when trying to call hasSpecialCaptures for each kernel. Signed-off-by: John Pennycook --- sycl/include/sycl/detail/kernel_desc.hpp | 2 +- sycl/unittests/Extensions/USMMemcpy2D.cpp | 100 ++++++++++-------- .../WorkGroupMemoryBackendArgument.cpp | 8 +- sycl/unittests/buffer/KernelArgMemObj.cpp | 10 +- .../arg_mask/EliminatedArgMask.cpp | 18 ++++ 5 files changed, 84 insertions(+), 54 deletions(-) diff --git a/sycl/include/sycl/detail/kernel_desc.hpp b/sycl/include/sycl/detail/kernel_desc.hpp index 67798a18e3aaa..1302ff07850c3 100644 --- a/sycl/include/sycl/detail/kernel_desc.hpp +++ b/sycl/include/sycl/detail/kernel_desc.hpp @@ -262,7 +262,7 @@ template constexpr int64_t getKernelSize() { template constexpr bool hasSpecialCaptures() { bool FoundSpecialCapture = false; - for (int I = 0; I < getKernelNumParams(); ++I) { + for (unsigned I = 0; I < getKernelNumParams(); ++I) { auto ParamDesc = getKernelParamDesc(I); bool IsSpecialCapture = (ParamDesc.kind != kernel_param_kind_t::kind_std_layout && diff --git a/sycl/unittests/Extensions/USMMemcpy2D.cpp b/sycl/unittests/Extensions/USMMemcpy2D.cpp index 880ed9bd69385..546767b55eb7d 100644 --- a/sycl/unittests/Extensions/USMMemcpy2D.cpp +++ b/sycl/unittests/Extensions/USMMemcpy2D.cpp @@ -29,22 +29,24 @@ struct KernelInfo> : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return USMFillHelperKernelNameLong; } static constexpr unsigned getNumParams() { return 7; } - static const kernel_param_desc_t &getParamDesc(int Idx) { - // Actual signature does not matter, but we need entries for each param. - static constexpr const kernel_param_desc_t DummySignature[] = { - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - }; + static constexpr const kernel_param_desc_t &getParamDesc(int Idx) { return DummySignature[Idx]; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 3 * sizeof(size_t); } + +private: + // Actual signature does not matter, but we need entries for each param. + static constexpr const kernel_param_desc_t DummySignature[] = { + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + }; }; template <> @@ -52,22 +54,24 @@ struct KernelInfo> : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return USMFillHelperKernelNameChar; } static constexpr unsigned getNumParams() { return 7; } - static const kernel_param_desc_t &getParamDesc(int Idx) { - // Actual signature does not matter, but we need entries for each param. - static constexpr const kernel_param_desc_t DummySignature[] = { - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - }; + static constexpr const kernel_param_desc_t &getParamDesc(int Idx) { return DummySignature[Idx]; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 3 * sizeof(size_t); } + +private: + // Actual signature does not matter, but we need entries for each param. + static constexpr const kernel_param_desc_t DummySignature[] = { + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + }; }; template <> @@ -77,23 +81,25 @@ struct KernelInfo> return USMMemcpyHelperKernelNameLong; } static constexpr unsigned getNumParams() { return 8; } - static const kernel_param_desc_t &getParamDesc(int Idx) { - // Actual signature does not matter, but we need entries for each param. - static constexpr const kernel_param_desc_t DummySignature[] = { - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - }; + static constexpr const kernel_param_desc_t &getParamDesc(int Idx) { return DummySignature[Idx]; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 4 * sizeof(size_t); } + +private: + // Actual signature does not matter, but we need entries for each param. + static constexpr const kernel_param_desc_t DummySignature[] = { + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + }; }; template <> @@ -103,23 +109,25 @@ struct KernelInfo> return USMMemcpyHelperKernelNameChar; } static constexpr unsigned getNumParams() { return 8; } - static const kernel_param_desc_t &getParamDesc(int Idx) { - // Actual signature does not matter, but we need entries for each param. - static constexpr const kernel_param_desc_t DummySignature[] = { - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - {kernel_param_kind_t::kind_std_layout, 4, 0}, - }; + static constexpr const kernel_param_desc_t &getParamDesc(int Idx) { return DummySignature[Idx]; } static constexpr int64_t getKernelSize() { return 2 * sizeof(void *) + 2 * sizeof(sycl::id<2>) + 4 * sizeof(size_t); } + +private: + // Actual signature does not matter, but we need entries for each param. + static constexpr const kernel_param_desc_t DummySignature[] = { + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + {kernel_param_kind_t::kind_std_layout, 4, 0}, + }; }; } // namespace detail } // namespace _V1 diff --git a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp index 701fd6bfa1466..8febd9676fb9f 100644 --- a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp +++ b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp @@ -22,9 +22,7 @@ inline namespace _V1 { namespace detail { template <> struct KernelInfo { static constexpr unsigned getNumParams() { return 1; } - static const detail::kernel_param_desc_t &getParamDesc(int) { - static detail::kernel_param_desc_t WorkGroupMemory = { - detail::kernel_param_kind_t::kind_work_group_memory, 0, 0}; + static constexpr const detail::kernel_param_desc_t &getParamDesc(int) { return WorkGroupMemory; } static constexpr bool isESIMD() { return false; } @@ -32,6 +30,10 @@ template <> struct KernelInfo { static constexpr bool callsAnyThisFreeFunction() { return false; } static constexpr int64_t getKernelSize() { return sizeof(arg_type); } static constexpr const char *getName() { return "WorkGroupMemoryKernel"; } + +private: + static constexpr detail::kernel_param_desc_t WorkGroupMemory = { + detail::kernel_param_kind_t::kind_work_group_memory, 0, 0}; }; } // namespace detail diff --git a/sycl/unittests/buffer/KernelArgMemObj.cpp b/sycl/unittests/buffer/KernelArgMemObj.cpp index 2dd1eabfca9a1..b826e89a128b8 100644 --- a/sycl/unittests/buffer/KernelArgMemObj.cpp +++ b/sycl/unittests/buffer/KernelArgMemObj.cpp @@ -21,13 +21,15 @@ template <> struct KernelInfo : public unittest::MockKernelInfoBase { static constexpr const char *getName() { return "TestKernelWithMemObj"; } static constexpr unsigned getNumParams() { return 1; } - static const detail::kernel_param_desc_t &getParamDesc(int) { - static detail::kernel_param_desc_t desc{ - detail::kernel_param_kind_t::kind_accessor, - int(access::target::device) /*info*/, 0 /*offset*/}; + static constexpr const detail::kernel_param_desc_t &getParamDesc(int) { return desc; } static constexpr uint32_t getKernelSize() { return 32; } + +private: + static constexpr detail::kernel_param_desc_t desc{ + detail::kernel_param_kind_t::kind_accessor, + int(access::target::device) /*info*/, 0 /*offset*/}; }; } // namespace detail } // namespace _V1 diff --git a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp index 55cc4f790353b..1f983aea8bd54 100644 --- a/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp +++ b/sycl/unittests/program_manager/arg_mask/EliminatedArgMask.cpp @@ -36,18 +36,36 @@ template <> struct KernelInfo : public unittest::MockKernelInfoBase { static constexpr unsigned getNumParams() { return EAMTestKernelNumArgs; } static constexpr const char *getName() { return EAMTestKernelName; } + static constexpr const kernel_param_desc_t &getParamDesc(int i) { + return Dummy; + } + +private: + static constexpr kernel_param_desc_t Dummy{}; }; template <> struct KernelInfo : public unittest::MockKernelInfoBase { static constexpr unsigned getNumParams() { return 0; } static constexpr const char *getName() { return EAMTestKernel2Name; } + static constexpr const kernel_param_desc_t &getParamDesc(int i) { + return Dummy; + } + +private: + static constexpr kernel_param_desc_t Dummy{}; }; template <> struct KernelInfo : public unittest::MockKernelInfoBase { static constexpr unsigned getNumParams() { return EAMTestKernelNumArgs; } static constexpr const char *getName() { return EAMTestKernel3Name; } + static constexpr const kernel_param_desc_t &getParamDesc(int i) { + return Dummy; + } + +private: + static constexpr kernel_param_desc_t Dummy{}; }; } // namespace detail From a92ff30f72cd38cb37bee84a500baae6259c3939 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 11:17:18 +0200 Subject: [PATCH 03/11] Store hasSpecialCaptures in kernel information We are currently only able to skip argument extraction in the case where a lambda has no special captures. We can only detect this while we have the kernel type name, and must carry it through until we call handler::finalize(). Signed-off-by: John Pennycook --- sycl/include/sycl/handler.hpp | 5 +++-- sycl/source/detail/handler_impl.hpp | 1 + sycl/source/handler.cpp | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index d15014323d604..a4f0ff3ee0f62 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -799,7 +799,8 @@ class __SYCL_EXPORT handler { setKernelInfo((void *)MHostKernel->getPtr(), detail::getKernelNumParams(), &(detail::getKernelParamDesc), - detail::isKernelESIMD()); + detail::isKernelESIMD(), + detail::hasSpecialCaptures()); MKernelName = detail::getKernelName(); } else { @@ -3763,7 +3764,7 @@ class __SYCL_EXPORT handler { void setKernelInfo(void *KernelFuncPtr, int KernelNumArgs, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), - bool KernelIsESIMD); + bool KernelIsESIMD, bool KernelHasSpecialCaptures); friend class detail::HandlerAccess; diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 31fc46638bce5..787fede1627ba 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -204,6 +204,7 @@ class handler_impl { int MKernelNumArgs = 0; detail::kernel_param_desc_t (*MKernelParamDescGetter)(int) = nullptr; bool MKernelIsESIMD = false; + bool MKernelHasSpecialCaptures = false; }; } // namespace detail diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index da11263962d54..d1699d4823dcc 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -2266,11 +2266,12 @@ void handler::setNDRangeDescriptorPadded(sycl::range<3> NumWorkItems, void handler::setKernelInfo( void *KernelFuncPtr, int KernelNumArgs, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), - bool KernelIsESIMD) { + bool KernelIsESIMD, bool KernelHasSpecialCaptures) { impl->MKernelFuncPtr = KernelFuncPtr; impl->MKernelNumArgs = KernelNumArgs; impl->MKernelParamDescGetter = KernelParamDescGetter; impl->MKernelIsESIMD = KernelIsESIMD; + impl->MKernelHasSpecialCaptures = KernelHasSpecialCaptures; } void handler::saveCodeLoc(detail::code_location CodeLoc, bool IsTopCodeLoc) { From c2813f006d62af936e13c3bbb13455b421e0a72a Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 11:37:55 +0200 Subject: [PATCH 04/11] Extract kernel arguments directly on the fast path Signed-off-by: John Pennycook --- sycl/source/detail/scheduler/commands.cpp | 50 ++++++++++++++++++----- sycl/source/detail/scheduler/commands.hpp | 5 ++- sycl/source/handler.cpp | 35 +++++++++------- 3 files changed, 64 insertions(+), 26 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0483dd7318a07..04621bb570180 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2385,7 +2385,10 @@ static ur_result_t SetKernelParamsAndLaunch( const std::function &getMemAllocationFunc, bool IsCooperative, bool KernelUsesClusterLaunch, uint32_t WorkGroupMemorySize, const RTDeviceBinaryImage *BinImage, - KernelNameStrRefT KernelName) { + KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr, + int KernelNumArgs = 0, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, + bool KernelHasSpecialCaptures = false) { assert(Queue && "Kernel submissions should have an associated queue"); const AdapterPtr &Adapter = Queue->getAdapter(); @@ -2397,13 +2400,37 @@ static ur_result_t SetKernelParamsAndLaunch( : Empty); } - auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc, - &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { - SetArgBasedOnType(Adapter, Kernel, DeviceImageImpl, getMemAllocationFunc, - Queue->getContextImplPtr(), Arg, NextTrueIndex); - }; - - applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); + if (KernelFuncPtr && !KernelHasSpecialCaptures) { + // TODO: Refactor to avoid SetArgBasedOnType duplication + // TODO: Find a way to use the built-ins instead of variables. + for (int I = 0; I < KernelNumArgs; ++I) { + auto ParamDesc = KernelParamDescGetter(I); + const void *ArgPtr = (const char *)KernelFuncPtr + ParamDesc.offset; + switch (ParamDesc.kind) { + case kernel_param_kind_t::kind_std_layout: { + int Size = ParamDesc.info; + Adapter->call(Kernel, I, Size, nullptr, + ArgPtr); + break; + } + case kernel_param_kind_t::kind_pointer: { + const void *Ptr = *static_cast(ArgPtr); + Adapter->call(Kernel, I, nullptr, + Ptr); + break; + } + default: + throw std::runtime_error("Direct kernel argument copy failed."); + } + } + } else { + auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc, + &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { + SetArgBasedOnType(Adapter, Kernel, DeviceImageImpl, getMemAllocationFunc, + Queue->getContextImplPtr(), Arg, NextTrueIndex); + }; + applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); + } std::optional ImplicitLocalArg = ProgramManager::getInstance().kernelImplicitLocalArgPos(KernelName); @@ -2655,7 +2682,9 @@ void enqueueImpKernel( const std::function &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, const bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, - const RTDeviceBinaryImage *BinImage) { + const RTDeviceBinaryImage *BinImage, void *KernelFuncPtr, int KernelNumArgs, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), + bool KernelHasSpecialCaptures) { assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto &ContextImpl = Queue->getContextImplPtr(); @@ -2739,7 +2768,8 @@ void enqueueImpKernel( Queue, Args, DeviceImageImpl, Kernel, NDRDesc, EventsWaitList, OutEventImpl, EliminatedArgMask, getMemAllocationFunc, KernelIsCooperative, KernelUsesClusterLaunch, WorkGroupMemorySize, - BinImage, KernelName); + BinImage, KernelName, KernelFuncPtr, KernelNumArgs, + KernelParamDescGetter, KernelHasSpecialCaptures); const AdapterPtr &Adapter = Queue->getAdapter(); if (!SyclKernelImpl && !MSyclKernel) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 1acdae47d36ab..b412738736cfd 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -626,7 +626,10 @@ void enqueueImpKernel( const std::function &getMemAllocationFunc, ur_kernel_cache_config_t KernelCacheConfig, bool KernelIsCooperative, const bool KernelUsesClusterLaunch, const size_t WorkGroupMemorySize, - const RTDeviceBinaryImage *BinImage = nullptr); + const RTDeviceBinaryImage *BinImage = nullptr, + void *KernelFuncPtr = nullptr, int KernelNumArgs = 0, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, + bool KernelHasSpecialCaptures = false); /// The exec CG command enqueues execution of kernel or explicit memory /// operation. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index d1699d4823dcc..ec8a445d272b9 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -411,10 +411,18 @@ event handler::finalize() { return MLastEvent; MIsFinalized = true; - // Extract arguments from the kernel lambda. - // TODO: Skip this in simple cases. const auto &type = getType(); - if (type == detail::CGType::Kernel && impl->MKernelFuncPtr) { + const bool KernelFastPath = + (MQueue && !impl->MGraph && !impl->MSubgraphNode && + !MQueue->hasCommandGraph() && !impl->CGData.MRequirements.size() && + !MStreamStorage.size() && + detail::Scheduler::areEventsSafeForSchedulerBypass( + impl->CGData.MEvents, MQueue->getContextImplPtr())); + + // Extract arguments from the kernel lambda, if required. + // Skipping this is currently limited to simple kernels on the fast path. + if (type == detail::CGType::Kernel && impl->MKernelFuncPtr && + !(KernelFastPath && impl->MKernelHasSpecialCaptures)) { clearArgs(); extractArgsAndReqsFromLambda((char *)impl->MKernelFuncPtr, impl->MKernelParamDescGetter, @@ -516,11 +524,7 @@ event handler::finalize() { } } - if (MQueue && !impl->MGraph && !impl->MSubgraphNode && - !MQueue->hasCommandGraph() && !impl->CGData.MRequirements.size() && - !MStreamStorage.size() && - detail::Scheduler::areEventsSafeForSchedulerBypass( - impl->CGData.MEvents, MQueue->getContextImplPtr())) { + if (KernelFastPath) { // if user does not add a new dependency to the dependency graph, i.e. // the graph is not changed, then this faster path is used to submit // kernel bypassing scheduler and avoiding CommandGroup, Command objects @@ -566,13 +570,14 @@ event handler::finalize() { detail::retrieveKernelBinary(MQueue, MKernelName.data()); assert(BinImage && "Failed to obtain a binary image."); } - enqueueImpKernel(MQueue, impl->MNDRDesc, impl->MArgs, - KernelBundleImpPtr, MKernel.get(), MKernelName.data(), - RawEvents, - DiscardEvent ? nullptr : LastEventImpl.get(), nullptr, - impl->MKernelCacheConfig, impl->MKernelIsCooperative, - impl->MKernelUsesClusterLaunch, - impl->MKernelWorkGroupMemorySize, BinImage); + enqueueImpKernel( + MQueue, impl->MNDRDesc, impl->MArgs, KernelBundleImpPtr, + MKernel.get(), MKernelName.data(), RawEvents, + DiscardEvent ? nullptr : LastEventImpl.get(), nullptr, + impl->MKernelCacheConfig, impl->MKernelIsCooperative, + impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, + BinImage, impl->MKernelFuncPtr, impl->MKernelNumArgs, + impl->MKernelParamDescGetter, impl->MKernelHasSpecialCaptures); #ifdef XPTI_ENABLE_INSTRUMENTATION if (xptiEnabled) { // Emit signal only when event is created From bebc2cf15709f311c4245a9abf28a40cde02038f Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 12:17:31 +0200 Subject: [PATCH 05/11] Fix bug in fast path detection logic Signed-off-by: John Pennycook --- sycl/source/handler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index ec8a445d272b9..92099a57df1d9 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -422,7 +422,7 @@ event handler::finalize() { // Extract arguments from the kernel lambda, if required. // Skipping this is currently limited to simple kernels on the fast path. if (type == detail::CGType::Kernel && impl->MKernelFuncPtr && - !(KernelFastPath && impl->MKernelHasSpecialCaptures)) { + (!KernelFastPath || impl->MKernelHasSpecialCaptures)) { clearArgs(); extractArgsAndReqsFromLambda((char *)impl->MKernelFuncPtr, impl->MKernelParamDescGetter, From 3caf6b08e20b8a2cf306bd6d863fc3d88ce48a7b Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 12:34:14 +0200 Subject: [PATCH 06/11] Evaluate hasSpecialCaptures at compile-time This shouldn't be necessary, but in my experiments, the compiler does not optimize the function call away unless it is used in a constexpr if. Signed-off-by: John Pennycook --- sycl/include/sycl/handler.hpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index a4f0ff3ee0f62..799d8b805d861 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -796,11 +796,19 @@ class __SYCL_EXPORT handler { if constexpr (KernelHasName) { // TODO support ESIMD in no-integration-header case too. - setKernelInfo((void *)MHostKernel->getPtr(), - detail::getKernelNumParams(), - &(detail::getKernelParamDesc), - detail::isKernelESIMD(), - detail::hasSpecialCaptures()); + // Force hasSpecialCaptures to be evaluated at compile-time. + // FIXME: This shouldn't be necessary! + if constexpr (detail::hasSpecialCaptures()) { + setKernelInfo((void *)MHostKernel->getPtr(), + detail::getKernelNumParams(), + &(detail::getKernelParamDesc), + detail::isKernelESIMD(), true); + } else { + setKernelInfo((void *)MHostKernel->getPtr(), + detail::getKernelNumParams(), + &(detail::getKernelParamDesc), + detail::isKernelESIMD(), false); + } MKernelName = detail::getKernelName(); } else { From 47472d4f7b04c46b1e701df4e5be717f391f74cd Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 12:55:07 +0200 Subject: [PATCH 07/11] Fix ABI symbol dumps Signed-off-by: John Pennycook --- sycl/test/abi/sycl_symbols_linux.dump | 2 +- sycl/test/abi/sycl_symbols_windows.dump | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 3c935cb4e52c5..7b9f72314a44a 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3523,7 +3523,7 @@ _ZN4sycl3_V17handler12addReductionERKSt10shared_ptrIKvE _ZN4sycl3_V17handler12setArgHelperEiRNS0_3ext6oneapi12experimental6detail30dynamic_work_group_memory_baseE _ZN4sycl3_V17handler12setArgHelperEiRNS0_6detail22work_group_memory_implE _ZN4sycl3_V17handler13getKernelNameEv -_ZN4sycl3_V17handler13setKernelInfoEPviPFNS0_6detail19kernel_param_desc_tEiEb +_ZN4sycl3_V17handler13setKernelInfoEPviPFNS0_6detail19kernel_param_desc_tEiEbb _ZN4sycl3_V17handler14addAccessorReqESt10shared_ptrINS0_6detail16AccessorImplHostEE _ZN4sycl3_V17handler14setNDRangeUsedEb _ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleENS0_5rangeILi3EEERKNS4_16image_descriptorEPvS7_S7_S7_ diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 040c2b32acd16..831c605f3d5e7 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4377,7 +4377,7 @@ ?setHandlerKernelBundle@handler@_V1@sycl@@AEAAXVkernel@23@@Z ?setKernelCacheConfig@handler@_V1@sycl@@AEAAXW4StableKernelCacheConfig@123@@Z ?setKernelClusterLaunch@handler@_V1@sycl@@AEAAXV?$range@$02@23@H@Z -?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N@Z +?setKernelInfo@handler@_V1@sycl@@AEAAXPEAXHP6A?AUkernel_param_desc_t@detail@23@H@Z_N2@Z ?setKernelIsCooperative@handler@_V1@sycl@@AEAAX_N@Z ?setKernelWorkGroupMem@handler@_V1@sycl@@AEAAX_K@Z ?setLocalAccessorArgHelper@handler@_V1@sycl@@AEAAXHAEAVLocalAccessorBaseHost@detail@23@@Z From 015a5a20d965fbff762515039eef6c3b44f939bf Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 14:23:32 +0200 Subject: [PATCH 08/11] Assume a kernel has special captures by default Assuming the alternative could lead to skipping MArgs inconsistently. Signed-off-by: John Pennycook --- sycl/source/detail/handler_impl.hpp | 2 +- sycl/source/detail/scheduler/commands.cpp | 2 +- sycl/source/detail/scheduler/commands.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 787fede1627ba..cbee08ec10a47 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -204,7 +204,7 @@ class handler_impl { int MKernelNumArgs = 0; detail::kernel_param_desc_t (*MKernelParamDescGetter)(int) = nullptr; bool MKernelIsESIMD = false; - bool MKernelHasSpecialCaptures = false; + bool MKernelHasSpecialCaptures = true; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 04621bb570180..d77f4040fb3de 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2388,7 +2388,7 @@ static ur_result_t SetKernelParamsAndLaunch( KernelNameStrRefT KernelName, void *KernelFuncPtr = nullptr, int KernelNumArgs = 0, detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, - bool KernelHasSpecialCaptures = false) { + bool KernelHasSpecialCaptures = true) { assert(Queue && "Kernel submissions should have an associated queue"); const AdapterPtr &Adapter = Queue->getAdapter(); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index b412738736cfd..bfd34210797b2 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -629,7 +629,7 @@ void enqueueImpKernel( const RTDeviceBinaryImage *BinImage = nullptr, void *KernelFuncPtr = nullptr, int KernelNumArgs = 0, detail::kernel_param_desc_t (*KernelParamDescGetter)(int) = nullptr, - bool KernelHasSpecialCaptures = false); + bool KernelHasSpecialCaptures = true); /// The exec CG command enqueues execution of kernel or explicit memory /// operation. From a58c502ec2456ea613dae67620a2a915ff5d6560 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 15:36:12 +0200 Subject: [PATCH 09/11] Support elimination mask on the fast path Kernels without special captures might still have an elimination mask. Signed-off-by: John Pennycook --- sycl/source/detail/scheduler/commands.cpp | 19 ++++++++++--------- sycl/source/detail/scheduler/commands.hpp | 22 ++++++++++++++++++++++ 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d77f4040fb3de..995fc4dcc950b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2401,28 +2401,29 @@ static ur_result_t SetKernelParamsAndLaunch( } if (KernelFuncPtr && !KernelHasSpecialCaptures) { - // TODO: Refactor to avoid SetArgBasedOnType duplication - // TODO: Find a way to use the built-ins instead of variables. - for (int I = 0; I < KernelNumArgs; ++I) { - auto ParamDesc = KernelParamDescGetter(I); + auto setFunc = [&Adapter, Kernel, + KernelFuncPtr](const detail::kernel_param_desc_t &ParamDesc, + size_t NextTrueIndex) { const void *ArgPtr = (const char *)KernelFuncPtr + ParamDesc.offset; switch (ParamDesc.kind) { case kernel_param_kind_t::kind_std_layout: { int Size = ParamDesc.info; - Adapter->call(Kernel, I, Size, nullptr, - ArgPtr); + Adapter->call(Kernel, NextTrueIndex, + Size, nullptr, ArgPtr); break; } case kernel_param_kind_t::kind_pointer: { const void *Ptr = *static_cast(ArgPtr); - Adapter->call(Kernel, I, nullptr, - Ptr); + Adapter->call(Kernel, NextTrueIndex, + nullptr, Ptr); break; } default: throw std::runtime_error("Direct kernel argument copy failed."); } - } + }; + applyFuncOnFilteredArgs(EliminatedArgMask, KernelFuncPtr, KernelNumArgs, + KernelParamDescGetter, setFunc); } else { auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index bfd34210797b2..e1718cb76cacf 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -783,6 +783,28 @@ void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, } } +template +void applyFuncOnFilteredArgs( + const KernelArgMask *EliminatedArgMask, void *KernelFuncPtr, + int KernelNumArgs, + detail::kernel_param_desc_t (*KernelParamDescGetter)(int), FuncT Func) { + if (!EliminatedArgMask || EliminatedArgMask->size() == 0) { + for (int I = 0; I < KernelNumArgs; ++I) { + const detail::kernel_param_desc_t &Param = KernelParamDescGetter(I); + Func(Param, I); + } + } else { + size_t NextTrueIndex = 0; + for (int I = 0; I < KernelNumArgs; ++I) { + const detail::kernel_param_desc_t &Param = KernelParamDescGetter(I); + if ((*EliminatedArgMask)[I]) + continue; + Func(Param, NextTrueIndex); + ++NextTrueIndex; + } + } +} + void ReverseRangeDimensionsForKernel(NDRDescT &NDR); } // namespace detail From b07c7d1e1a5972dc719a98a2a7b5c053e665e4d8 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Tue, 13 May 2025 15:51:07 +0200 Subject: [PATCH 10/11] Remove unused parameter to silence warning Signed-off-by: John Pennycook --- sycl/source/detail/scheduler/commands.cpp | 2 +- sycl/source/detail/scheduler/commands.hpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 995fc4dcc950b..b95984d88ed7b 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2422,7 +2422,7 @@ static ur_result_t SetKernelParamsAndLaunch( throw std::runtime_error("Direct kernel argument copy failed."); } }; - applyFuncOnFilteredArgs(EliminatedArgMask, KernelFuncPtr, KernelNumArgs, + applyFuncOnFilteredArgs(EliminatedArgMask, KernelNumArgs, KernelParamDescGetter, setFunc); } else { auto setFunc = [&Adapter, Kernel, &DeviceImageImpl, &getMemAllocationFunc, diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index e1718cb76cacf..0fede3287e154 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -785,8 +785,7 @@ void applyFuncOnFilteredArgs(const KernelArgMask *EliminatedArgMask, template void applyFuncOnFilteredArgs( - const KernelArgMask *EliminatedArgMask, void *KernelFuncPtr, - int KernelNumArgs, + const KernelArgMask *EliminatedArgMask, int KernelNumArgs, detail::kernel_param_desc_t (*KernelParamDescGetter)(int), FuncT Func) { if (!EliminatedArgMask || EliminatedArgMask->size() == 0) { for (int I = 0; I < KernelNumArgs; ++I) { From 54b4621323c0ef12a1a15431213ca5e0dec2a8a5 Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 14 May 2025 16:20:28 +0200 Subject: [PATCH 11/11] Replace constexpr branch with constexpr variable --- sycl/include/sycl/handler.hpp | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 799d8b805d861..2147e8f77b422 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -797,18 +797,11 @@ class __SYCL_EXPORT handler { // TODO support ESIMD in no-integration-header case too. // Force hasSpecialCaptures to be evaluated at compile-time. - // FIXME: This shouldn't be necessary! - if constexpr (detail::hasSpecialCaptures()) { - setKernelInfo((void *)MHostKernel->getPtr(), - detail::getKernelNumParams(), - &(detail::getKernelParamDesc), - detail::isKernelESIMD(), true); - } else { - setKernelInfo((void *)MHostKernel->getPtr(), - detail::getKernelNumParams(), - &(detail::getKernelParamDesc), - detail::isKernelESIMD(), false); - } + constexpr bool HasSpecialCapt = detail::hasSpecialCaptures(); + setKernelInfo((void *)MHostKernel->getPtr(), + detail::getKernelNumParams(), + &(detail::getKernelParamDesc), + detail::isKernelESIMD(), HasSpecialCapt); MKernelName = detail::getKernelName(); } else {