Skip to content

Commit df73b49

Browse files
committed
Merge remote-tracking branch 'upstream/sycl' into urlog
2 parents 88f149d + f21034e commit df73b49

31 files changed

+380
-110
lines changed

sycl/source/detail/context_impl.cpp

+13-4
Original file line numberDiff line numberDiff line change
@@ -338,16 +338,23 @@ void context_impl::removeAssociatedDeviceGlobal(const void *DeviceGlobalPtr) {
338338
void context_impl::addDeviceGlobalInitializer(
339339
ur_program_handle_t Program, const std::vector<device> &Devs,
340340
const RTDeviceBinaryImage *BinImage) {
341+
if (BinImage->getDeviceGlobals().empty())
342+
return;
341343
std::lock_guard<std::mutex> Lock(MDeviceGlobalInitializersMutex);
342344
for (const device &Dev : Devs) {
343345
auto Key = std::make_pair(Program, getSyclObjImpl(Dev)->getHandleRef());
344-
MDeviceGlobalInitializers.emplace(Key, BinImage);
346+
auto [Iter, Inserted] = MDeviceGlobalInitializers.emplace(Key, BinImage);
347+
if (Inserted && !Iter->second.MDeviceGlobalsFullyInitialized)
348+
++MDeviceGlobalNotInitializedCnt;
345349
}
346350
}
347351

348352
std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
349353
ur_program_handle_t NativePrg,
350354
const std::shared_ptr<queue_impl> &QueueImpl) {
355+
if (!MDeviceGlobalNotInitializedCnt.load(std::memory_order_acquire))
356+
return {};
357+
351358
const AdapterPtr &Adapter = getAdapter();
352359
const DeviceImplPtr &DeviceImpl = QueueImpl->getDeviceImplPtr();
353360
std::lock_guard<std::mutex> NativeProgramLock(MDeviceGlobalInitializersMutex);
@@ -369,16 +376,17 @@ std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
369376
[&Adapter](const ur_event_handle_t &Event) {
370377
return get_event_info<info::event::command_execution_status>(
371378
Event, Adapter) == info::event_command_status::complete;
372-
return false;
373379
});
374380
// Release the removed events.
375381
for (auto EventIt = NewEnd; EventIt != InitEventsRef.end(); ++EventIt)
376382
Adapter->call<UrApiKind::urEventRelease>(*EventIt);
377383
// Remove them from the collection.
378384
InitEventsRef.erase(NewEnd, InitEventsRef.end());
379385
// If there are no more events, we can mark it as fully initialized.
380-
if (InitEventsRef.empty())
386+
if (InitEventsRef.empty()) {
381387
InitRef.MDeviceGlobalsFullyInitialized = true;
388+
--MDeviceGlobalNotInitializedCnt;
389+
}
382390
return InitEventsRef;
383391
} else if (InitRef.MDeviceGlobalsFullyInitialized) {
384392
// MDeviceGlobalsFullyInitialized could have been set while we were
@@ -387,7 +395,7 @@ std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
387395
}
388396

389397
// There were no events and it was not set as fully initialized, so this is
390-
// responsible for intializing the device globals.
398+
// responsible for initializing the device globals.
391399
auto DeviceGlobals = InitRef.MBinImage->getDeviceGlobals();
392400
std::vector<std::string> DeviceGlobalIds;
393401
DeviceGlobalIds.reserve(DeviceGlobals.size());
@@ -402,6 +410,7 @@ std::vector<ur_event_handle_t> context_impl::initializeDeviceGlobals(
402410
// globals are trivially fully initialized and we can end early.
403411
if (DeviceGlobalEntries.empty()) {
404412
InitRef.MDeviceGlobalsFullyInitialized = true;
413+
--MDeviceGlobalNotInitializedCnt;
405414
return {};
406415
}
407416

sycl/source/detail/context_impl.hpp

+13-2
Original file line numberDiff line numberDiff line change
@@ -307,10 +307,21 @@ class context_impl {
307307
std::vector<ur_event_handle_t> MDeviceGlobalInitEvents;
308308
};
309309

310-
std::map<std::pair<ur_program_handle_t, ur_device_handle_t>,
311-
DeviceGlobalInitializer>
310+
using HandleDevicePair = std::pair<ur_program_handle_t, ur_device_handle_t>;
311+
312+
struct HandleDevicePairHash {
313+
std::size_t operator()(const HandleDevicePair &Key) const {
314+
return std::hash<ur_program_handle_t>{}(Key.first) ^
315+
std::hash<ur_device_handle_t>{}(Key.second);
316+
}
317+
};
318+
319+
std::unordered_map<HandleDevicePair, DeviceGlobalInitializer,
320+
HandleDevicePairHash>
312321
MDeviceGlobalInitializers;
313322
std::mutex MDeviceGlobalInitializersMutex;
323+
// The number of device globals that have not been initialized yet.
324+
std::atomic<size_t> MDeviceGlobalNotInitializedCnt = 0;
314325

315326
// For device_global variables that are not used in any kernel code we still
316327
// allow copy operations on them. MDeviceGlobalUnregisteredData stores the

sycl/source/detail/graph_impl.cpp

+8-8
Original file line numberDiff line numberDiff line change
@@ -1474,18 +1474,18 @@ void exec_graph_impl::populateURKernelUpdateStructs(
14741474
ur_kernel_handle_t UrKernel = nullptr;
14751475
auto Kernel = ExecCG.MSyclKernel;
14761476
auto KernelBundleImplPtr = ExecCG.MKernelBundle;
1477-
std::shared_ptr<sycl::detail::kernel_impl> SyclKernelImpl = nullptr;
14781477
const sycl::detail::KernelArgMask *EliminatedArgMask = nullptr;
14791478

1480-
if (auto SyclKernelImpl = KernelBundleImplPtr
1481-
? KernelBundleImplPtr->tryGetKernel(
1482-
ExecCG.MKernelName, KernelBundleImplPtr)
1483-
: std::shared_ptr<kernel_impl>{nullptr}) {
1484-
UrKernel = SyclKernelImpl->getHandleRef();
1485-
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
1486-
} else if (Kernel != nullptr) {
1479+
if (Kernel != nullptr) {
14871480
UrKernel = Kernel->getHandleRef();
14881481
EliminatedArgMask = Kernel->getKernelArgMask();
1482+
} else if (auto SyclKernelImpl =
1483+
KernelBundleImplPtr
1484+
? KernelBundleImplPtr->tryGetKernel(ExecCG.MKernelName,
1485+
KernelBundleImplPtr)
1486+
: std::shared_ptr<kernel_impl>{nullptr}) {
1487+
UrKernel = SyclKernelImpl->getHandleRef();
1488+
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
14891489
} else {
14901490
ur_program_handle_t UrProgram = nullptr;
14911491
std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) =

sycl/source/detail/helpers.cpp

+7-7
Original file line numberDiff line numberDiff line change
@@ -72,16 +72,16 @@ retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName,
7272
const RTDeviceBinaryImage *DeviceImage = nullptr;
7373
ur_program_handle_t Program = nullptr;
7474
auto KernelBundleImpl = KernelCG->getKernelBundle();
75-
if (auto SyclKernelImpl =
76-
KernelBundleImpl
77-
? KernelBundleImpl->tryGetKernel(KernelName, KernelBundleImpl)
78-
: std::shared_ptr<kernel_impl>{nullptr}) {
75+
if (KernelCG->MSyclKernel != nullptr) {
76+
DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
77+
Program = KernelCG->MSyclKernel->getDeviceImage()->get_ur_program_ref();
78+
} else if (auto SyclKernelImpl =
79+
KernelBundleImpl ? KernelBundleImpl->tryGetKernel(
80+
KernelName, KernelBundleImpl)
81+
: std::shared_ptr<kernel_impl>{nullptr}) {
7982
// Retrieve the device image from the kernel bundle.
8083
DeviceImage = SyclKernelImpl->getDeviceImage()->get_bin_image_ref();
8184
Program = SyclKernelImpl->getDeviceImage()->get_ur_program_ref();
82-
} else if (KernelCG->MSyclKernel != nullptr) {
83-
DeviceImage = KernelCG->MSyclKernel->getDeviceImage()->get_bin_image_ref();
84-
Program = KernelCG->MSyclKernel->getDeviceImage()->get_ur_program_ref();
8585
} else {
8686
auto ContextImpl = Queue->getContextImplPtr();
8787
auto DeviceImpl = Queue->getDeviceImplPtr();

sycl/source/detail/program_manager/program_manager.cpp

+39-29
Original file line numberDiff line numberDiff line change
@@ -765,35 +765,42 @@ ProgramManager::collectDependentDeviceImagesForVirtualFunctions(
765765
}
766766
}
767767

768-
while (!WorkList.empty()) {
769-
std::string SetName = WorkList.front();
770-
WorkList.pop();
768+
if (!WorkList.empty()) {
769+
// Guard read access to m_VFSet2BinImage:
770+
// TODO: a better solution should be sought in the future, i.e. a different
771+
// mutex than m_KernelIDsMutex, check lock check pattern, etc.
772+
std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
771773

772-
// There could be more than one device image that uses the same set
773-
// of virtual functions, or provides virtual funtions from the same
774-
// set.
775-
for (RTDeviceBinaryImage *BinImage : m_VFSet2BinImage[SetName]) {
776-
// Here we can encounter both uses-virtual-functions-set and
777-
// virtual-functions-set properties, but their handling is the same: we
778-
// just grab all sets they reference and add them for consideration if
779-
// we haven't done so already.
780-
for (const sycl_device_binary_property &VFProp :
781-
BinImage->getVirtualFunctions()) {
782-
std::string StrValue = DeviceBinaryProperty(VFProp).asCString();
783-
for (const auto &SetName : detail::split_string(StrValue, ',')) {
784-
if (HandledSets.insert(SetName).second)
785-
WorkList.push(SetName);
774+
while (!WorkList.empty()) {
775+
std::string SetName = WorkList.front();
776+
WorkList.pop();
777+
778+
// There could be more than one device image that uses the same set
779+
// of virtual functions, or provides virtual funtions from the same
780+
// set.
781+
for (RTDeviceBinaryImage *BinImage : m_VFSet2BinImage.at(SetName)) {
782+
// Here we can encounter both uses-virtual-functions-set and
783+
// virtual-functions-set properties, but their handling is the same: we
784+
// just grab all sets they reference and add them for consideration if
785+
// we haven't done so already.
786+
for (const sycl_device_binary_property &VFProp :
787+
BinImage->getVirtualFunctions()) {
788+
std::string StrValue = DeviceBinaryProperty(VFProp).asCString();
789+
for (const auto &SetName : detail::split_string(StrValue, ',')) {
790+
if (HandledSets.insert(SetName).second)
791+
WorkList.push(SetName);
792+
}
786793
}
787-
}
788794

789-
// TODO: Complete this part about handling of incompatible device images.
790-
// If device image uses the same virtual function set, then we only
791-
// link it if it is compatible.
792-
// However, if device image provides virtual function set and it is
793-
// incompatible, then we should link its "dummy" version to avoid link
794-
// errors about unresolved external symbols.
795-
if (doesDevSupportDeviceRequirements(Dev, *BinImage))
796-
DeviceImagesToLink.insert(BinImage);
795+
// TODO: Complete this part about handling of incompatible device
796+
// images. If device image uses the same virtual function set, then we
797+
// only link it if it is compatible. However, if device image provides
798+
// virtual function set and it is incompatible, then we should link its
799+
// "dummy" version to avoid link errors about unresolved external
800+
// symbols.
801+
if (doesDevSupportDeviceRequirements(Dev, *BinImage))
802+
DeviceImagesToLink.insert(BinImage);
803+
}
797804
}
798805
}
799806

@@ -2163,8 +2170,14 @@ void ProgramManager::addImages(sycl_device_binaries DeviceBinary) {
21632170
}
21642171

21652172
void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) {
2173+
if (DeviceBinary->NumDeviceBinaries == 0)
2174+
return;
2175+
// Acquire lock to read and modify maps for kernel bundles
2176+
std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
2177+
21662178
for (int I = 0; I < DeviceBinary->NumDeviceBinaries; I++) {
21672179
sycl_device_binary RawImg = &(DeviceBinary->DeviceBinaries[I]);
2180+
21682181
auto DevImgIt = m_DeviceImages.find(RawImg);
21692182
if (DevImgIt == m_DeviceImages.end())
21702183
continue;
@@ -2178,9 +2191,6 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) {
21782191
// Drop the kernel argument mask map
21792192
m_EliminatedKernelArgMasks.erase(Img);
21802193

2181-
// Acquire lock to modify maps for kernel bundles
2182-
std::lock_guard<std::mutex> KernelIDsGuard(m_KernelIDsMutex);
2183-
21842194
// Unmap the unique kernel IDs for the offload entries
21852195
for (sycl_offload_entry EntriesIt = EntriesB; EntriesIt != EntriesE;
21862196
EntriesIt = EntriesIt->Increment()) {

sycl/source/detail/program_manager/program_manager.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,7 @@ class ProgramManager {
459459

460460
/// Keeps all device images we are refering to during program lifetime. Used
461461
/// for proper cleanup.
462+
/// Access must be guarded by the m_KernelIDsMutex mutex.
462463
std::unordered_map<sycl_device_binary, RTDeviceBinaryImageUPtr>
463464
m_DeviceImages;
464465

@@ -468,6 +469,7 @@ class ProgramManager {
468469

469470
/// Caches list of device images that use or provide virtual functions from
470471
/// the same set. Used to simplify access.
472+
/// Access must be guarded by the m_KernelIDsMutex mutex.
471473
std::unordered_map<std::string, std::set<RTDeviceBinaryImage *>>
472474
m_VFSet2BinImage;
473475

sycl/source/detail/scheduler/commands.cpp

+45-27
Original file line numberDiff line numberDiff line change
@@ -1997,16 +1997,16 @@ void instrumentationAddExtraKernelMetadata(
19971997
std::mutex *KernelMutex = nullptr;
19981998
const KernelArgMask *EliminatedArgMask = nullptr;
19991999

2000-
if (auto SyclKernelImpl = KernelBundleImplPtr
2001-
? KernelBundleImplPtr->tryGetKernel(
2002-
KernelName, KernelBundleImplPtr)
2003-
: std::shared_ptr<kernel_impl>{nullptr}) {
2004-
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2005-
Program = SyclKernelImpl->getDeviceImage()->get_ur_program_ref();
2006-
} else if (nullptr != SyclKernel) {
2000+
if (nullptr != SyclKernel) {
20072001
Program = SyclKernel->getProgramRef();
20082002
if (!SyclKernel->isCreatedFromSource())
20092003
EliminatedArgMask = SyclKernel->getKernelArgMask();
2004+
} else if (auto SyclKernelImpl =
2005+
KernelBundleImplPtr ? KernelBundleImplPtr->tryGetKernel(
2006+
KernelName, KernelBundleImplPtr)
2007+
: std::shared_ptr<kernel_impl>{nullptr}) {
2008+
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2009+
Program = SyclKernelImpl->getDeviceImage()->get_ur_program_ref();
20102010
} else if (Queue) {
20112011
// NOTE: Queue can be null when kernel is directly enqueued to a command
20122012
// buffer
@@ -2521,17 +2521,17 @@ getCGKernelInfo(const CGExecKernel &CommandGroup, ContextImplPtr ContextImpl,
25212521
const KernelArgMask *EliminatedArgMask = nullptr;
25222522
auto &KernelBundleImplPtr = CommandGroup.MKernelBundle;
25232523

2524-
if (auto SyclKernelImpl =
2525-
KernelBundleImplPtr
2526-
? KernelBundleImplPtr->tryGetKernel(CommandGroup.MKernelName,
2527-
KernelBundleImplPtr)
2528-
: std::shared_ptr<kernel_impl>{nullptr}) {
2524+
if (auto Kernel = CommandGroup.MSyclKernel; Kernel != nullptr) {
2525+
UrKernel = Kernel->getHandleRef();
2526+
EliminatedArgMask = Kernel->getKernelArgMask();
2527+
} else if (auto SyclKernelImpl =
2528+
KernelBundleImplPtr
2529+
? KernelBundleImplPtr->tryGetKernel(
2530+
CommandGroup.MKernelName, KernelBundleImplPtr)
2531+
: std::shared_ptr<kernel_impl>{nullptr}) {
25292532
UrKernel = SyclKernelImpl->getHandleRef();
25302533
DeviceImageImpl = SyclKernelImpl->getDeviceImage();
25312534
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2532-
} else if (auto Kernel = CommandGroup.MSyclKernel; Kernel != nullptr) {
2533-
UrKernel = Kernel->getHandleRef();
2534-
EliminatedArgMask = Kernel->getKernelArgMask();
25352535
} else {
25362536
ur_program_handle_t UrProgram = nullptr;
25372537
std::tie(UrKernel, std::ignore, EliminatedArgMask, UrProgram) =
@@ -2678,18 +2678,7 @@ void enqueueImpKernel(
26782678
std::shared_ptr<kernel_impl> SyclKernelImpl;
26792679
std::shared_ptr<device_image_impl> DeviceImageImpl;
26802680

2681-
if ((SyclKernelImpl = KernelBundleImplPtr
2682-
? KernelBundleImplPtr->tryGetKernel(
2683-
KernelName, KernelBundleImplPtr)
2684-
: std::shared_ptr<kernel_impl>{nullptr})) {
2685-
Kernel = SyclKernelImpl->getHandleRef();
2686-
DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2687-
2688-
Program = DeviceImageImpl->get_ur_program_ref();
2689-
2690-
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2691-
KernelMutex = SyclKernelImpl->getCacheMutex();
2692-
} else if (nullptr != MSyclKernel) {
2681+
if (nullptr != MSyclKernel) {
26932682
assert(MSyclKernel->get_info<info::kernel::context>() ==
26942683
Queue->get_context());
26952684
Kernel = MSyclKernel->getHandleRef();
@@ -2703,6 +2692,17 @@ void enqueueImpKernel(
27032692
// their duplication in such cases.
27042693
KernelMutex = &MSyclKernel->getNoncacheableEnqueueMutex();
27052694
EliminatedArgMask = MSyclKernel->getKernelArgMask();
2695+
} else if ((SyclKernelImpl = KernelBundleImplPtr
2696+
? KernelBundleImplPtr->tryGetKernel(
2697+
KernelName, KernelBundleImplPtr)
2698+
: std::shared_ptr<kernel_impl>{nullptr})) {
2699+
Kernel = SyclKernelImpl->getHandleRef();
2700+
DeviceImageImpl = SyclKernelImpl->getDeviceImage();
2701+
2702+
Program = DeviceImageImpl->get_ur_program_ref();
2703+
2704+
EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
2705+
KernelMutex = SyclKernelImpl->getCacheMutex();
27062706
} else {
27072707
std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) =
27082708
detail::ProgramManager::getInstance().getOrCreateKernel(
@@ -3511,6 +3511,18 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
35113511
const AdapterPtr &Adapter = MQueue->getAdapter();
35123512
if (MEvent != nullptr)
35133513
MEvent->setHostEnqueueTime();
3514+
// User can specify explicit dependencies via depends_on call that we should
3515+
// honor here. It is very important for cross queue dependencies. We wait
3516+
// them explicitly since barrier w/o wait list waits for all commands
3517+
// submitted before and we can't add new dependencies to its wait list.
3518+
// Output event for wait operation is not requested since barrier is
3519+
// submitted immediately after and should synchronize it internally.
3520+
if (RawEvents.size()) {
3521+
auto Result = Adapter->call_nocheck<UrApiKind::urEnqueueEventsWait>(
3522+
MQueue->getHandleRef(), RawEvents.size(), &RawEvents[0], nullptr);
3523+
if (Result != UR_RESULT_SUCCESS)
3524+
return Result;
3525+
}
35143526
if (auto Result =
35153527
Adapter->call_nocheck<UrApiKind::urEnqueueEventsWaitWithBarrierExt>(
35163528
MQueue->getHandleRef(), &Properties, 0, nullptr, Event);
@@ -3545,6 +3557,12 @@ ur_result_t ExecCGCommand::enqueueImpQueue() {
35453557
const AdapterPtr &Adapter = MQueue->getAdapter();
35463558
if (MEvent != nullptr)
35473559
MEvent->setHostEnqueueTime();
3560+
// User can specify explicit dependencies via depends_on call that we should
3561+
// honor here. It is very important for cross queue dependencies. Adding
3562+
// them to the barrier wait list since barrier w/ wait list waits only for
3563+
// the events provided in wait list and we can just extend the list.
3564+
UrEvents.insert(UrEvents.end(), RawEvents.begin(), RawEvents.end());
3565+
35483566
if (auto Result =
35493567
Adapter->call_nocheck<UrApiKind::urEnqueueEventsWaitWithBarrierExt>(
35503568
MQueue->getHandleRef(), &Properties, UrEvents.size(),

sycl/test-e2e/Graph/RecordReplay/barrier_multi_graph.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
// Extra run to check for immediate-command-list in Level Zero
66
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
77
//
8+
// UNSUPPORTED: opencl
9+
// UNSUPPORTED-INTENDED: USM memcpy command not supported for OpenCL
810

911
#include "../graph_common.hpp"
1012

sycl/test-e2e/Graph/RecordReplay/barrier_multi_queue.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
// Extra run to check for immediate-command-list in Level Zero
66
// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
77
//
8+
// UNSUPPORTED: opencl
9+
// UNSUPPORTED-INTENDED: USM memcpy command not supported for OpenCL
810

911
#include "../graph_common.hpp"
1012

sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77

88
// Tests the enqueue free function kernel shortcuts.
99

10+
// UNSUPPORTED: opencl
11+
// UNSUPPORTED-INTENDED: USM memcpy command not supported for OpenCL
12+
1013
#include "../graph_common.hpp"
1114
#include <sycl/ext/oneapi/experimental/enqueue_functions.hpp>
1215
#include <sycl/properties/all_properties.hpp>

0 commit comments

Comments
 (0)