Skip to content

Commit a0342b3

Browse files
authored
[SYCL] Revert USM caching to pre-2021.3 state and disable Shared USM … (#4799)
This change a) enhances the env var controls for USM caching, b) sets caching parameters to match the previous implementation and c) disables Shared memory chunking to fix a data race issue.
1 parent 55be63a commit a0342b3

File tree

5 files changed

+631
-151
lines changed

5 files changed

+631
-151
lines changed

sycl/doc/EnvironmentVariables.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ variables in production code.</span>
137137
| Environment variable | Values | Description |
138138
| -------------------- | ------ | ----------- |
139139
| `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availability and workload demand. |
140-
| `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | MaxPoolableSize,Capacity,MaxPoolSize | Values specified as positive integers. Defaults are 1, 4, 256. MaxPoolableSize is the maximum allocation size in MB that may be pooled. Capacity is the number of allocations in each size range that are freed by the program but retained in the pool for reallocation. Size ranges follow this pattern: 32, 48, 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. MaxPoolSize is the maximum size of the pool in MB. |
140+
| `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | EnableBuffers, MaxPoolSize [, MemType, MaxPoolableSize, Capacity, SlabMinSize]... | EnableBuffers enables pooling for SYCL buffers, default false. MaxPoolSize is the maximum size of the pool, default 0. MemType is host, device or shared. Other parameters are values specified as positive integers with optional K, M or G suffix. MaxPoolableSize is the maximum allocation size that may be pooled, default 0 for host and shared, 32KB for device. Capacity is the number of allocations in each size range freed by the program but retained in the pool for reallocation, default 0. Size ranges follow this pattern: 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. SlabMinSize is the minimum allocation size, 64KB for host and device, 2MB for shared. |
141141
| `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
142142
| `SYCL_PI_LEVEL_ZERO_FILTER_EVENT_WAIT_LIST` | Integer | When set to 0, disables filtering of signaled events from wait lists when using the Level Zero backend. The default is 1. |
143143
| `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 1. |

sycl/plugins/level_zero/pi_level_zero.cpp

+139-24
Original file line numberDiff line numberDiff line change
@@ -3032,6 +3032,74 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
30323032
return PI_SUCCESS;
30333033
}
30343034

3035+
// If indirect access tracking is enabled then performs reference counting,
3036+
// otherwise just calls zeMemAllocDevice.
3037+
static pi_result ZeDeviceMemAllocHelper(void **ResultPtr, pi_context Context,
3038+
pi_device Device, size_t Size) {
3039+
pi_platform Plt = Device->Platform;
3040+
std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
3041+
std::defer_lock);
3042+
if (IndirectAccessTrackingEnabled) {
3043+
// Lock the mutex which is guarding contexts container in the platform.
3044+
// This prevents new kernels from being submitted in any context while
3045+
// we are in the process of allocating a memory, this is needed to
3046+
// properly capture allocations by kernels with indirect access.
3047+
ContextsLock.lock();
3048+
// We are going to defer memory release if there are kernels with
3049+
// indirect access, that is why explicitly retain context to be sure
3050+
// that it is released after all memory allocations in this context are
3051+
// released.
3052+
PI_CALL(piContextRetain(Context));
3053+
}
3054+
3055+
ze_device_mem_alloc_desc_t ZeDesc = {};
3056+
ZeDesc.flags = 0;
3057+
ZeDesc.ordinal = 0;
3058+
ZE_CALL(zeMemAllocDevice,
3059+
(Context->ZeContext, &ZeDesc, Size, 1, Device->ZeDevice, ResultPtr));
3060+
3061+
if (IndirectAccessTrackingEnabled) {
3062+
// Keep track of all memory allocations in the context
3063+
Context->MemAllocs.emplace(std::piecewise_construct,
3064+
std::forward_as_tuple(*ResultPtr),
3065+
std::forward_as_tuple(Context));
3066+
}
3067+
return PI_SUCCESS;
3068+
}
3069+
3070+
// If indirect access tracking is enabled then performs reference counting,
3071+
// otherwise just calls zeMemAllocHost.
3072+
static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context,
3073+
size_t Size) {
3074+
pi_platform Plt = Context->Devices[0]->Platform;
3075+
std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
3076+
std::defer_lock);
3077+
if (IndirectAccessTrackingEnabled) {
3078+
// Lock the mutex which is guarding contexts container in the platform.
3079+
// This prevents new kernels from being submitted in any context while
3080+
// we are in the process of allocating a memory, this is needed to
3081+
// properly capture allocations by kernels with indirect access.
3082+
ContextsLock.lock();
3083+
// We are going to defer memory release if there are kernels with
3084+
// indirect access, that is why explicitly retain context to be sure
3085+
// that it is released after all memory allocations in this context are
3086+
// released.
3087+
PI_CALL(piContextRetain(Context));
3088+
}
3089+
3090+
ze_host_mem_alloc_desc_t ZeDesc = {};
3091+
ZeDesc.flags = 0;
3092+
ZE_CALL(zeMemAllocHost, (Context->ZeContext, &ZeDesc, Size, 1, ResultPtr));
3093+
3094+
if (IndirectAccessTrackingEnabled) {
3095+
// Keep track of all memory allocations in the context
3096+
Context->MemAllocs.emplace(std::piecewise_construct,
3097+
std::forward_as_tuple(*ResultPtr),
3098+
std::forward_as_tuple(Context));
3099+
}
3100+
return PI_SUCCESS;
3101+
}
3102+
30353103
pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
30363104
void *HostPtr, pi_mem *RetMem,
30373105
const pi_mem_properties *properties) {
@@ -3091,23 +3159,35 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
30913159

30923160
pi_result Result;
30933161
if (DeviceIsIntegrated) {
3094-
Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
3162+
if (enableBufferPooling())
3163+
Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
3164+
else {
3165+
ZeHostMemAllocHelper(&Ptr, Context, Size);
3166+
}
30953167
} else if (Context->SingleRootDevice) {
30963168
// If we have a single discrete device or all devices in the context are
30973169
// sub-devices of the same device then we can allocate on device
3098-
Result = piextUSMDeviceAlloc(&Ptr, Context, Context->SingleRootDevice,
3099-
nullptr, Size, Alignment);
3170+
if (enableBufferPooling())
3171+
Result = piextUSMDeviceAlloc(&Ptr, Context, Context->SingleRootDevice,
3172+
nullptr, Size, Alignment);
3173+
else {
3174+
ZeDeviceMemAllocHelper(&Ptr, Context, Context->SingleRootDevice, Size);
3175+
}
31003176
} else {
31013177
// Context with several gpu cards. Temporarily use host allocation because
31023178
// it is accessible by all devices. But it is not good in terms of
31033179
// performance.
31043180
// TODO: We need to either allow remote access to device memory using IPC,
31053181
// or do explicit memory transfers from one device to another using host
31063182
// resources as backing buffers to allow those transfers.
3107-
Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
3183+
if (enableBufferPooling())
3184+
Result = piextUSMHostAlloc(&Ptr, Context, nullptr, Size, Alignment);
3185+
else {
3186+
ZeHostMemAllocHelper(&Ptr, Context, Size);
3187+
}
31083188
}
31093189

3110-
if (Result != PI_SUCCESS)
3190+
if (enableBufferPooling() && Result != PI_SUCCESS)
31113191
return Result;
31123192

31133193
if (HostPtr) {
@@ -3170,6 +3250,37 @@ pi_result piMemRetain(pi_mem Mem) {
31703250
return PI_SUCCESS;
31713251
}
31723252

3253+
// If indirect access tracking is not enabled then this functions just performs
3254+
// zeMemFree. If indirect access tracking is enabled then reference counting is
3255+
// performed.
3256+
static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr) {
3257+
pi_platform Plt = Context->Devices[0]->Platform;
3258+
std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
3259+
std::defer_lock);
3260+
if (IndirectAccessTrackingEnabled) {
3261+
ContextsLock.lock();
3262+
auto It = Context->MemAllocs.find(Ptr);
3263+
if (It == std::end(Context->MemAllocs)) {
3264+
die("All memory allocations must be tracked!");
3265+
}
3266+
if (--(It->second.RefCount) != 0) {
3267+
// Memory can't be deallocated yet.
3268+
return PI_SUCCESS;
3269+
}
3270+
3271+
// Reference count is zero, it is ok to free memory.
3272+
// We don't need to track this allocation anymore.
3273+
Context->MemAllocs.erase(It);
3274+
}
3275+
3276+
ZE_CALL(zeMemFree, (Context->ZeContext, Ptr));
3277+
3278+
if (IndirectAccessTrackingEnabled)
3279+
PI_CALL(ContextReleaseHelper(Context));
3280+
3281+
return PI_SUCCESS;
3282+
}
3283+
31733284
pi_result piMemRelease(pi_mem Mem) {
31743285
PI_ASSERT(Mem, PI_INVALID_MEM_OBJECT);
31753286

@@ -3179,7 +3290,11 @@ pi_result piMemRelease(pi_mem Mem) {
31793290
} else {
31803291
auto Buf = static_cast<_pi_buffer *>(Mem);
31813292
if (!Buf->isSubBuffer()) {
3182-
PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
3293+
if (enableBufferPooling()) {
3294+
PI_CALL(piextUSMFree(Mem->Context, Mem->getZeHandle()));
3295+
} else {
3296+
ZeMemFreeHelper(Mem->Context, Mem->getZeHandle());
3297+
}
31833298
}
31843299
}
31853300
delete Mem;
@@ -4998,13 +5113,7 @@ static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) {
49985113
if (Event->CommandType == PI_COMMAND_TYPE_MEM_BUFFER_UNMAP &&
49995114
Event->CommandData) {
50005115
// Free the memory allocated in the piEnqueueMemBufferMap.
5001-
// TODO: always use piextUSMFree
5002-
if (IndirectAccessTrackingEnabled) {
5003-
// Use the version with reference counting
5004-
PI_CALL(piextUSMFree(Event->Context, Event->CommandData));
5005-
} else {
5006-
ZE_CALL(zeMemFree, (Event->Context->ZeContext, Event->CommandData));
5007-
}
5116+
ZeMemFreeHelper(Event->Context, Event->CommandData);
50085117
Event->CommandData = nullptr;
50095118
}
50105119
if (Event->OwnZeEvent) {
@@ -5795,17 +5904,7 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer,
57955904
if (Buffer->MapHostPtr) {
57965905
*RetMap = Buffer->MapHostPtr + Offset;
57975906
} else {
5798-
// TODO: always use piextUSMHostAlloc
5799-
if (IndirectAccessTrackingEnabled) {
5800-
// Use the version with reference counting
5801-
PI_CALL(piextUSMHostAlloc(RetMap, Queue->Context, nullptr, Size, 1));
5802-
} else {
5803-
ZeStruct<ze_host_mem_alloc_desc_t> ZeDesc;
5804-
ZeDesc.flags = 0;
5805-
5806-
ZE_CALL(zeMemAllocHost,
5807-
(Queue->Context->ZeContext, &ZeDesc, Size, 1, RetMap));
5808-
}
5907+
ZeHostMemAllocHelper(RetMap, Queue->Context, Size);
58095908
}
58105909
const auto &ZeCommandList = CommandList->first;
58115910
const auto &WaitList = (*Event)->WaitList;
@@ -6495,6 +6594,18 @@ pi_result USMHostMemoryAlloc::allocateImpl(void **ResultPtr, size_t Size,
64956594
return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
64966595
}
64976596

6597+
// This allocator manages shared (host+device accessible) USM.
SystemMemory::MemType USMSharedMemoryAlloc::getMemTypeImpl() {
  return SystemMemory::Shared;
}
6600+
6601+
// This allocator manages device-only USM.
SystemMemory::MemType USMDeviceMemoryAlloc::getMemTypeImpl() {
  return SystemMemory::Device;
}
6604+
6605+
// This allocator manages host USM.
SystemMemory::MemType USMHostMemoryAlloc::getMemTypeImpl() {
  return SystemMemory::Host;
}
6608+
64986609
void *USMMemoryAllocBase::allocate(size_t Size) {
64996610
void *Ptr = nullptr;
65006611

@@ -6523,6 +6634,10 @@ void USMMemoryAllocBase::deallocate(void *Ptr) {
65236634
}
65246635
}
65256636

6637+
// Public accessor: delegates to the derived allocator's getMemTypeImpl().
SystemMemory::MemType USMMemoryAllocBase::getMemType() {
  return getMemTypeImpl();
}
6640+
65266641
pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
65276642
pi_device Device,
65286643
pi_usm_mem_properties *Properties, size_t Size,

sycl/plugins/level_zero/pi_level_zero.hpp

+5
Original file line numberDiff line numberDiff line change
@@ -265,20 +265,23 @@ class USMMemoryAllocBase : public SystemMemory {
265265
// type
266266
virtual pi_result allocateImpl(void **ResultPtr, size_t Size,
267267
pi_uint32 Alignment) = 0;
268+
virtual MemType getMemTypeImpl() = 0;
268269

269270
public:
270271
USMMemoryAllocBase(pi_context Ctx, pi_device Dev)
271272
: Context{Ctx}, Device{Dev} {}
272273
void *allocate(size_t Size) override final;
273274
void *allocate(size_t Size, size_t Alignment) override final;
274275
void deallocate(void *Ptr) override final;
276+
MemType getMemType() override final;
275277
};
276278

277279
// Allocation routines for shared memory type
278280
class USMSharedMemoryAlloc : public USMMemoryAllocBase {
279281
protected:
280282
pi_result allocateImpl(void **ResultPtr, size_t Size,
281283
pi_uint32 Alignment) override;
284+
MemType getMemTypeImpl() override;
282285

283286
public:
284287
USMSharedMemoryAlloc(pi_context Ctx, pi_device Dev)
@@ -290,6 +293,7 @@ class USMDeviceMemoryAlloc : public USMMemoryAllocBase {
290293
protected:
291294
pi_result allocateImpl(void **ResultPtr, size_t Size,
292295
pi_uint32 Alignment) override;
296+
MemType getMemTypeImpl() override;
293297

294298
public:
295299
USMDeviceMemoryAlloc(pi_context Ctx, pi_device Dev)
@@ -301,6 +305,7 @@ class USMHostMemoryAlloc : public USMMemoryAllocBase {
301305
protected:
302306
pi_result allocateImpl(void **ResultPtr, size_t Size,
303307
pi_uint32 Alignment) override;
308+
MemType getMemTypeImpl() override;
304309

305310
public:
306311
USMHostMemoryAlloc(pi_context Ctx) : USMMemoryAllocBase(Ctx, nullptr) {}

0 commit comments

Comments
 (0)