Skip to content

Commit f281a88

Browse files
authored
Merge pull request #2451 from igchor/deffered_kernel_keep_submitted
[L0 v2] implement deferred kernel deallocation
2 parents 495a331 + a22588f commit f281a88

File tree

8 files changed

+226
-14
lines changed

8 files changed

+226
-14
lines changed

.github/workflows/build-hw-reusable.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ jobs:
112112

113113
- name: Test adapter specific
114114
working-directory: ${{github.workspace}}/build
115-
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180
115+
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" -E "memcheck" --timeout 180
116116
# Don't run adapter specific tests when building multiple adapters
117117
if: ${{ matrix.adapter.other_name == '' }}
118118

scripts/benchmarks/benches/compute.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,9 @@ def benchmarks(self) -> list[Benchmark]:
7878

7979
if options.ur is not None:
8080
benches += [
81-
SubmitKernelUR(self, 0),
82-
SubmitKernelUR(self, 1),
81+
SubmitKernelUR(self, 0, 0),
82+
SubmitKernelUR(self, 1, 0),
83+
SubmitKernelUR(self, 1, 1),
8384
]
8485

8586
return benches
@@ -180,13 +181,14 @@ def bin_args(self) -> list[str]:
180181
]
181182

182183
class SubmitKernelUR(ComputeBenchmark):
183-
def __init__(self, bench, ioq):
184+
def __init__(self, bench, ioq, measureCompletion):
184185
self.ioq = ioq
186+
self.measureCompletion = measureCompletion
185187
super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel")
186188

187189
def name(self):
188190
order = "in order" if self.ioq else "out of order"
189-
return f"api_overhead_benchmark_ur SubmitKernel {order}"
191+
return f"api_overhead_benchmark_ur SubmitKernel {order}" + (" with measure completion" if self.measureCompletion else "")
190192

191193
def explicit_group(self):
192194
return "SubmitKernel"
@@ -195,7 +197,7 @@ def bin_args(self) -> list[str]:
195197
return [
196198
f"--Ioq={self.ioq}",
197199
"--DiscardEvents=0",
198-
"--MeasureCompletion=0",
200+
f"--MeasureCompletion={self.measureCompletion}",
199201
"--iterations=100000",
200202
"--Profiling=0",
201203
"--NumKernels=10",

source/adapters/level_zero/v2/kernel.cpp

+6-7
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ ur_kernel_handle_t_::ur_kernel_handle_t_(
9595
}
9696

9797
ur_result_t ur_kernel_handle_t_::release() {
98+
if (!RefCount.decrementAndTest())
99+
return UR_RESULT_SUCCESS;
100+
98101
// manually release kernels to allow errors to be propagated
99102
for (auto &singleDeviceKernelOpt : deviceKernels) {
100103
if (singleDeviceKernelOpt.has_value()) {
@@ -104,6 +107,8 @@ ur_result_t ur_kernel_handle_t_::release() {
104107

105108
UR_CALL_THROWS(ur::level_zero::urProgramRelease(hProgram));
106109

110+
delete this;
111+
107112
return UR_RESULT_SUCCESS;
108113
}
109114

@@ -362,13 +367,7 @@ ur_result_t urKernelRetain(
362367
ur_result_t urKernelRelease(
363368
ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release
364369
) try {
365-
if (!hKernel->RefCount.decrementAndTest())
366-
return UR_RESULT_SUCCESS;
367-
368-
hKernel->release();
369-
delete hKernel;
370-
371-
return UR_RESULT_SUCCESS;
370+
return hKernel->release();
372371
} catch (...) {
373372
return exceptionToResult(std::current_exception());
374373
}

source/adapters/level_zero/v2/queue_immediate_in_order.cpp

+17-1
Original file line numberDiff line numberDiff line change
@@ -186,13 +186,25 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
186186

187187
// Free deferred events
188188
for (auto &hEvent : deferredEvents) {
189-
hEvent->releaseDeferred();
189+
UR_CALL(hEvent->releaseDeferred());
190190
}
191191
deferredEvents.clear();
192192

193+
// Free deferred kernels
194+
for (auto &hKernel : submittedKernels) {
195+
UR_CALL(hKernel->release());
196+
}
197+
submittedKernels.clear();
198+
193199
return UR_RESULT_SUCCESS;
194200
}
195201

202+
void ur_queue_immediate_in_order_t::recordSubmittedKernel(
203+
ur_kernel_handle_t hKernel) {
204+
submittedKernels.push_back(hKernel);
205+
hKernel->RefCount.increment();
206+
}
207+
196208
ur_result_t ur_queue_immediate_in_order_t::queueFlush() {
197209
return UR_RESULT_SUCCESS;
198210
}
@@ -251,6 +263,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
251263
(handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions,
252264
zeSignalEvent, waitList.second, waitList.first));
253265

266+
recordSubmittedKernel(hKernel);
267+
254268
return UR_RESULT_SUCCESS;
255269
}
256270

@@ -1063,6 +1077,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
10631077
(handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions,
10641078
zeSignalEvent, waitList.second, waitList.first));
10651079

1080+
recordSubmittedKernel(hKernel);
1081+
10661082
return UR_RESULT_SUCCESS;
10671083
}
10681084

source/adapters/level_zero/v2/queue_immediate_in_order.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
4747
std::vector<ze_event_handle_t> waitList;
4848

4949
std::vector<ur_event_handle_t> deferredEvents;
50+
std::vector<ur_kernel_handle_t> submittedKernels;
5051

5152
std::pair<ze_event_handle_t *, uint32_t>
5253
getWaitListView(const ur_event_handle_t *phWaitEvents,
@@ -82,6 +83,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
8283
const ur_event_handle_t *phEventWaitList,
8384
ur_event_handle_t *phEvent);
8485

86+
void recordSubmittedKernel(ur_kernel_handle_t hKernel);
87+
8588
public:
8689
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
8790
const ur_queue_properties_t *);

test/adapters/level_zero/v2/CMakeLists.txt

+24
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,27 @@ add_adapter_test(level_zero_memory_residency
6161
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero_v2>\""
6262
"ZES_ENABLE_SYSMAN=1"
6363
)
64+
65+
if(NOT WIN32)
66+
add_adapter_test(level_zero_deferred_kernel
67+
FIXTURE KERNELS
68+
SOURCES
69+
deferred_kernel.cpp
70+
ENVIRONMENT
71+
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero_v2>\""
72+
)
73+
74+
set(backend level_zero)
75+
add_adapter_memcheck_test(level_zero_deferred_kernel
76+
FIXTURE KERNELS
77+
SOURCES
78+
deferred_kernel.cpp
79+
ENVIRONMENT
80+
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero_v2>\""
81+
)
82+
83+
target_link_libraries(test-adapter-level_zero_deferred_kernel PRIVATE
84+
LevelZeroLoader
85+
LevelZeroLoader-Headers
86+
)
87+
endif()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
3+
// See LICENSE.TXT
4+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5+
6+
#include <ze_api.h>
7+
8+
#include "../../../conformance/enqueue/helpers.h"
9+
#include "../ze_helpers.hpp"
10+
#include "uur/fixtures.h"
11+
#include "uur/raii.h"
12+
13+
struct urEnqueueKernelLaunchTest : uur::urKernelExecutionTest {
14+
void SetUp() override {
15+
program_name = "fill";
16+
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
17+
}
18+
19+
uint32_t val = 42;
20+
size_t global_size = 32;
21+
size_t global_offset = 0;
22+
size_t n_dimensions = 1;
23+
};
24+
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchTest);
25+
26+
TEST_P(urEnqueueKernelLaunchTest, DeferredKernelRelease) {
27+
ur_mem_handle_t buffer = nullptr;
28+
AddBuffer1DArg(sizeof(val) * global_size, &buffer);
29+
AddPodArg(val);
30+
31+
auto zeEvent = createZeEvent(context, device);
32+
33+
ur_event_handle_t event;
34+
ASSERT_SUCCESS(urEventCreateWithNativeHandle(
35+
reinterpret_cast<ur_native_handle_t>(zeEvent.get()), context, nullptr,
36+
&event));
37+
38+
ASSERT_SUCCESS(urEnqueueEventsWait(queue, 1, &event, nullptr));
39+
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
40+
&global_offset, &global_size, nullptr,
41+
0, nullptr, nullptr));
42+
ASSERT_SUCCESS(urKernelRelease(kernel));
43+
44+
// Kernel should still be alive: the launch is still pending because its wait event has not been signaled yet
45+
ur_context_handle_t contextFromKernel;
46+
ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT,
47+
sizeof(ur_context_handle_t),
48+
&contextFromKernel, nullptr));
49+
50+
ASSERT_EQ(context, contextFromKernel);
51+
52+
ze_event_handle_t ze_event = nullptr;
53+
ASSERT_SUCCESS(urEventGetNativeHandle(
54+
event, reinterpret_cast<ur_native_handle_t *>(&ze_event)));
55+
ASSERT_EQ(zeEventHostSignal(ze_event), ZE_RESULT_SUCCESS);
56+
57+
ASSERT_SUCCESS(urQueueFinish(queue));
58+
59+
kernel = nullptr;
60+
61+
ASSERT_SUCCESS(urEventRelease(event));
62+
}
63+
64+
struct urMultiQueueLaunchKernelDeferFreeTest
65+
: uur::urMultiQueueMultiDeviceTest<2> {
66+
std::string KernelName;
67+
68+
static constexpr char ProgramName[] = "foo";
69+
static constexpr size_t ArraySize = 100;
70+
static constexpr uint32_t InitialValue = 1;
71+
72+
ur_program_handle_t program = nullptr;
73+
ur_kernel_handle_t kernel = nullptr;
74+
75+
void SetUp() override {
76+
if (devices.size() < 2) {
77+
GTEST_SKIP() << "This test requires at least 2 devices";
78+
}
79+
80+
UUR_RETURN_ON_FATAL_FAILURE(
81+
uur::urMultiQueueMultiDeviceTest<2>::SetUp());
82+
83+
KernelName = uur::KernelsEnvironment::instance->GetEntryPointNames(
84+
ProgramName)[0];
85+
86+
std::shared_ptr<std::vector<char>> il_binary;
87+
std::vector<ur_program_metadata_t> metadatas{};
88+
89+
uur::KernelsEnvironment::instance->LoadSource(ProgramName, platform,
90+
il_binary);
91+
92+
const ur_program_properties_t properties = {
93+
UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr,
94+
static_cast<uint32_t>(metadatas.size()),
95+
metadatas.empty() ? nullptr : metadatas.data()};
96+
97+
ASSERT_SUCCESS(urProgramCreateWithIL(context, il_binary->data(),
98+
il_binary->size(), &properties,
99+
&program));
100+
101+
UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(
102+
urProgramBuild(context, program, nullptr));
103+
ASSERT_SUCCESS(urKernelCreate(program, KernelName.data(), &kernel));
104+
}
105+
106+
void TearDown() override {
107+
// kernel will be released in the actual test
108+
109+
urProgramRelease(program);
110+
UUR_RETURN_ON_FATAL_FAILURE(
111+
uur::urMultiQueueMultiDeviceTest<2>::TearDown());
112+
}
113+
};
114+
115+
UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urMultiQueueLaunchKernelDeferFreeTest);
116+
117+
TEST_P(urMultiQueueLaunchKernelDeferFreeTest, Success) {
118+
auto zeEvent1 = createZeEvent(context, devices[0]);
119+
auto zeEvent2 = createZeEvent(context, devices[1]);
120+
121+
ur_event_handle_t event1;
122+
ASSERT_SUCCESS(urEventCreateWithNativeHandle(
123+
reinterpret_cast<ur_native_handle_t>(zeEvent1.get()), context, nullptr,
124+
&event1));
125+
ur_event_handle_t event2;
126+
ASSERT_SUCCESS(urEventCreateWithNativeHandle(
127+
reinterpret_cast<ur_native_handle_t>(zeEvent2.get()), context, nullptr,
128+
&event2));
129+
130+
size_t global_offset = 0;
131+
size_t global_size = 1;
132+
133+
ASSERT_SUCCESS(urEnqueueEventsWait(queues[0], 1, &event1, nullptr));
134+
ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernel, 1, &global_offset,
135+
&global_size, nullptr, 0, nullptr,
136+
nullptr));
137+
138+
ASSERT_SUCCESS(urEnqueueEventsWait(queues[1], 1, &event2, nullptr));
139+
ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernel, 1, &global_offset,
140+
&global_size, nullptr, 0, nullptr,
141+
nullptr));
142+
143+
ASSERT_SUCCESS(urKernelRelease(kernel));
144+
145+
// Kernel should still be alive since both kernel launches are pending
146+
ur_context_handle_t contextFromKernel;
147+
ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT,
148+
sizeof(ur_context_handle_t),
149+
&contextFromKernel, nullptr));
150+
ASSERT_EQ(context, contextFromKernel);
151+
152+
ASSERT_EQ(zeEventHostSignal(zeEvent2.get()), ZE_RESULT_SUCCESS);
153+
ASSERT_SUCCESS(urQueueFinish(queues[1]));
154+
155+
// Kernel should still be alive since the kernel launch on queues[0] is still pending
156+
ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT,
157+
sizeof(ur_context_handle_t),
158+
&contextFromKernel, nullptr));
159+
ASSERT_EQ(context, contextFromKernel);
160+
161+
ASSERT_EQ(zeEventHostSignal(zeEvent1.get()), ZE_RESULT_SUCCESS);
162+
ASSERT_SUCCESS(urQueueFinish(queues[0]));
163+
164+
ASSERT_SUCCESS(urEventRelease(event1));
165+
ASSERT_SUCCESS(urEventRelease(event2));
166+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{{IGNORE}}
2+
{{.*}} ERROR SUMMARY: 0 errors from 0 contexts {{.*}}

0 commit comments

Comments
 (0)