Skip to content

Commit ac05da3

Browse files
Merge pull request #284 from MiloLurati/hip-local-memory-error-handleing
Hip local memory error handling
2 parents 083a3ee + cbdd0a8 commit ac05da3

File tree

3 files changed

+27
-8
lines changed

3 files changed

+27
-8
lines changed

kernel_tuner/backends/compiler.py

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -265,12 +265,23 @@ def compile(self, kernel_instance):
265265
if platform.system() == "Darwin":
266266
lib_extension = ".dylib"
267267

268-
subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
269-
subprocess.check_call(
268+
subprocess.run(
269+
[self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"],
270+
stdout=subprocess.PIPE,
271+
stderr=subprocess.PIPE,
272+
text=True,
273+
check=True
274+
)
275+
276+
subprocess.run(
270277
[self.compiler, filename + ".o"]
271278
+ compiler_options
272279
+ ["-shared", "-o", filename + lib_extension]
273-
+ lib_args
280+
+ lib_args,
281+
stdout=subprocess.PIPE,
282+
stderr=subprocess.PIPE,
283+
text=True,
284+
check=True
274285
)
275286

276287
self.lib = np.ctypeslib.load_library(filename, ".")
@@ -396,10 +407,16 @@ def memcpy_htod(self, dest, src):
396407

397408
def cleanup_lib(self):
398409
"""unload the previously loaded shared library"""
410+
if self.lib is None:
411+
return
412+
399413
if not self.using_openmp and not self.using_openacc:
400414
# this if statement is necessary because shared libraries that use
401415
# OpenMP will core dump when unloaded, this is a well-known issue with OpenMP
402416
logging.debug("unloading shared library")
403-
_ctypes.dlclose(self.lib._handle)
417+
try:
418+
_ctypes.dlclose(self.lib._handle)
419+
finally:
420+
self.lib = None
404421

405422
units = {}

kernel_tuner/core.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -647,8 +647,10 @@ def compile_kernel(self, instance, verbose):
647647
shared_mem_error_messages = [
648648
"uses too much shared data",
649649
"local memory limit exceeded",
650+
r"local memory \(\d+\) exceeds limit \(\d+\)",
650651
]
651-
if any(msg in str(e) for msg in shared_mem_error_messages):
652+
error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
653+
if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
652654
logging.debug(
653655
"compile_kernel failed due to kernel using too much shared memory"
654656
)
@@ -715,7 +717,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
715717
)
716718

717719
# check for templated kernel
718-
if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name:
720+
if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:
719721
kernel_string, name = wrap_templated_kernel(kernel_string, name)
720722

721723
# Preprocess GPU arguments. Require for handling `Tunable` arguments

test/test_compiler_functions.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -198,11 +198,11 @@ def test_compile_detects_device_code(npct, subprocess):
198198
cfunc = CompilerFunctions()
199199
cfunc.compile(kernel_instance)
200200

201-
print(subprocess.check_call.call_args_list)
201+
print(subprocess.run.call_args_list)
202202

203203
# assert the filename suffix used for source compilation is .cu
204204
dot_cu_used = False
205-
for call in subprocess.check_call.call_args_list:
205+
for call in subprocess.run.call_args_list:
206206
args, kwargs = call
207207
args = args[0]
208208
print(args)

0 commit comments

Comments
 (0)