Merge pull request #284 from MiloLurati/hip-local-memory-error-handleing

benvanwerkhoven · web-flow · commit ac05da310ecd · 2024-12-13T16:28:57.000+01:00
Hip local memory error handling
diff --git a/kernel_tuner/backends/compiler.py b/kernel_tuner/backends/compiler.py
@@ -265,12 +265,23 @@ def compile(self, kernel_instance):
             if platform.system() == "Darwin":
                 lib_extension = ".dylib"
 
-            subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
-            subprocess.check_call(
+            subprocess.run(
+                [self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True
+            )
+
+            subprocess.run(
                 [self.compiler, filename + ".o"]
                 + compiler_options
                 + ["-shared", "-o", filename + lib_extension]
-                + lib_args
+                + lib_args,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                check=True
             )
 
             self.lib = np.ctypeslib.load_library(filename, ".")
@@ -396,10 +407,16 @@ def memcpy_htod(self, dest, src):
 
     def cleanup_lib(self):
         """unload the previously loaded shared library"""
+        if self.lib is None:
+            return
+        
         if not self.using_openmp and not self.using_openacc:
             # this if statement is necessary because shared libraries that use
             # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP
             logging.debug("unloading shared library")
-            _ctypes.dlclose(self.lib._handle)
+            try:
+                _ctypes.dlclose(self.lib._handle)
+            finally:
+                self.lib = None
 
     units = {}
diff --git a/kernel_tuner/core.py b/kernel_tuner/core.py
@@ -647,8 +647,10 @@ def compile_kernel(self, instance, verbose):
             shared_mem_error_messages = [
                 "uses too much shared data",
                 "local memory limit exceeded",
+                r"local memory \(\d+\) exceeds limit \(\d+\)",
             ]
-            if any(msg in str(e) for msg in shared_mem_error_messages):
+            error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)
+            if any(re.search(msg, error_message) for msg in shared_mem_error_messages):
                 logging.debug(
                     "compile_kernel failed due to kernel using too much shared memory"
                 )
@@ -715,7 +717,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)
         )
 
         # check for templated kernel
-        if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name:
+        if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:
             kernel_string, name = wrap_templated_kernel(kernel_string, name)
 
         # Preprocess GPU arguments. Require for handling `Tunable` arguments
diff --git a/test/test_compiler_functions.py b/test/test_compiler_functions.py
@@ -198,11 +198,11 @@ def test_compile_detects_device_code(npct, subprocess):
     cfunc = CompilerFunctions()
     cfunc.compile(kernel_instance)
 
-    print(subprocess.check_call.call_args_list)
+    print(subprocess.run.call_args_list)
 
     # assert the filename suffix used for source compilation is .cu
     dot_cu_used = False
-    for call in subprocess.check_call.call_args_list:
+    for call in subprocess.run.call_args_list:
         args, kwargs = call
         args = args[0]
         print(args)

Original file line number	Diff line number	Diff line change
`@@ -647,8 +647,10 @@ def compile_kernel(self, instance, verbose):`
`647`	`647`	`shared_mem_error_messages = [`
`648`	`648`	`"uses too much shared data",`
`649`	`649`	`"local memory limit exceeded",`
	`650`	`+ r"local memory \(\d+\) exceeds limit \(\d+\)",`
`650`	`651`	`]`
`651`		`- if any(msg in str(e) for msg in shared_mem_error_messages):`
	`652`	`+ error_message = str(e.stderr) if hasattr(e, "stderr") else str(e)`
	`653`	`+ if any(re.search(msg, error_message) for msg in shared_mem_error_messages):`
`652`	`654`	`logging.debug(`
`653`	`655`	`"compile_kernel failed due to kernel using too much shared memory"`
`654`	`656`	`)`
`@@ -715,7 +717,7 @@ def create_kernel_instance(self, kernel_source, kernel_options, params, verbose)`
`715`	`717`	`)`
`716`	`718`
`717`	`719`	`# check for templated kernel`
`718`		`- if kernel_source.lang in ["CUDA", "NVCUDA"] and "<" in name and ">" in name:`
	`720`	`+ if kernel_source.lang in ["CUDA", "NVCUDA", "HIP"] and "<" in name and ">" in name:`
`719`	`721`	`kernel_string, name = wrap_templated_kernel(kernel_string, name)`
`720`	`722`
`721`	`723`	``# Preprocess GPU arguments. Require for handling `Tunable` arguments``