Merge pull request #1846 from CEED/jeremy/run-report-debug

jeremylt · web-flow · commit dc6f907154f3 · 2025-06-27T09:31:07.000-06:00
Report JiT kernel launch errors when using try-catch
diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp
@@ -152,12 +152,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
     if (throw_error) {
       return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
     } else {
+      // LCOV_EXCL_START
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
       CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
       CeedCallBackend(CeedFree(&log));
       CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
       return CEED_ERROR_SUCCESS;
+      // LCOV_EXCL_STOP
     }
   }
 
@@ -250,17 +252,24 @@ static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstrea
   CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
 
   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
-    *is_good_run = false;
-    if (throw_error) {
-      int max_threads_per_block, shared_size_bytes, num_regs;
+    int max_threads_per_block, shared_size_bytes, num_regs;
 
-      cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
-      cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
-      cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
+    cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
+    cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
+    cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
+    if (throw_error) {
       return CeedError(ceed, CEED_ERROR_BACKEND,
                        "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
                        max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+    } else {
+      // LCOV_EXCL_START
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
+                max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      // LCOV_EXCL_STOP
     }
+    *is_good_run = false;
   } else CeedChk_Cu(ceed, result);
   return CEED_ERROR_SUCCESS;
 }
diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp
@@ -154,12 +154,14 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e
     if (throw_error) {
       return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log);
     } else {
+      // LCOV_EXCL_START
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
       CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", hiprtcGetErrorString(result), log);
       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
       CeedCallBackend(CeedFree(&log));
       CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog));
       return CEED_ERROR_SUCCESS;
+      // LCOV_EXCL_STOP
     }
   }
 
@@ -229,8 +231,22 @@ static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipSt
                                           bool *is_good_run, void **args) {
   hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
 
-  *is_good_run = result == hipSuccess;
-  if (throw_error) CeedCallHip(ceed, result);
+  if (result == hipSuccess) {
+    *is_good_run = true;
+  } else {
+    if (throw_error) {
+      CeedCallHip(ceed, result);
+    } else {
+      // LCOV_EXCL_START
+      const char *message = hipGetErrorName(result);
+
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
+      CeedDebug(ceed, "%s\n", message);
+      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
+      // LCOV_EXCL_STOP
+    }
+    *is_good_run = false;
+  }
   return CEED_ERROR_SUCCESS;
 }