Skip to content

Commit dc6f907

Browse files
authored
Merge pull request #1846 from CEED/jeremy/run-report-debug
Report JiT kernel launch errors when using try-catch
2 parents 204f3be + c49dc7a commit dc6f907

File tree

2 files changed

+33 -8 lines changed

2 files changed

+33 -8 lines changed

backends/cuda/ceed-cuda-compile.cpp

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -152,12 +152,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_
152152
if (throw_error) {
153153
return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
154154
} else {
155+
// LCOV_EXCL_START
155156
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
156157
CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
157158
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
158159
CeedCallBackend(CeedFree(&log));
159160
CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
160161
return CEED_ERROR_SUCCESS;
162+
// LCOV_EXCL_STOP
161163
}
162164
}
163165

@@ -250,17 +252,24 @@ static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstrea
250252
CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
251253

252254
if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
253-
*is_good_run = false;
254-
if (throw_error) {
255-
int max_threads_per_block, shared_size_bytes, num_regs;
255+
int max_threads_per_block, shared_size_bytes, num_regs;
256256

257-
cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
258-
cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
259-
cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
257+
cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
258+
cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
259+
cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
260+
if (throw_error) {
260261
return CeedError(ceed, CEED_ERROR_BACKEND,
261262
"CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
262263
max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
264+
} else {
265+
// LCOV_EXCL_START
266+
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
267+
CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
268+
max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
269+
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
270+
// LCOV_EXCL_STOP
263271
}
272+
*is_good_run = false;
264273
} else CeedChk_Cu(ceed, result);
265274
return CEED_ERROR_SUCCESS;
266275
}

backends/hip/ceed-hip-compile.cpp

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -154,12 +154,14 @@ static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_e
154154
if (throw_error) {
155155
return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", hiprtcGetErrorString(result), log);
156156
} else {
157+
// LCOV_EXCL_START
157158
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
158159
CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", hiprtcGetErrorString(result), log);
159160
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
160161
CeedCallBackend(CeedFree(&log));
161162
CeedCallHiprtc(ceed, hiprtcDestroyProgram(&prog));
162163
return CEED_ERROR_SUCCESS;
164+
// LCOV_EXCL_STOP
163165
}
164166
}
165167

@@ -229,8 +231,22 @@ static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, hipSt
229231
bool *is_good_run, void **args) {
230232
hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
231233

232-
*is_good_run = result == hipSuccess;
233-
if (throw_error) CeedCallHip(ceed, result);
234+
if (result == hipSuccess) {
235+
*is_good_run = true;
236+
} else {
237+
if (throw_error) {
238+
CeedCallHip(ceed, result);
239+
} else {
240+
// LCOV_EXCL_START
241+
const char *message = hipGetErrorName(result);
242+
243+
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
244+
CeedDebug(ceed, "%s\n", message);
245+
CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
246+
// LCOV_EXCL_STOP
247+
}
248+
*is_good_run = false;
249+
}
234250
return CEED_ERROR_SUCCESS;
235251
}
236252

0 commit comments

Comments
 (0)