From ddae5012d4ca987da08499b586cefc9e622c3919 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Fri, 7 Feb 2025 12:22:52 -0700 Subject: [PATCH 1/3] cuda - gen fallback to shared if error --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 81 +++++++++++++++++-- .../cuda-gen/ceed-cuda-gen-operator-build.h | 2 +- backends/cuda-gen/ceed-cuda-gen-operator.c | 78 +++++------------- backends/cuda-gen/ceed-cuda-gen.h | 1 + backends/cuda/ceed-cuda-compile.cpp | 78 +++++++++++++----- backends/cuda/ceed-cuda-compile.h | 3 + 6 files changed, 160 insertions(+), 83 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index d19eedd491..4e06536adf 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -916,7 +916,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C //------------------------------------------------------------------------------ // Build single operator kernel //------------------------------------------------------------------------------ -extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { +extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) { bool is_tensor = true, is_at_points = false, use_3d_slices = false; Ceed ceed; CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0; @@ -927,18 +927,77 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { CeedOperator_Cuda_gen *data; std::ostringstream code; + CeedCallBackend(CeedOperatorGetData(op, &data)); { bool is_setup_done; CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); - if (is_setup_done) return CEED_ERROR_SUCCESS; + if (is_setup_done) { + *is_good_build = !data->use_fallback; + return CEED_ERROR_SUCCESS; + } } + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, 
&num_output_fields, &op_output_fields)); + + // Check field compatibility + { + bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + // -- Fallback to ref if not all bases are shared + if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + *is_good_build = false; + return CEED_ERROR_SUCCESS; + } + } CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 
- CeedCallBackend(CeedOperatorGetData(op, &data)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Get operator data @@ -1207,8 +1266,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { code << "// -----------------------------------------------------------------------------\n\n"; // Compile - CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); - CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); + { + bool is_compile_good = false; + + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); + } else { + *is_good_build = false; + data->use_fallback = true; + } + } CeedCallBackend(CeedOperatorSetSetupDone(op)); CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h index 28031e8e3b..88e20ceda2 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h @@ -6,4 +6,4 @@ // This file is part of CEED: http://github.com/ceed #pragma once -CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op); +CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build); diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c index 43d388e293..175a4c0034 100644 --- 
a/backends/cuda-gen/ceed-cuda-gen-operator.c +++ b/backends/cuda-gen/ceed-cuda-gen-operator.c @@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar) // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - bool is_at_points, is_tensor; + bool is_at_points, is_tensor, is_good_run = true; Ceed ceed; Ceed_Cuda *cuda_data; CeedInt num_elem, num_input_fields, num_output_fields; @@ -111,62 +111,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Cuda_gen *data; - // Check for shared bases - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + // Creation of the operator { - bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; - - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedBasis basis; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } + bool is_good_build = false; - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedBasis basis; - - 
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } - // -- Fallback to ref if not all bases are shared - if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_good_build)); + if (!is_good_build) { CeedOperator op_fallback; - CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due unsupported bases"); + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to code generation issue"); CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); return CEED_ERROR_SUCCESS; @@ -179,11 +132,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Creation of the operator - CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op)); - 
// Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); @@ -293,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, } CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs)); + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_good_run, opargs)); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { @@ -349,8 +300,21 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Cleanup CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); + + // Fallback if run was bad (out of resources) + if (!is_good_run) { + CeedOperator op_fallback; + + data->use_fallback = true; + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to kernel execution issue"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); + return CEED_ERROR_SUCCESS; + } return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h index c88e9fd18f..09b66171e9 100644 --- a/backends/cuda-gen/ceed-cuda-gen.h +++ b/backends/cuda-gen/ceed-cuda-gen.h @@ -12,6 +12,7 @@ #include typedef struct { + bool use_fallback; CeedInt dim; CeedInt Q_1d; CeedInt max_P_1d; diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index 20c57db2e8..4c196fe297 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -34,7 +34,8 @@ 
//------------------------------------------------------------------------------ // Compile CUDA kernel //------------------------------------------------------------------------------ -int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) { +static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module, + const CeedInt num_defines, va_list args) { size_t ptx_size; char *ptx; const int num_opts = 4; @@ -50,8 +51,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed // Get kernel specific options, such as kernel constants if (num_defines > 0) { - va_list args; - va_start(args, num_defines); char *name; int val; @@ -60,7 +59,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } - va_end(args); } // Standard libCEED definitions for CUDA backends @@ -133,14 +131,17 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); } CeedCallBackend(CeedFree(&opts)); - if (result != NVRTC_SUCCESS) { + *is_compile_good = result == NVRTC_SUCCESS; + if (!*is_compile_good) { char *log; size_t log_size; - CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); - CeedCallBackend(CeedMalloc(log_size, &log)); - CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); + if (throw_error) { + CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); + CeedCallBackend(CeedMalloc(log_size, &log)); + CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); + } } #if CUDA_VERSION >= 11010 @@ -159,6 +160,25 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule 
*module, const Ceed return CEED_ERROR_SUCCESS; } +int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) { + bool is_compile_good = true; + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + +int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) { + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Get CUDA kernel //------------------------------------------------------------------------------ @@ -200,24 +220,44 @@ int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, con //------------------------------------------------------------------------------ // Run CUDA kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ -int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, - const int block_size_z, const int shared_mem_size, void **args) { +static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, + void **args) { #if CUDA_VERSION >= 9000 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size); #endif CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) 
{ - int max_threads_per_block, shared_size_bytes, num_regs; - - cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); - cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); - cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", - max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); + *is_good_run = false; + if (throw_error) { + int max_threads_per_block, shared_size_bytes, num_regs; + + cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); + cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); + cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); + return CeedError(ceed, CEED_ERROR_BACKEND, + "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", + max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); + } } else CeedChk_Cu(ceed, result); return CEED_ERROR_SUCCESS; } +int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args) { + bool is_good_run = true; + + CeedCallBackend( + CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + +int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) { + CeedCallBackend( + CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, 
block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h index 846de28c9d..21204a495d 100644 --- a/backends/cuda/ceed-cuda-compile.h +++ b/backends/cuda/ceed-cuda-compile.h @@ -13,6 +13,7 @@ static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } CEED_INTERN int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...); +CEED_INTERN int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...); CEED_INTERN int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel); @@ -24,3 +25,5 @@ CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_siz CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, int shared_mem_size, void **args); +CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, + int shared_mem_size, bool *is_good_run, void **args); From 8d12f40e0e187f71c4a1a78742076f931e72da09 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Fri, 7 Feb 2025 13:06:17 -0700 Subject: [PATCH 2/3] hip - gen fallback to shared if error --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 2 +- backends/cuda-gen/ceed-cuda-gen-operator.c | 6 +- backends/cuda/ceed-cuda-compile.cpp | 12 ++- .../hip-gen/ceed-hip-gen-operator-build.cpp | 83 ++++++++++++++++-- .../hip-gen/ceed-hip-gen-operator-build.h | 2 +- backends/hip-gen/ceed-hip-gen-operator.c | 85 ++++++------------- backends/hip-gen/ceed-hip-gen.h | 1 + backends/hip/ceed-hip-compile.cpp | 49 
+++++++++-- backends/hip/ceed-hip-compile.h | 3 + 9 files changed, 159 insertions(+), 84 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 4e06536adf..69f6788c19 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -937,9 +937,9 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b return CEED_ERROR_SUCCESS; } } - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); { bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c index 175a4c0034..3410cdcfb7 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator.c +++ b/backends/cuda-gen/ceed-cuda-gen-operator.c @@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar) // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - bool is_at_points, is_tensor, is_good_run = true; + bool is_at_points, is_tensor, is_run_good = true; Ceed ceed; Ceed_Cuda *cuda_data; CeedInt num_elem, num_input_fields, num_output_fields; @@ -244,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, } CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); - CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_good_run, opargs)); + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, 
block[0], block[1], block[2], shared_mem, &is_run_good, opargs)); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { @@ -306,7 +306,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionDestroy(&qf)); // Fallback if run was bad (out of resources) - if (!is_good_run) { + if (!is_run_good) { CeedOperator op_fallback; data->use_fallback = true; diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index 4c196fe297..6c0e07c0b9 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -132,16 +132,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ } CeedCallBackend(CeedFree(&opts)); *is_compile_good = result == NVRTC_SUCCESS; - if (!*is_compile_good) { + if (!*is_compile_good && throw_error) { char *log; size_t log_size; - if (throw_error) { - CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); - CeedCallBackend(CeedMalloc(log_size, &log)); - CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); - } + CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); + CeedCallBackend(CeedMalloc(log_size, &log)); + CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); } #if CUDA_VERSION >= 11010 diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 1df23c3d7e..4c3479ee21 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -942,7 +942,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce //------------------------------------------------------------------------------ // Build single operator kernel 
//------------------------------------------------------------------------------ -extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { +extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { bool is_tensor = true, is_at_points = false, use_3d_slices = false; Ceed ceed; CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0; @@ -953,18 +953,77 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { CeedOperator_Hip_gen *data; std::ostringstream code; + CeedCallBackend(CeedOperatorGetData(op, &data)); { bool is_setup_done; CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); - if (is_setup_done) return CEED_ERROR_SUCCESS; + if (is_setup_done) { + *is_good_build = !data->use_fallback; + return CEED_ERROR_SUCCESS; + } } + // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + { + bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + + 
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + // -- Fallback to ref if not all bases are shared + if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + *is_good_build = false; + return CEED_ERROR_SUCCESS; + } + } CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &data)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Get operator data @@ -1225,9 +1284,19 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { // Compile CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? 
dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes)); - CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", - block_sizes[0] * block_sizes[1] * block_sizes[2])); - CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); + { + bool is_compile_good = false; + + CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", + block_sizes[0] * block_sizes[1] * block_sizes[2])); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); + } else { + *is_good_build = false; + data->use_fallback = true; + } + } CeedCallBackend(CeedOperatorSetSetupDone(op)); CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h index c17ba46eeb..4d5de74269 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.h +++ b/backends/hip-gen/ceed-hip-gen-operator-build.h @@ -7,4 +7,4 @@ #pragma once CEED_INTERN int BlockGridCalculate_Hip_gen(CeedInt dim, CeedInt num_elem, CeedInt P_1d, CeedInt Q_1d, CeedInt *block_sizes); -CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op); +CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build); diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c index a2a6ccd1f1..da164e2b93 100644 --- a/backends/hip-gen/ceed-hip-gen-operator.c +++ b/backends/hip-gen/ceed-hip-gen-operator.c @@ -35,7 +35,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) { // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - bool 
is_at_points, is_tensor; + bool is_at_points, is_tensor, is_good_run = true; Ceed ceed; CeedInt num_elem, num_input_fields, num_output_fields; CeedEvalMode eval_mode; @@ -46,62 +46,15 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Hip_gen *data; - // Check for shared bases - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + // Creation of the operator { - bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; - - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedBasis basis; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } + bool is_good_build = false; - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedBasis basis; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - 
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } - // -- Fallback to ref if not all bases are shared - if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_good_build)); + if (!is_good_build) { CeedOperator op_fallback; - CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to unsupported bases"); + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to code generation issue"); CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); return CEED_ERROR_SUCCESS; @@ -113,11 +66,9 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Creation of the operator - CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op)); - // Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); @@ -219,17 +170,20 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + CeedCallBackend( + CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs)); } else if (dim == 2) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + CeedCallBackend( + CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs)); } else if (dim == 3) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + CeedCallBackend( + CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs)); } // Restore input arrays @@ -280,8 +234,21 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Cleanup CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); + + // Fallback if run was bad (out of resources) + if (!is_good_run) { + CeedOperator op_fallback; + + data->use_fallback = true; + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to kernel execution issue"); + 
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); + return CEED_ERROR_SUCCESS; + } return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h index eb5dd0c893..760fef2ed5 100644 --- a/backends/hip-gen/ceed-hip-gen.h +++ b/backends/hip-gen/ceed-hip-gen.h @@ -12,6 +12,7 @@ #include typedef struct { + bool use_fallback; CeedInt dim; CeedInt Q_1d; CeedInt max_P_1d; diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index dface44ef6..51c83cf222 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -33,7 +33,8 @@ //------------------------------------------------------------------------------ // Compile HIP kernel //------------------------------------------------------------------------------ -int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) 
{ +static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, hipModule_t *module, + const CeedInt num_defines, va_list args) { size_t ptx_size; char *ptx; const int num_opts = 4; @@ -62,8 +63,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce // Kernel specific options, such as kernel constants if (num_defines > 0) { - va_list args; - va_start(args, num_defines); char *name; int val; @@ -72,7 +71,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } - va_end(args); } // Standard libCEED definitions for HIP backends @@ -135,7 +133,8 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); } CeedCallBackend(CeedFree(&opts)); - if (result != HIPRTC_SUCCESS) { + *is_compile_good = result == HIPRTC_SUCCESS; + if (!*is_compile_good && throw_error) { size_t log_size; char *log; @@ -155,6 +154,25 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce return CEED_ERROR_SUCCESS; } +int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) { + bool is_compile_good = true; + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Hip(ceed, source, true, &is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + +int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...) 
{ + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Hip(ceed, source, false, is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Get HIP kernel //------------------------------------------------------------------------------ @@ -183,9 +201,28 @@ int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, c //------------------------------------------------------------------------------ // Run HIP kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ +static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, void **args) { + hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); + + *is_good_run = result == hipSuccess; + if (throw_error) CeedCallHip(ceed, result); + return CEED_ERROR_SUCCESS; +} + int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, const int shared_mem_size, void **args) { - CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL)); + bool is_good_run = true; + + CeedCallBackend( + CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + +int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, bool 
*is_good_run, void **args) { + CeedCallBackend( + CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h index d990924ec2..0a29fad33e 100644 --- a/backends/hip/ceed-hip-compile.h +++ b/backends/hip/ceed-hip-compile.h @@ -13,6 +13,7 @@ static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } CEED_INTERN int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...); +CEED_INTERN int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...); CEED_INTERN int CeedGetKernel_Hip(Ceed ceed, hipModule_t module, const char *name, hipFunction_t *kernel); @@ -23,3 +24,5 @@ CEED_INTERN int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, int grid_s CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, int shared_mem_size, void **args); +CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, + int shared_mem_size, bool *is_good_run, void **args); From c9192aca9c02dc42a6a7d7a897b4af02df4a189e Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Fri, 7 Feb 2025 16:05:47 -0700 Subject: [PATCH 3/3] gpu - swap out bitwise assignment operators for bools Co-authored-by: Zach Atkins --- backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 12 ++++++------ backends/hip-gen/ceed-hip-gen-operator-build.cpp | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 69f6788c19..181346303c 100644 --- 
a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -954,12 +954,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); } @@ -977,13 +977,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); } diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 4c3479ee21..77c642b9ca 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -980,12 +980,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, 
bool *is_good_bu Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); } @@ -1003,13 +1003,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); }