From ddae5012d4ca987da08499b586cefc9e622c3919 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Fri, 7 Feb 2025 12:22:52 -0700 Subject: [PATCH 1/3] cuda - gen fallback to shared if error --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 81 +++++++++++++++++-- .../cuda-gen/ceed-cuda-gen-operator-build.h | 2 +- backends/cuda-gen/ceed-cuda-gen-operator.c | 78 +++++------------- backends/cuda-gen/ceed-cuda-gen.h | 1 + backends/cuda/ceed-cuda-compile.cpp | 78 +++++++++++++----- backends/cuda/ceed-cuda-compile.h | 3 + 6 files changed, 160 insertions(+), 83 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index d19eedd491..4e06536adf 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -916,7 +916,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C //------------------------------------------------------------------------------ // Build single operator kernel //------------------------------------------------------------------------------ -extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { +extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build) { bool is_tensor = true, is_at_points = false, use_3d_slices = false; Ceed ceed; CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0; @@ -927,18 +927,77 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { CeedOperator_Cuda_gen *data; std::ostringstream code; + CeedCallBackend(CeedOperatorGetData(op, &data)); { bool is_setup_done; CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); - if (is_setup_done) return CEED_ERROR_SUCCESS; + if (is_setup_done) { + *is_good_build = !data->use_fallback; + return CEED_ERROR_SUCCESS; + } } + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, 
&num_output_fields, &op_output_fields)); + + // Check field compatibility + { + bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + // -- Fallback to ref if not all bases are shared + if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + *is_good_build = false; + return CEED_ERROR_SUCCESS; + } + } CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); 
- CeedCallBackend(CeedOperatorGetData(op, &data)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Get operator data @@ -1207,8 +1266,18 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { code << "// -----------------------------------------------------------------------------\n\n"; // Compile - CeedCallBackend(CeedCompile_Cuda(ceed, code.str().c_str(), &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); - CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); + { + bool is_compile_good = false; + + CeedCallBackend(CeedTryCompile_Cuda(ceed, code.str().c_str(), &is_compile_good, &data->module, 1, "T_1D", CeedIntMax(Q_1d, data->max_P_1d))); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Cuda(ceed, data->module, operator_name.c_str(), &data->op)); + } else { + *is_good_build = false; + data->use_fallback = true; + } + } CeedCallBackend(CeedOperatorSetSetupDone(op)); CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.h b/backends/cuda-gen/ceed-cuda-gen-operator-build.h index 28031e8e3b..88e20ceda2 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.h +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.h @@ -6,4 +6,4 @@ // This file is part of CEED: http://github.com/ceed #pragma once -CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op); +CEED_INTERN int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_build); diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c index 43d388e293..175a4c0034 100644 --- 
a/backends/cuda-gen/ceed-cuda-gen-operator.c +++ b/backends/cuda-gen/ceed-cuda-gen-operator.c @@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar) // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - bool is_at_points, is_tensor; + bool is_at_points, is_tensor, is_good_run = true; Ceed ceed; Ceed_Cuda *cuda_data; CeedInt num_elem, num_input_fields, num_output_fields; @@ -111,62 +111,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Cuda_gen *data; - // Check for shared bases - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + // Creation of the operator { - bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; - - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedBasis basis; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } + bool is_good_build = false; - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedBasis basis; - - 
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } - // -- Fallback to ref if not all bases are shared - if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op, &is_good_build)); + if (!is_good_build) { CeedOperator op_fallback; - CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due unsupported bases"); + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to code generation issue"); CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); return CEED_ERROR_SUCCESS; @@ -179,11 +132,9 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Creation of the operator - CeedCallBackend(CeedOperatorBuildKernel_Cuda_gen(op)); - 
// Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); @@ -293,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, } CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, opargs)); + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_good_run, opargs)); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { @@ -349,8 +300,21 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Cleanup CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); + + // Fallback if run was bad (out of resources) + if (!is_good_run) { + CeedOperator op_fallback; + + data->use_fallback = true; + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/cuda/ref CeedOperator due to kernel execution issue"); + CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); + return CEED_ERROR_SUCCESS; + } return CEED_ERROR_SUCCESS; } diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h index c88e9fd18f..09b66171e9 100644 --- a/backends/cuda-gen/ceed-cuda-gen.h +++ b/backends/cuda-gen/ceed-cuda-gen.h @@ -12,6 +12,7 @@ #include typedef struct { + bool use_fallback; CeedInt dim; CeedInt Q_1d; CeedInt max_P_1d; diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index 20c57db2e8..4c196fe297 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -34,7 +34,8 @@ 
//------------------------------------------------------------------------------ // Compile CUDA kernel //------------------------------------------------------------------------------ -int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) { +static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module, + const CeedInt num_defines, va_list args) { size_t ptx_size; char *ptx; const int num_opts = 4; @@ -50,8 +51,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed // Get kernel specific options, such as kernel constants if (num_defines > 0) { - va_list args; - va_start(args, num_defines); char *name; int val; @@ -60,7 +59,6 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } - va_end(args); } // Standard libCEED definitions for CUDA backends @@ -133,14 +131,17 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const Ceed CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); } CeedCallBackend(CeedFree(&opts)); - if (result != NVRTC_SUCCESS) { + *is_compile_good = result == NVRTC_SUCCESS; + if (!*is_compile_good) { char *log; size_t log_size; - CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); - CeedCallBackend(CeedMalloc(log_size, &log)); - CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); + if (throw_error) { + CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); + CeedCallBackend(CeedMalloc(log_size, &log)); + CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); + } } #if CUDA_VERSION >= 11010 @@ -159,6 +160,25 @@ int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule 
*module, const Ceed return CEED_ERROR_SUCCESS; } +int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) { + bool is_compile_good = true; + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + +int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) { + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Get CUDA kernel //------------------------------------------------------------------------------ @@ -200,24 +220,44 @@ int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, con //------------------------------------------------------------------------------ // Run CUDA kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ -int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, - const int block_size_z, const int shared_mem_size, void **args) { +static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, + void **args) { #if CUDA_VERSION >= 9000 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size); #endif CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) 
{ - int max_threads_per_block, shared_size_bytes, num_regs; - - cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); - cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); - cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); - return CeedError(ceed, CEED_ERROR_BACKEND, - "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", - max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); + *is_good_run = false; + if (throw_error) { + int max_threads_per_block, shared_size_bytes, num_regs; + + cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); + cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); + cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); + return CeedError(ceed, CEED_ERROR_BACKEND, + "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", + max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); + } } else CeedChk_Cu(ceed, result); return CEED_ERROR_SUCCESS; } +int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, void **args) { + bool is_good_run = true; + + CeedCallBackend( + CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + +int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) { + CeedCallBackend( + CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, 
block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ diff --git a/backends/cuda/ceed-cuda-compile.h b/backends/cuda/ceed-cuda-compile.h index 846de28c9d..21204a495d 100644 --- a/backends/cuda/ceed-cuda-compile.h +++ b/backends/cuda/ceed-cuda-compile.h @@ -13,6 +13,7 @@ static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } CEED_INTERN int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...); +CEED_INTERN int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...); CEED_INTERN int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel); @@ -24,3 +25,5 @@ CEED_INTERN int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, int grid_siz CEED_INTERN int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, int shared_mem_size, void **args); +CEED_INTERN int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, + int shared_mem_size, bool *is_good_run, void **args); From 8d12f40e0e187f71c4a1a78742076f931e72da09 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Fri, 7 Feb 2025 13:06:17 -0700 Subject: [PATCH 2/3] hip - gen fallback to shared if error --- .../cuda-gen/ceed-cuda-gen-operator-build.cpp | 2 +- backends/cuda-gen/ceed-cuda-gen-operator.c | 6 +- backends/cuda/ceed-cuda-compile.cpp | 12 ++- .../hip-gen/ceed-hip-gen-operator-build.cpp | 83 ++++++++++++++++-- .../hip-gen/ceed-hip-gen-operator-build.h | 2 +- backends/hip-gen/ceed-hip-gen-operator.c | 85 ++++++------------- backends/hip-gen/ceed-hip-gen.h | 1 + backends/hip/ceed-hip-compile.cpp | 49 
+++++++++-- backends/hip/ceed-hip-compile.h | 3 + 9 files changed, 159 insertions(+), 84 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 4e06536adf..69f6788c19 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -937,9 +937,9 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b return CEED_ERROR_SUCCESS; } } - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); { bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c index 175a4c0034..3410cdcfb7 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator.c +++ b/backends/cuda-gen/ceed-cuda-gen-operator.c @@ -99,7 +99,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar) // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - bool is_at_points, is_tensor, is_good_run = true; + bool is_at_points, is_tensor, is_run_good = true; Ceed ceed; Ceed_Cuda *cuda_data; CeedInt num_elem, num_input_fields, num_output_fields; @@ -244,7 +244,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, } CeedInt shared_mem = block[0] * block[1] * block[2] * sizeof(CeedScalar); - CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, block[0], block[1], block[2], shared_mem, &is_good_run, opargs)); + CeedCallBackend(CeedTryRunKernelDimShared_Cuda(ceed, data->op, grid, 
block[0], block[1], block[2], shared_mem, &is_run_good, opargs)); // Restore input arrays for (CeedInt i = 0; i < num_input_fields; i++) { @@ -306,7 +306,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedCallBackend(CeedQFunctionDestroy(&qf)); // Fallback if run was bad (out of resources) - if (!is_good_run) { + if (!is_run_good) { CeedOperator op_fallback; data->use_fallback = true; diff --git a/backends/cuda/ceed-cuda-compile.cpp b/backends/cuda/ceed-cuda-compile.cpp index 4c196fe297..6c0e07c0b9 100644 --- a/backends/cuda/ceed-cuda-compile.cpp +++ b/backends/cuda/ceed-cuda-compile.cpp @@ -132,16 +132,14 @@ static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_ } CeedCallBackend(CeedFree(&opts)); *is_compile_good = result == NVRTC_SUCCESS; - if (!*is_compile_good) { + if (!*is_compile_good && throw_error) { char *log; size_t log_size; - if (throw_error) { - CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); - CeedCallBackend(CeedMalloc(log_size, &log)); - CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); - return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); - } + CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size)); + CeedCallBackend(CeedMalloc(log_size, &log)); + CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log)); + return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log); } #if CUDA_VERSION >= 11010 diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 1df23c3d7e..4c3479ee21 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -942,7 +942,7 @@ static int CeedOperatorBuildKernelQFunction_Hip_gen(std::ostringstream &code, Ce //------------------------------------------------------------------------------ // Build single operator kernel 
//------------------------------------------------------------------------------ -extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { +extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build) { bool is_tensor = true, is_at_points = false, use_3d_slices = false; Ceed ceed; CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0, coords_comp_stride = 0; @@ -953,18 +953,77 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { CeedOperator_Hip_gen *data; std::ostringstream code; + CeedCallBackend(CeedOperatorGetData(op, &data)); { bool is_setup_done; CeedCallBackend(CeedOperatorIsSetupDone(op, &is_setup_done)); - if (is_setup_done) return CEED_ERROR_SUCCESS; + if (is_setup_done) { + *is_good_build = !data->use_fallback; + return CEED_ERROR_SUCCESS; + } } + // Check field compatibility + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + { + bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; + + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedBasis basis; + + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedBasis basis; + + 
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); + if (basis != CEED_BASIS_NONE) { + bool is_tensor = true; + const char *resource; + char *resource_root; + Ceed basis_ceed; + + CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); + is_all_tensor &= is_tensor; + is_all_nontensor &= !is_tensor; + + CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); + CeedCallBackend(CeedGetResource(basis_ceed, &resource)); + CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); + has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + CeedCallBackend(CeedFree(&resource_root)); + CeedCallBackend(CeedDestroy(&basis_ceed)); + } + CeedCallBackend(CeedBasisDestroy(&basis)); + } + // -- Fallback to ref if not all bases are shared + if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + *is_good_build = false; + return CEED_ERROR_SUCCESS; + } + } CeedCallBackend(CeedOperatorGetCeed(op, &ceed)); - CeedCallBackend(CeedOperatorGetData(op, &data)); CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); // Get operator data @@ -1225,9 +1284,19 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op) { // Compile CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); CeedCallBackend(BlockGridCalculate_Hip_gen(is_tensor ? 
dim : 1, num_elem, data->max_P_1d, Q_1d, block_sizes)); - CeedCallBackend(CeedCompile_Hip(ceed, code.str().c_str(), &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", - block_sizes[0] * block_sizes[1] * block_sizes[2])); - CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); + { + bool is_compile_good = false; + + CeedCallBackend(CeedTryCompile_Hip(ceed, code.str().c_str(), &is_compile_good, &data->module, 2, "T_1D", block_sizes[0], "BLOCK_SIZE", + block_sizes[0] * block_sizes[1] * block_sizes[2])); + if (is_compile_good) { + *is_good_build = true; + CeedCallBackend(CeedGetKernel_Hip(ceed, data->module, operator_name.c_str(), &data->op)); + } else { + *is_good_build = false; + data->use_fallback = true; + } + } CeedCallBackend(CeedOperatorSetSetupDone(op)); CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.h b/backends/hip-gen/ceed-hip-gen-operator-build.h index c17ba46eeb..4d5de74269 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.h +++ b/backends/hip-gen/ceed-hip-gen-operator-build.h @@ -7,4 +7,4 @@ #pragma once CEED_INTERN int BlockGridCalculate_Hip_gen(CeedInt dim, CeedInt num_elem, CeedInt P_1d, CeedInt Q_1d, CeedInt *block_sizes); -CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op); +CEED_INTERN int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_build); diff --git a/backends/hip-gen/ceed-hip-gen-operator.c b/backends/hip-gen/ceed-hip-gen-operator.c index a2a6ccd1f1..da164e2b93 100644 --- a/backends/hip-gen/ceed-hip-gen-operator.c +++ b/backends/hip-gen/ceed-hip-gen-operator.c @@ -35,7 +35,7 @@ static int CeedOperatorDestroy_Hip_gen(CeedOperator op) { // Apply and add to output //------------------------------------------------------------------------------ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) { - bool 
is_at_points, is_tensor; + bool is_at_points, is_tensor, is_good_run = true; Ceed ceed; CeedInt num_elem, num_input_fields, num_output_fields; CeedEvalMode eval_mode; @@ -46,62 +46,15 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedOperatorField *op_input_fields, *op_output_fields; CeedOperator_Hip_gen *data; - // Check for shared bases - CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); + // Creation of the operator { - bool has_shared_bases = true, is_all_tensor = true, is_all_nontensor = true; - - for (CeedInt i = 0; i < num_input_fields; i++) { - CeedBasis basis; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } + bool is_good_build = false; - for (CeedInt i = 0; i < num_output_fields; i++) { - CeedBasis basis; - - CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis)); - if (basis != CEED_BASIS_NONE) { - bool is_tensor = true; - const char *resource; - char *resource_root; - Ceed basis_ceed; - - CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; - - CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); - CeedCallBackend(CeedGetResource(basis_ceed, &resource)); - 
CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); - CeedCallBackend(CeedFree(&resource_root)); - CeedCallBackend(CeedDestroy(&basis_ceed)); - } - CeedCallBackend(CeedBasisDestroy(&basis)); - } - // -- Fallback to ref if not all bases are shared - if (!has_shared_bases || (!is_all_tensor && !is_all_nontensor)) { + CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op, &is_good_build)); + if (!is_good_build) { CeedOperator op_fallback; - CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to unsupported bases"); + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to code generation issue"); CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); return CEED_ERROR_SUCCESS; @@ -113,11 +66,9 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedCallBackend(CeedOperatorGetQFunction(op, &qf)); CeedCallBackend(CeedQFunctionGetData(qf, &qf_data)); CeedCallBackend(CeedOperatorGetNumElements(op, &num_elem)); + CeedCallBackend(CeedOperatorGetFields(op, &num_input_fields, &op_input_fields, &num_output_fields, &op_output_fields)); CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields)); - // Creation of the operator - CeedCallBackend(CeedOperatorBuildKernel_Hip_gen(op)); - // Input vectors for (CeedInt i = 0; i < num_input_fields; i++) { CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode)); @@ -219,17 +170,20 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 
1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + CeedCallBackend( + CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs)); } else if (dim == 2) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + CeedCallBackend( + CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs)); } else if (dim == 3) { CeedInt grid = num_elem / block_sizes[2] + ((num_elem / block_sizes[2] * block_sizes[2] < num_elem) ? 1 : 0); CeedInt sharedMem = block_sizes[2] * thread_1d * thread_1d * sizeof(CeedScalar); - CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, opargs)); + CeedCallBackend( + CeedTryRunKernelDimShared_Hip(ceed, data->op, grid, block_sizes[0], block_sizes[1], block_sizes[2], sharedMem, &is_good_run, opargs)); } // Restore input arrays @@ -280,8 +234,21 @@ static int CeedOperatorApplyAdd_Hip_gen(CeedOperator op, CeedVector input_vec, C // Restore context data CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c)); + + // Cleanup CeedCallBackend(CeedDestroy(&ceed)); CeedCallBackend(CeedQFunctionDestroy(&qf)); + + // Fallback if run was bad (out of resources) + if (!is_good_run) { + CeedOperator op_fallback; + + data->use_fallback = true; + CeedDebug256(CeedOperatorReturnCeed(op), CEED_DEBUG_COLOR_SUCCESS, "Falling back to /gpu/hip/ref CeedOperator due to kernel execution issue"); + 
CeedCallBackend(CeedOperatorGetFallback(op, &op_fallback)); + CeedCallBackend(CeedOperatorApplyAdd(op_fallback, input_vec, output_vec, request)); + return CEED_ERROR_SUCCESS; + } return CEED_ERROR_SUCCESS; } diff --git a/backends/hip-gen/ceed-hip-gen.h b/backends/hip-gen/ceed-hip-gen.h index eb5dd0c893..760fef2ed5 100644 --- a/backends/hip-gen/ceed-hip-gen.h +++ b/backends/hip-gen/ceed-hip-gen.h @@ -12,6 +12,7 @@ #include typedef struct { + bool use_fallback; CeedInt dim; CeedInt Q_1d; CeedInt max_P_1d; diff --git a/backends/hip/ceed-hip-compile.cpp b/backends/hip/ceed-hip-compile.cpp index dface44ef6..51c83cf222 100644 --- a/backends/hip/ceed-hip-compile.cpp +++ b/backends/hip/ceed-hip-compile.cpp @@ -33,7 +33,8 @@ //------------------------------------------------------------------------------ // Compile HIP kernel //------------------------------------------------------------------------------ -int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) 
{ +static int CeedCompileCore_Hip(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, hipModule_t *module, + const CeedInt num_defines, va_list args) { size_t ptx_size; char *ptx; const int num_opts = 4; @@ -62,8 +63,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce // Kernel specific options, such as kernel constants if (num_defines > 0) { - va_list args; - va_start(args, num_defines); char *name; int val; @@ -72,7 +71,6 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce val = va_arg(args, int); code << "#define " << name << " " << val << "\n"; } - va_end(args); } // Standard libCEED definitions for HIP backends @@ -135,7 +133,8 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i])); } CeedCallBackend(CeedFree(&opts)); - if (result != HIPRTC_SUCCESS) { + *is_compile_good = result == HIPRTC_SUCCESS; + if (!*is_compile_good && throw_error) { size_t log_size; char *log; @@ -155,6 +154,25 @@ int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const Ce return CEED_ERROR_SUCCESS; } +int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...) { + bool is_compile_good = true; + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Hip(ceed, source, true, &is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + +int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...) 
{ + va_list args; + + va_start(args, num_defines); + CeedCallBackend(CeedCompileCore_Hip(ceed, source, false, is_compile_good, module, num_defines, args)); + va_end(args); + return CEED_ERROR_SUCCESS; +} + //------------------------------------------------------------------------------ // Get HIP kernel //------------------------------------------------------------------------------ @@ -183,9 +201,28 @@ int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, c //------------------------------------------------------------------------------ // Run HIP kernel for spatial dimension with shared memory //------------------------------------------------------------------------------ +static int CeedRunKernelDimSharedCore_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, void **args) { + hipError_t result = hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); + + *is_good_run = result == hipSuccess; + if (throw_error) CeedCallHip(ceed, result); + return CEED_ERROR_SUCCESS; +} + int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, const int shared_mem_size, void **args) { - CeedCallHip(ceed, hipModuleLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL)); + bool is_good_run = true; + + CeedCallBackend( + CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args)); + return CEED_ERROR_SUCCESS; +} + +int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, const int grid_size, const int block_size_x, const int block_size_y, + const int block_size_z, const int shared_mem_size, bool 
*is_good_run, void **args) { + CeedCallBackend( + CeedRunKernelDimSharedCore_Hip(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args)); return CEED_ERROR_SUCCESS; } diff --git a/backends/hip/ceed-hip-compile.h b/backends/hip/ceed-hip-compile.h index d990924ec2..0a29fad33e 100644 --- a/backends/hip/ceed-hip-compile.h +++ b/backends/hip/ceed-hip-compile.h @@ -13,6 +13,7 @@ static inline CeedInt CeedDivUpInt(CeedInt numerator, CeedInt denominator) { return (numerator + denominator - 1) / denominator; } CEED_INTERN int CeedCompile_Hip(Ceed ceed, const char *source, hipModule_t *module, const CeedInt num_defines, ...); +CEED_INTERN int CeedTryCompile_Hip(Ceed ceed, const char *source, bool *is_compile_good, hipModule_t *module, const CeedInt num_defines, ...); CEED_INTERN int CeedGetKernel_Hip(Ceed ceed, hipModule_t module, const char *name, hipFunction_t *kernel); @@ -23,3 +24,5 @@ CEED_INTERN int CeedRunKernelDim_Hip(Ceed ceed, hipFunction_t kernel, int grid_s CEED_INTERN int CeedRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, int shared_mem_size, void **args); +CEED_INTERN int CeedTryRunKernelDimShared_Hip(Ceed ceed, hipFunction_t kernel, int grid_size, int block_size_x, int block_size_y, int block_size_z, + int shared_mem_size, bool *is_good_run, void **args); From c9192aca9c02dc42a6a7d7a897b4af02df4a189e Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Fri, 7 Feb 2025 16:05:47 -0700 Subject: [PATCH 3/3] gpu - swap out bitwise assignment operators for bools Co-authored-by: Zach Atkins --- backends/cuda-gen/ceed-cuda-gen-operator-build.cpp | 12 ++++++------ backends/hip-gen/ceed-hip-gen-operator-build.cpp | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 69f6788c19..181346303c 100644 --- 
a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -954,12 +954,12 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); } @@ -977,13 +977,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op, bool *is_good_b Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/cuda/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/cuda/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); } diff --git a/backends/hip-gen/ceed-hip-gen-operator-build.cpp b/backends/hip-gen/ceed-hip-gen-operator-build.cpp index 4c3479ee21..77c642b9ca 100644 --- a/backends/hip-gen/ceed-hip-gen-operator-build.cpp +++ b/backends/hip-gen/ceed-hip-gen-operator-build.cpp @@ -980,12 +980,12 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, 
bool *is_good_bu Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); } @@ -1003,13 +1003,13 @@ extern "C" int CeedOperatorBuildKernel_Hip_gen(CeedOperator op, bool *is_good_bu Ceed basis_ceed; CeedCallBackend(CeedBasisIsTensor(basis, &is_tensor)); - is_all_tensor &= is_tensor; - is_all_nontensor &= !is_tensor; + is_all_tensor = is_all_tensor && is_tensor; + is_all_nontensor = is_all_nontensor && !is_tensor; CeedCallBackend(CeedBasisGetCeed(basis, &basis_ceed)); CeedCallBackend(CeedGetResource(basis_ceed, &resource)); CeedCallBackend(CeedGetResourceRoot(basis_ceed, resource, ":", &resource_root)); - has_shared_bases &= !strcmp(resource_root, "/gpu/hip/shared"); + has_shared_bases = has_shared_bases && !strcmp(resource_root, "/gpu/hip/shared"); CeedCallBackend(CeedFree(&resource_root)); CeedCallBackend(CeedDestroy(&basis_ceed)); }