Skip to content

Commit

Permalink
cuda - update shared max matrix sizes
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Jan 3, 2025
1 parent 0ff6123 commit 3e42d60
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 5 deletions.
9 changes: 7 additions & 2 deletions backends/cuda-shared/ceed-cuda-shared-basis.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
// Device-side initializers implemented in kernels/*.cu. Each copies the given
// device basis matrices into GPU __constant__ memory and returns the constant
// symbol address through the c_B/c_G output pointers (see
// kernels/cuda-shared-basis-nontensor.cu for the non-tensor variant).
// NOTE(review): the non-tensor prototype below takes (d_B, dim, P, Q, c_B);
// confirm the .cu definition uses the same argument order — C linkage will not
// catch a mismatch.
int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B);
int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr);
int CeedInit_CudaNonTensor(CeedScalar *d_B, CeedInt dim, CeedInt P, CeedInt Q, CeedScalar **c_B);

//------------------------------------------------------------------------------
// Apply tensor basis
Expand Down Expand Up @@ -456,7 +457,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
CeedInt thread = CeedIntMax(Q, P);

CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P, Q, &data->c_B));
CeedCallBackend(CeedInit_CudaNonTensor(data->d_interp_1d, 1, P, Q, &data->c_B));
void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v};

{
Expand All @@ -480,7 +481,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a
CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q));
CeedInt thread = CeedIntMax(Q, P);

CeedCallBackend(CeedInit_CudaInterp(data->d_grad_1d, P, Q * dim, &data->c_G));
CeedCallBackend(CeedInit_CudaNonTensor(data->d_grad_1d, 3, P, Q * dim, &data->c_G));
void *grad_args[] = {(void *)&num_elem, &data->c_G, &d_u, &d_v};

{
Expand Down Expand Up @@ -641,6 +642,10 @@ int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt nu
CeedCallBackend(CeedBasisGetCeed(basis, &ceed));
CeedCallBackend(CeedCalloc(1, &data));

// Check max sizes
CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3");
CeedCheck(num_nodes * num_qpts * dim < 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large");

// Copy basis data to GPU
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp));
CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad));
Expand Down
25 changes: 25 additions & 0 deletions backends/cuda-shared/kernels/cuda-shared-basis-nontensor.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

#include <ceed.h>
#include <cuda.h>

// Maximum non-tensor basis matrix footprint held in constant memory.
// 52 * 52 * 3 CeedScalar entries (~63 KiB if CeedScalar is double — just under
// the 64 KiB constant-memory limit; TODO confirm for single-precision builds).
const int MAX_SIZE = 52, MAX_DIM = 3;
// Constant-memory copy of the active basis matrix (interp or grad), populated
// by CeedInit_CudaNonTensor below.
__constant__ CeedScalar c_B[MAX_SIZE * MAX_SIZE * MAX_DIM];

//------------------------------------------------------------------------------
// Non-tensor basis device initialization
//
// Copies the P * Q * dim entries of the device basis matrix d_B into constant
// memory and returns the constant-memory address through c_B_ptr.
//
// The argument order (d_B, dim, P, Q, c_B_ptr) matches the C-linkage prototype
// and the call sites in ceed-cuda-shared-basis.c; the previous definition used
// (d_B, P, Q, dim, c_B_ptr), which silently disagreed with the prototype
// (extern "C" performs no signature checking across translation units).
//------------------------------------------------------------------------------
extern "C" int CeedInit_CudaNonTensor(CeedScalar *d_B, CeedInt dim, CeedInt P, CeedInt Q, CeedScalar **c_B_ptr) {
  const size_t bytes = (size_t)P * Q * dim * sizeof(CeedScalar);

  // Guard against overflowing the fixed-size constant buffer; the host-side
  // basis creation checks this too, but fail loudly rather than corrupt memory
  if (bytes > sizeof(c_B)) return CEED_ERROR_BACKEND;
  // Surface CUDA failures to the caller (call sites wrap in CeedCallBackend)
  if (cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice) != cudaSuccess) return CEED_ERROR_BACKEND;
  if (cudaGetSymbolAddress((void **)c_B_ptr, c_B) != cudaSuccess) return CEED_ERROR_BACKEND;
  return CEED_ERROR_SUCCESS;
}

//------------------------------------------------------------------------------
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
#include <ceed.h>
#include <cuda.h>

const int sizeMax = 16;
__constant__ CeedScalar c_B[sizeMax * sizeMax];
__constant__ CeedScalar c_G[sizeMax * sizeMax];
// Maximum supported 1D tensor basis dimension (P_1d, Q_1d <= MAX_SIZE)
const int MAX_SIZE = 16;
// Constant-memory copies of the 1D interpolation and gradient matrices,
// populated by the CeedInit_Cuda* host functions in this file
__constant__ CeedScalar c_B[MAX_SIZE * MAX_SIZE];
__constant__ CeedScalar c_G[MAX_SIZE * MAX_SIZE];

//------------------------------------------------------------------------------
// Interp device initialization
Expand Down

0 comments on commit 3e42d60

Please sign in to comment.