diff --git a/backends/cuda-shared/ceed-cuda-shared-basis.c b/backends/cuda-shared/ceed-cuda-shared-basis.c index eb344f09eb..acbf843bee 100644 --- a/backends/cuda-shared/ceed-cuda-shared-basis.c +++ b/backends/cuda-shared/ceed-cuda-shared-basis.c @@ -24,6 +24,7 @@ int CeedInit_CudaInterp(CeedScalar *d_B, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B); int CeedInit_CudaGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); int CeedInit_CudaCollocatedGrad(CeedScalar *d_B, CeedScalar *d_G, CeedInt P_1d, CeedInt Q_1d, CeedScalar **c_B_ptr, CeedScalar **c_G_ptr); +int CeedInit_CudaNonTensor(CeedScalar *d_B, CeedInt dim, CeedInt P, CeedInt Q, CeedScalar **c_B); //------------------------------------------------------------------------------ // Apply tensor basis @@ -456,7 +457,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); CeedInt thread = CeedIntMax(Q, P); - CeedCallBackend(CeedInit_CudaInterp(data->d_interp_1d, P, Q, &data->c_B)); + CeedCallBackend(CeedInit_CudaNonTensor(data->d_interp_1d, 1, P, Q, &data->c_B)); void *interp_args[] = {(void *)&num_elem, &data->c_B, &d_u, &d_v}; { @@ -480,7 +481,7 @@ static int CeedBasisApplyNonTensorCore_Cuda_shared(CeedBasis basis, bool apply_a CeedCallBackend(CeedBasisGetNumQuadraturePoints(basis, &Q)); CeedInt thread = CeedIntMax(Q, P); - CeedCallBackend(CeedInit_CudaInterp(data->d_grad_1d, P, Q * dim, &data->c_G)); + CeedCallBackend(CeedInit_CudaNonTensor(data->d_grad_1d, 1, P, Q * dim, &data->c_G)); void *grad_args[] = {(void *)&num_elem, &data->c_G, &d_u, &d_v}; { @@ -641,6 +642,10 @@ int CeedBasisCreateH1_Cuda_shared(CeedElemTopology topo, CeedInt dim, CeedInt nu CeedCallBackend(CeedBasisGetCeed(basis, &ceed)); CeedCallBackend(CeedCalloc(1, &data)); + // Check max sizes + CeedCheck(dim <= 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with dim > 3"); 
+ CeedCheck(num_nodes * num_qpts * dim <= 52 * 52 * 3, ceed, CEED_ERROR_BACKEND, "Backend does not implement nontensor bases with P * Q this large"); + // Copy basis data to GPU CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_INTERP, &q_comp_interp)); CeedCallBackend(CeedBasisGetNumQuadratureComponents(basis, CEED_EVAL_GRAD, &q_comp_grad)); diff --git a/backends/cuda-shared/kernels/cuda-shared-basis-nontensor.cu b/backends/cuda-shared/kernels/cuda-shared-basis-nontensor.cu new file mode 100644 index 0000000000..1879138884 --- /dev/null +++ b/backends/cuda-shared/kernels/cuda-shared-basis-nontensor.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. +// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. +// +// SPDX-License-Identifier: BSD-2-Clause +// +// This file is part of CEED: http://github.com/ceed +#include <ceed.h> +#include <cuda.h> +const int MAX_SIZE = 52, MAX_DIM = 3; +__constant__ CeedScalar c_B[MAX_SIZE * MAX_SIZE * MAX_DIM]; +//------------------------------------------------------------------------------ +// Interp device initialization +//------------------------------------------------------------------------------ +extern "C" int CeedInit_CudaNonTensor(CeedScalar *d_B, CeedInt dim, CeedInt P, CeedInt Q, CeedScalar **c_B_ptr) { + const int bytes = P * Q * dim * sizeof(CeedScalar); + + cudaMemcpyToSymbol(c_B, d_B, bytes, 0, cudaMemcpyDeviceToDevice); + cudaGetSymbolAddress((void **)c_B_ptr, c_B); + return CEED_ERROR_SUCCESS; +} + +//------------------------------------------------------------------------------ diff --git a/backends/cuda-shared/kernels/cuda-shared-basis.cu b/backends/cuda-shared/kernels/cuda-shared-basis-tensor.cu similarity index 94% rename from backends/cuda-shared/kernels/cuda-shared-basis.cu rename to backends/cuda-shared/kernels/cuda-shared-basis-tensor.cu index f654f7ddda..48fb2056c3 100644 --- 
a/backends/cuda-shared/kernels/cuda-shared-basis.cu +++ b/backends/cuda-shared/kernels/cuda-shared-basis-tensor.cu @@ -8,9 +8,9 @@ #include <ceed.h> #include <cuda.h> -const int sizeMax = 16; -__constant__ CeedScalar c_B[sizeMax * sizeMax]; -__constant__ CeedScalar c_G[sizeMax * sizeMax]; +const int MAX_SIZE = 16; +__constant__ CeedScalar c_B[MAX_SIZE * MAX_SIZE]; +__constant__ CeedScalar c_G[MAX_SIZE * MAX_SIZE]; //------------------------------------------------------------------------------ // Interp device initialization