diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index 72bc9950ad..0132271bd1 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -156,7 +156,6 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C code << " const CeedInt " << P_name << " = " << (basis == CEED_BASIS_NONE ? Q_1d : P_1d) << ";\n"; code << " const CeedInt num_comp" << var_suffix << " = " << num_comp << ";\n"; } - CeedCallBackend(CeedBasisDestroy(&basis)); // Load basis data code << " // EvalMode: " << CeedEvalModes[eval_mode] << "\n"; @@ -240,6 +239,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C break; // TODO: Not implemented // LCOV_EXCL_STOP } + CeedCallBackend(CeedBasisDestroy(&basis)); return CEED_ERROR_SUCCESS; } @@ -319,10 +319,21 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, << strides[2] << ">(data, elem, d" << var_suffix << ", r_e" << var_suffix << ");\n"; break; } + case CEED_RESTRICTION_POINTS: { + CeedInt comp_stride; + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + data->indices.inputs[i] = (CeedInt *)rstr_data->d_offsets; + code << " ReadLVecStandard" << dim << "d(data, l_size" + << var_suffix << ", elem, indices.inputs[" << i << "], d" << var_suffix << ", r_e" << var_suffix << ");\n"; + break; + } // LCOV_EXCL_START case CEED_RESTRICTION_ORIENTED: case CEED_RESTRICTION_CURL_ORIENTED: - case CEED_RESTRICTION_POINTS: break; // TODO: Not implemented // LCOV_EXCL_STOP } @@ -358,10 +369,21 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, << strides[2] << ">(data, elem, r_e" << var_suffix << ", d" << var_suffix << ");\n"; break; } + case CEED_RESTRICTION_POINTS: { + CeedInt comp_stride; + + CeedCallBackend(CeedElemRestrictionGetLVectorSize(elem_rstr, &l_size)); + code << " const CeedInt l_size" << var_suffix << " = " << l_size << ";\n"; + CeedCallBackend(CeedElemRestrictionGetCompStride(elem_rstr, &comp_stride)); + code << " // CompStride: " << comp_stride << "\n"; + data->indices.outputs[i] = (CeedInt *)rstr_data->d_offsets; + code << " WriteLVecAtPoints" << dim << "d(data, l_size" << var_suffix + << ", elem, indices.outputs[" << i << "], points.num_per_elem, r_e" << var_suffix << ", d" << var_suffix << ");\n"; + break; + } // LCOV_EXCL_START case CEED_RESTRICTION_ORIENTED: case CEED_RESTRICTION_CURL_ORIENTED: - case CEED_RESTRICTION_POINTS: break; // TODO: Not implemented // LCOV_EXCL_STOP } @@ -406,9 +428,18 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO } break; case CEED_EVAL_INTERP: - code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; - code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + if (is_at_points) { + code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*" << Q_name << "];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + } else { + code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*max_num_points];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + + code << " InterpAtPoints<" << dim << ", num_comp" << var_suffix << ", max_num_points, " << Q_name + << ">(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + } break; case CEED_EVAL_GRAD: if (use_3d_slices) { @@ -416,10 +447,16 @@ static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedO code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; } else { - code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*" << Q_name << "];\n"; - code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" - << var_suffix << ");\n"; + if (is_at_points) { + } else { + code << " CeedScalar r_q" << var_suffix << "[num_comp" << var_suffix << "*dim*max_num_points];\n"; + code << " Interp" << (dim > 1 ? "Tensor" : "") << dim << "d(data, r_e" << var_suffix << ", s_B" << var_suffix << ", r_q" << var_suffix << ");\n"; + + code << " Grad" << (dim > 1 ? "Tensor" : "") << (dim == 3 && Q_1d >= P_1d ? "Collocated" : "") << dim << "d(data, r_e" << var_suffix << ", s_B" << var_suffix << ", s_G" << var_suffix << ", r_q" + << var_suffix << ");\n"; + } } break; case CEED_EVAL_WEIGHT: { diff --git a/backends/cuda-gen/ceed-cuda-gen.c b/backends/cuda-gen/ceed-cuda-gen.c index 0ab817186d..213a769052 100644 --- a/backends/cuda-gen/ceed-cuda-gen.c +++ b/backends/cuda-gen/ceed-cuda-gen.c @@ -39,6 +39,7 @@ static int CeedInit_Cuda_gen(const char *resource, Ceed ceed) { CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "QFunctionCreate", CeedQFunctionCreate_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreate", CeedOperatorCreate_Cuda_gen)); + CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "OperatorCreateAtPoints", CeedOperatorCreate_Cuda_gen)); CeedCallBackend(CeedSetBackendFunction(ceed, "Ceed", ceed, "Destroy", CeedDestroy_Cuda)); return CEED_ERROR_SUCCESS; } diff --git a/include/ceed/jit-source/cuda/cuda-gen-templates.h b/include/ceed/jit-source/cuda/cuda-gen-templates.h index 8a79ba5989..ededdbe0e3 100644 --- a/include/ceed/jit-source/cuda/cuda-gen-templates.h +++ b/include/ceed/jit-source/cuda/cuda-gen-templates.h @@ -63,6 +63,22 @@ inline __device__ void WriteLVecStandard1d(SharedData_Cuda &data, const CeedInt } } +//------------------------------------------------------------------------------ +// E-vector -> L-vector, AtPoints +//------------------------------------------------------------------------------ +template +inline __device__ void writeDofsAtPoints1d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1d) { + const CeedInt node = data.t_id_x; + const CeedInt ind = indices[node + elem * P_1d]; + + if (node < points_per_elem[elem]) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); + } + } +} + //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ @@ -123,6 +139,22 @@ inline __device__ void WriteLVecStandard2d(SharedData_Cuda &data, const CeedInt } } +//------------------------------------------------------------------------------ +// E-vector -> L-vector, AtPoints +//------------------------------------------------------------------------------ +template +inline __device__ void writeDofsAtPoints2d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1d && data.t_id_y < P_1d) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d; + const CeedInt ind = indices[node + elem * P_1d * P_1d]; + + if (node < points_per_elem[elem]) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[comp]); + } + } +} + //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------ @@ -215,6 +247,23 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt } } +//------------------------------------------------------------------------------ +// E-vector -> L-vector, AtPoints +//------------------------------------------------------------------------------ +template +inline __device__ void writeDofsAtPoints3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices, + const CeedInt *__restrict__ points_per_elem, const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) { + if (data.t_id_x < P_1d && data.t_id_y < P_1d) + for (CeedInt z = 0; z < P_1d; z++) { + const CeedInt node = data.t_id_x + data.t_id_y * P_1d + z * P_1d * P_1d; + const CeedInt ind = indices[node + elem * P_1d * P_1d * P_1d]; + + if (node < points_per_elem[elem]) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) atomicAdd(&d_v[ind + COMP_STRIDE * comp], r_v[z + comp * P_1d]); + } + } +} + //------------------------------------------------------------------------------ // E-vector -> L-vector, strided //------------------------------------------------------------------------------