Skip to content

Commit bc086f0

Browse files
committed
wip
1 parent 290fc47 commit bc086f0

File tree

4 files changed

+134
-32
lines changed

4 files changed

+134
-32
lines changed

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

Lines changed: 76 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
124124
// Setup fields
125125
//------------------------------------------------------------------------------
126126
static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
127-
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
127+
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
128+
bool use_3d_slices) {
128129
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
129130
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
130131
std::string option_name = (is_input ? "inputs" : "outputs");
@@ -163,16 +164,55 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
163164
case CEED_EVAL_NONE:
164165
break;
165166
case CEED_EVAL_INTERP:
166-
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
167-
else data->B.outputs[i] = basis_data->d_interp_1d;
167+
if (is_at_points) {
168+
// AtPoints
169+
if (!basis_data->d_chebyshev_interp_1d) {
170+
CeedSize interp_bytes;
171+
CeedScalar *chebyshev_interp_1d;
172+
173+
interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
174+
CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
175+
CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
176+
CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
177+
CeedCallCuda(CeedBasisReturnCeed(basis),
178+
cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
179+
CeedCallBackend(CeedFree(&chebyshev_interp_1d));
180+
}
181+
if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
182+
else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
183+
} else {
184+
// Standard quadrature
185+
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
186+
else data->B.outputs[i] = basis_data->d_interp_1d;
187+
}
168188
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
169189
code << " loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
170190
break;
171191
case CEED_EVAL_GRAD:
172-
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
173-
else data->B.outputs[i] = basis_data->d_interp_1d;
192+
if (is_at_points) {
193+
// AtPoints
194+
if (!basis_data->d_chebyshev_interp_1d) {
195+
CeedSize interp_bytes;
196+
CeedScalar *chebyshev_interp_1d;
197+
198+
interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
199+
CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
200+
CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
201+
CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
202+
CeedCallCuda(CeedBasisReturnCeed(basis),
203+
cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
204+
CeedCallBackend(CeedFree(&chebyshev_interp_1d));
205+
}
206+
if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
207+
else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
208+
} else {
209+
// Standard quadrature
210+
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
211+
else data->B.outputs[i] = basis_data->d_interp_1d;
212+
}
174213
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
175214
code << " loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
215+
if (is_at_points) break; // No G mat for AtPoints
176216
if (use_3d_slices) {
177217
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
178218
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
@@ -209,7 +249,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
209249
//------------------------------------------------------------------------------
210250
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
211251
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
212-
CeedInt Q_1d, bool is_input, bool use_3d_slices) {
252+
CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
213253
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
214254
std::string P_name = "P_1d" + var_suffix;
215255
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -318,7 +358,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
318358
//------------------------------------------------------------------------------
319359
static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
320360
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
321-
bool use_3d_slices) {
361+
bool is_at_points, bool use_3d_slices) {
322362
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
323363
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
324364
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -421,7 +461,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
421461
CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
422462
CeedInt num_output_fields, CeedOperatorField *op_output_fields,
423463
CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
424-
bool use_3d_slices) {
464+
bool is_at_points, bool use_3d_slices) {
425465
std::string Q_name = "Q_1d";
426466
CeedEvalMode eval_mode = CEED_EVAL_NONE;
427467
CeedElemRestriction elem_rstr;
@@ -636,9 +676,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
636676
// Build single operator kernel
637677
//------------------------------------------------------------------------------
638678
extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
639-
bool is_tensor = true, use_3d_slices = false;
679+
bool is_tensor = true, is_at_points = false, use_3d_slices = false;
640680
Ceed ceed;
641-
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1;
681+
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0;
642682
CeedQFunctionField *qf_input_fields, *qf_output_fields;
643683
CeedQFunction_Cuda_gen *qf_data;
644684
CeedQFunction qf;
@@ -661,17 +701,23 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
661701
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
662702

663703
// Get operator data
704+
CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
664705
CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
665706
qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
666707
if (dim == 0) dim = 1;
667708
data->dim = dim;
668709
if (Q_1d == 0) {
669-
CeedInt Q;
670-
671-
CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
672-
Q_1d = Q;
710+
CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
673711
}
674712
data->Q_1d = Q_1d;
713+
if (is_at_points) {
714+
CeedElemRestriction rstr_points = NULL;
715+
716+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
717+
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
718+
CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
719+
}
720+
if (is_at_points) use_3d_slices = false;
675721

676722
// Check for restriction only identity operator
677723
{
@@ -705,6 +751,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
705751
// TODO: Add non-tensor, AtPoints
706752
code << "// Tensor basis source\n";
707753
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
754+
code << "// AtPoints basis source\n";
755+
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
708756
code << "// CodeGen operator source\n";
709757
code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
710758

@@ -746,7 +794,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
746794
code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
747795
code << "// -----------------------------------------------------------------------------\n";
748796
code << "extern \"C\" __global__ void " << operator_name
749-
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n";
797+
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
798+
"points) {\n";
750799

751800
// Scratch buffers
752801
for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -776,11 +825,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
776825
// Initialize constants, and matrices B and G
777826
code << "\n // Input field constants and basis data\n";
778827
for (CeedInt i = 0; i < num_input_fields; i++) {
779-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
828+
CeedCallBackend(
829+
CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
780830
}
781831
code << "\n // Output field constants and basis data\n";
782832
for (CeedInt i = 0; i < num_output_fields; i++) {
783-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
833+
CeedCallBackend(
834+
CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
784835
}
785836

786837
// Loop over all elements
@@ -867,27 +918,29 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
867918

868919
// ---- Restriction
869920
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
870-
Q_1d, true, use_3d_slices));
921+
Q_1d, true, is_at_points, use_3d_slices));
871922

872923
// ---- Basis action
873-
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
924+
CeedCallBackend(
925+
CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
874926
}
875927

876928
// -- Q function
877929
CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
878-
op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
930+
op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, use_3d_slices));
879931

880932
// -- Output basis and restriction
881933
code << "\n // -- Output field basis action and restrictions\n";
882934
for (CeedInt i = 0; i < num_output_fields; i++) {
883935
code << " // ---- Output field " << i << "\n";
884936

885937
// ---- Basis action
886-
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
938+
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points,
939+
use_3d_slices));
887940

888941
// ---- Restriction
889-
CeedCallBackend(
890-
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
942+
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
943+
is_at_points, use_3d_slices));
891944
}
892945

893946
// Close loop and function

backends/cuda-gen/ceed-cuda-gen-operator.c

Lines changed: 51 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,14 @@
1919
// Destroy operator
2020
//------------------------------------------------------------------------------
2121
static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
22+
Ceed ceed;
2223
CeedOperator_Cuda_gen *impl;
2324

25+
CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2426
CeedCallBackend(CeedOperatorGetData(op, &impl));
27+
if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
2528
CeedCallBackend(CeedFree(&impl));
29+
CeedCallBackend(CeedDestroy(&ceed));
2630
return CEED_ERROR_SUCCESS;
2731
}
2832

@@ -92,6 +96,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
9296
// Apply and add to output
9397
//------------------------------------------------------------------------------
9498
static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
99+
bool is_at_points;
95100
Ceed ceed;
96101
Ceed_Cuda *cuda_data;
97102
CeedInt num_elem, num_input_fields, num_output_fields;
@@ -181,25 +186,52 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
181186
}
182187
}
183188

189+
// Point coordinates, if needed
190+
CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
191+
if (is_at_points) {
192+
// Coords
193+
CeedVector vec;
194+
195+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
196+
CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
197+
CeedCallBackend(CeedVectorDestroy(&vec));
198+
199+
// Points per elem
200+
if (num_elem != data->points.num_elem) {
201+
CeedInt *points_per_elem;
202+
const CeedInt num_bytes = num_elem * sizeof(CeedInt);
203+
CeedElemRestriction rstr_points = NULL;
204+
205+
data->points.num_elem = num_elem;
206+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
207+
CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
208+
for (CeedInt e = 0; e < num_elem; e++) {
209+
CeedInt num_points_elem;
210+
211+
CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
212+
points_per_elem[e] = num_points_elem;
213+
}
214+
if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
215+
CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
216+
CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
217+
CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
218+
CeedCallBackend(CeedFree(&points_per_elem));
219+
}
220+
}
221+
184222
// Get context data
185223
CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
186224

187225
// Apply operator
188-
void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
226+
void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
189227
const CeedInt dim = data->dim;
190228
const CeedInt Q_1d = data->Q_1d;
191229
const CeedInt P_1d = data->max_P_1d;
192230
const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
193-
int max_threads_per_block, min_grid_size;
231+
int max_threads_per_block, min_grid_size, grid;
194232

195233
CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
196-
int block[3] =
197-
{
198-
thread_1d,
199-
dim < 2 ? 1 : thread_1d,
200-
-1,
201-
},
202-
grid;
234+
int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};
203235

204236
CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
205237
cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
@@ -236,6 +268,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
236268
if (is_active) vec = output_vec;
237269
// Check for multiple output modes
238270
CeedInt index = -1;
271+
239272
for (CeedInt j = 0; j < i; j++) {
240273
if (vec == output_vecs[j]) {
241274
index = j;
@@ -249,6 +282,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
249282
}
250283
}
251284

285+
// Restore point coordinates, if needed
286+
if (is_at_points) {
287+
CeedVector vec;
288+
289+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
290+
CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
291+
CeedCallBackend(CeedVectorDestroy(&vec));
292+
}
293+
252294
// Restore context data
253295
CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
254296
CeedCallBackend(CeedDestroy(&ceed));

backends/cuda-gen/ceed-cuda-gen.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ typedef struct {
2222
Fields_Cuda B;
2323
Fields_Cuda G;
2424
CeedScalar *W;
25+
Points_Cuda points;
2526
} CeedOperator_Cuda_gen;
2627

2728
typedef struct {

include/ceed/jit-source/cuda/cuda-types.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ typedef struct {
2323
CeedInt *outputs[CEED_CUDA_NUMBER_FIELDS];
2424
} FieldsInt_Cuda;
2525

26+
typedef struct {
27+
CeedInt num_elem;
28+
const CeedInt *num_per_elem;
29+
const CeedScalar *coords;
30+
} Points_Cuda;
31+
2632
typedef struct {
2733
CeedInt t_id_x;
2834
CeedInt t_id_y;

0 commit comments

Comments
 (0)