Skip to content

Commit 16898a3

Browse files
committed
wip
1 parent 5a7f61c commit 16898a3

File tree

4 files changed

+136
-32
lines changed

4 files changed

+136
-32
lines changed

backends/cuda-gen/ceed-cuda-gen-operator-build.cpp

+76-23
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
124124
// Setup fields
125125
//------------------------------------------------------------------------------
126126
static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
127-
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
127+
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
128+
bool use_3d_slices) {
128129
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
129130
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
130131
std::string option_name = (is_input ? "inputs" : "outputs");
@@ -163,16 +164,55 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
163164
case CEED_EVAL_NONE:
164165
break;
165166
case CEED_EVAL_INTERP:
166-
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
167-
else data->B.outputs[i] = basis_data->d_interp_1d;
167+
if (is_at_points) {
168+
// AtPoints
169+
if (!basis_data->d_chebyshev_interp_1d) {
170+
CeedSize interp_bytes;
171+
CeedScalar *chebyshev_interp_1d;
172+
173+
interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
174+
CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
175+
CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
176+
CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
177+
CeedCallCuda(CeedBasisReturnCeed(basis),
178+
cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
179+
CeedCallBackend(CeedFree(&chebyshev_interp_1d));
180+
}
181+
if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
182+
else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
183+
} else {
184+
// Standard quadrature
185+
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
186+
else data->B.outputs[i] = basis_data->d_interp_1d;
187+
}
168188
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
169189
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
170190
break;
171191
case CEED_EVAL_GRAD:
172-
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
173-
else data->B.outputs[i] = basis_data->d_interp_1d;
192+
if (is_at_points) {
193+
// AtPoints
194+
if (!basis_data->d_chebyshev_interp_1d) {
195+
CeedSize interp_bytes;
196+
CeedScalar *chebyshev_interp_1d;
197+
198+
interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
199+
CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
200+
CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
201+
CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
202+
CeedCallCuda(CeedBasisReturnCeed(basis),
203+
cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
204+
CeedCallBackend(CeedFree(&chebyshev_interp_1d));
205+
}
206+
if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
207+
else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
208+
} else {
209+
// Standard quadrature
210+
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
211+
else data->B.outputs[i] = basis_data->d_interp_1d;
212+
}
174213
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
175214
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
215+
if (is_at_points) break; // No G mat for AtPoints
176216
if (use_3d_slices) {
177217
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
178218
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
@@ -208,7 +248,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
208248
//------------------------------------------------------------------------------
209249
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
210250
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
211-
CeedInt Q_1d, bool is_input, bool use_3d_slices) {
251+
CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
212252
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
213253
std::string P_name = "P_1d" + var_suffix;
214254
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -335,7 +375,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
335375
//------------------------------------------------------------------------------
336376
static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
337377
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
338-
bool use_3d_slices) {
378+
bool is_at_points, bool use_3d_slices) {
339379
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
340380
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
341381
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -438,7 +478,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
438478
CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
439479
CeedInt num_output_fields, CeedOperatorField *op_output_fields,
440480
CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
441-
bool use_3d_slices) {
481+
bool is_at_points, bool use_3d_slices) {
442482
std::string Q_name = "Q_1d";
443483
CeedEvalMode eval_mode = CEED_EVAL_NONE;
444484
CeedElemRestriction elem_rstr;
@@ -653,9 +693,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
653693
// Build single operator kernel
654694
//------------------------------------------------------------------------------
655695
extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
656-
bool is_tensor = true, use_3d_slices = false;
696+
bool is_tensor = true, is_at_points = false, use_3d_slices = false;
657697
Ceed ceed;
658-
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1;
698+
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0;
659699
CeedQFunctionField *qf_input_fields, *qf_output_fields;
660700
CeedQFunction_Cuda_gen *qf_data;
661701
CeedQFunction qf;
@@ -678,17 +718,23 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
678718
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
679719

680720
// Get operator data
721+
CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
681722
CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
682723
qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
683724
if (dim == 0) dim = 1;
684725
data->dim = dim;
685726
if (Q_1d == 0) {
686-
CeedInt Q;
687-
688-
CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
689-
Q_1d = Q;
727+
CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
690728
}
691729
data->Q_1d = Q_1d;
730+
if (is_at_points) {
731+
CeedElemRestriction rstr_points = NULL;
732+
733+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
734+
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
735+
CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
736+
}
737+
if (is_at_points) use_3d_slices = false;
692738

693739
// Check for restriction only identity operator
694740
{
@@ -722,6 +768,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
722768
// TODO: Add non-tensor, AtPoints
723769
code << "// Tensor basis source\n";
724770
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
771+
code << "// AtPoints basis source\n";
772+
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
725773
code << "// CodeGen operator source\n";
726774
code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
727775

@@ -763,7 +811,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
763811
code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
764812
code << "// -----------------------------------------------------------------------------\n";
765813
code << "extern \"C\" __global__ void " << operator_name
766-
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n";
814+
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
815+
"points) {\n";
767816

768817
// Scratch buffers
769818
for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -793,11 +842,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
793842
// Initialize constants, and matrices B and G
794843
code << "\n // Input field constants and basis data\n";
795844
for (CeedInt i = 0; i < num_input_fields; i++) {
796-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
845+
CeedCallBackend(
846+
CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
797847
}
798848
code << "\n // Output field constants and basis data\n";
799849
for (CeedInt i = 0; i < num_output_fields; i++) {
800-
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
850+
CeedCallBackend(
851+
CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
801852
}
802853

803854
// Loop over all elements
@@ -884,27 +935,29 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
884935

885936
// ---- Restriction
886937
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
887-
Q_1d, true, use_3d_slices));
938+
Q_1d, true, is_at_points, use_3d_slices));
888939

889940
// ---- Basis action
890-
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
941+
CeedCallBackend(
942+
CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
891943
}
892944

893945
// -- Q function
894946
CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
895-
op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
947+
op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, use_3d_slices));
896948

897949
// -- Output basis and restriction
898950
code << "\n // -- Output field basis action and restrictions\n";
899951
for (CeedInt i = 0; i < num_output_fields; i++) {
900952
code << " // ---- Output field " << i << "\n";
901953

902954
// ---- Basis action
903-
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
955+
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points,
956+
use_3d_slices));
904957

905958
// ---- Restriction
906-
CeedCallBackend(
907-
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
959+
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
960+
is_at_points, use_3d_slices));
908961
}
909962

910963
// Close loop and function

backends/cuda-gen/ceed-cuda-gen-operator.c

+53-9
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include <ceed.h>
99
#include <ceed/backend.h>
1010
#include <ceed/jit-source/cuda/cuda-types.h>
11+
#include <cuda.h>
12+
#include <cuda_runtime.h>
1113
#include <stddef.h>
1214

1315
#include "../cuda/ceed-cuda-common.h"
@@ -19,10 +21,14 @@
1921
// Destroy operator
2022
//------------------------------------------------------------------------------
2123
static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
24+
Ceed ceed;
2225
CeedOperator_Cuda_gen *impl;
2326

27+
CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
2428
CeedCallBackend(CeedOperatorGetData(op, &impl));
29+
if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
2530
CeedCallBackend(CeedFree(&impl));
31+
CeedCallBackend(CeedDestroy(&ceed));
2632
return CEED_ERROR_SUCCESS;
2733
}
2834

@@ -92,6 +98,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
9298
// Apply and add to output
9399
//------------------------------------------------------------------------------
94100
static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
101+
bool is_at_points;
95102
Ceed ceed;
96103
Ceed_Cuda *cuda_data;
97104
CeedInt num_elem, num_input_fields, num_output_fields;
@@ -181,25 +188,52 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
181188
}
182189
}
183190

191+
// Point coordinates, if needed
192+
CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
193+
if (is_at_points) {
194+
// Coords
195+
CeedVector vec;
196+
197+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
198+
CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
199+
CeedCallBackend(CeedVectorDestroy(&vec));
200+
201+
// Points per elem
202+
if (num_elem != data->points.num_elem) {
203+
CeedInt *points_per_elem;
204+
const CeedInt num_bytes = num_elem * sizeof(CeedInt);
205+
CeedElemRestriction rstr_points = NULL;
206+
207+
data->points.num_elem = num_elem;
208+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
209+
CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
210+
for (CeedInt e = 0; e < num_elem; e++) {
211+
CeedInt num_points_elem;
212+
213+
CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
214+
points_per_elem[e] = num_points_elem;
215+
}
216+
if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
217+
CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
218+
CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
219+
CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
220+
CeedCallBackend(CeedFree(&points_per_elem));
221+
}
222+
}
223+
184224
// Get context data
185225
CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
186226

187227
// Apply operator
188-
void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
228+
void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
189229
const CeedInt dim = data->dim;
190230
const CeedInt Q_1d = data->Q_1d;
191231
const CeedInt P_1d = data->max_P_1d;
192232
const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
193-
int max_threads_per_block, min_grid_size;
233+
int max_threads_per_block, min_grid_size, grid;
194234

195235
CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
196-
int block[3] =
197-
{
198-
thread_1d,
199-
dim < 2 ? 1 : thread_1d,
200-
-1,
201-
},
202-
grid;
236+
int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};
203237

204238
CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
205239
cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
@@ -236,6 +270,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
236270
if (is_active) vec = output_vec;
237271
// Check for multiple output modes
238272
CeedInt index = -1;
273+
239274
for (CeedInt j = 0; j < i; j++) {
240275
if (vec == output_vecs[j]) {
241276
index = j;
@@ -249,6 +284,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
249284
}
250285
}
251286

287+
// Restore point coordinates, if needed
288+
if (is_at_points) {
289+
CeedVector vec;
290+
291+
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
292+
CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
293+
CeedCallBackend(CeedVectorDestroy(&vec));
294+
}
295+
252296
// Restore context data
253297
CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
254298
CeedCallBackend(CeedDestroy(&ceed));

backends/cuda-gen/ceed-cuda-gen.h

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ typedef struct {
2222
Fields_Cuda B;
2323
Fields_Cuda G;
2424
CeedScalar *W;
25+
Points_Cuda points;
2526
} CeedOperator_Cuda_gen;
2627

2728
typedef struct {

include/ceed/jit-source/cuda/cuda-types.h

+6
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@ typedef struct {
2323
CeedInt *outputs[CEED_CUDA_NUMBER_FIELDS];
2424
} FieldsInt_Cuda;
2525

26+
typedef struct {
27+
CeedInt num_elem;
28+
const CeedInt *num_per_elem;
29+
const CeedScalar *coords;
30+
} Points_Cuda;
31+
2632
typedef struct {
2733
CeedInt t_id_x;
2834
CeedInt t_id_y;

0 commit comments

Comments
 (0)