@@ -124,7 +124,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
124
124
// Setup fields
125
125
// ------------------------------------------------------------------------------
126
126
static int CeedOperatorBuildKernelFieldData_Cuda_gen (std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
127
- CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
127
+ CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
128
+ bool use_3d_slices) {
128
129
std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
129
130
std::string P_name = " P_1d" + var_suffix, Q_name = " Q_1d" ;
130
131
std::string option_name = (is_input ? " inputs" : " outputs" );
@@ -163,16 +164,55 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
163
164
case CEED_EVAL_NONE:
164
165
break ;
165
166
case CEED_EVAL_INTERP:
166
- if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
167
- else data->B .outputs [i] = basis_data->d_interp_1d ;
167
+ if (is_at_points) {
168
+ // AtPoints
169
+ if (!basis_data->d_chebyshev_interp_1d ) {
170
+ CeedSize interp_bytes;
171
+ CeedScalar *chebyshev_interp_1d;
172
+
173
+ interp_bytes = P_1d * Q_1d * sizeof (CeedScalar);
174
+ CeedCallBackend (CeedCalloc (P_1d * Q_1d, &chebyshev_interp_1d));
175
+ CeedCallBackend (CeedBasisGetChebyshevInterp1D (basis, chebyshev_interp_1d));
176
+ CeedCallCuda (CeedBasisReturnCeed (basis), cudaMalloc ((void **)&basis_data->d_chebyshev_interp_1d , interp_bytes));
177
+ CeedCallCuda (CeedBasisReturnCeed (basis),
178
+ cudaMemcpy (basis_data->d_chebyshev_interp_1d , chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
179
+ CeedCallBackend (CeedFree (&chebyshev_interp_1d));
180
+ }
181
+ if (is_input) data->B .inputs [i] = basis_data->d_chebyshev_interp_1d ;
182
+ else data->B .outputs [i] = basis_data->d_chebyshev_interp_1d ;
183
+ } else {
184
+ // Standard quadrature
185
+ if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
186
+ else data->B .outputs [i] = basis_data->d_interp_1d ;
187
+ }
168
188
code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_1d * Q_1d << " ];\n " ;
169
189
code << " loadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
170
190
break ;
171
191
case CEED_EVAL_GRAD:
172
- if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
173
- else data->B .outputs [i] = basis_data->d_interp_1d ;
192
+ if (is_at_points) {
193
+ // AtPoints
194
+ if (!basis_data->d_chebyshev_interp_1d ) {
195
+ CeedSize interp_bytes;
196
+ CeedScalar *chebyshev_interp_1d;
197
+
198
+ interp_bytes = P_1d * Q_1d * sizeof (CeedScalar);
199
+ CeedCallBackend (CeedCalloc (P_1d * Q_1d, &chebyshev_interp_1d));
200
+ CeedCallBackend (CeedBasisGetChebyshevInterp1D (basis, chebyshev_interp_1d));
201
+ CeedCallCuda (CeedBasisReturnCeed (basis), cudaMalloc ((void **)&basis_data->d_chebyshev_interp_1d , interp_bytes));
202
+ CeedCallCuda (CeedBasisReturnCeed (basis),
203
+ cudaMemcpy (basis_data->d_chebyshev_interp_1d , chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
204
+ CeedCallBackend (CeedFree (&chebyshev_interp_1d));
205
+ }
206
+ if (is_input) data->B .inputs [i] = basis_data->d_chebyshev_interp_1d ;
207
+ else data->B .outputs [i] = basis_data->d_chebyshev_interp_1d ;
208
+ } else {
209
+ // Standard quadrature
210
+ if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
211
+ else data->B .outputs [i] = basis_data->d_interp_1d ;
212
+ }
174
213
code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_1d * Q_1d << " ];\n " ;
175
214
code << " loadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
215
+ if (is_at_points) break ; // No G mat for AtPoints
176
216
if (use_3d_slices) {
177
217
if (is_input) data->G .inputs [i] = basis_data->d_collo_grad_1d ;
178
218
else data->G .outputs [i] = basis_data->d_collo_grad_1d ;
@@ -209,7 +249,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
209
249
// ------------------------------------------------------------------------------
210
250
static int CeedOperatorBuildKernelRestriction_Cuda_gen (std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
211
251
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
212
- CeedInt Q_1d, bool is_input, bool use_3d_slices) {
252
+ CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
213
253
std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
214
254
std::string P_name = " P_1d" + var_suffix;
215
255
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -318,7 +358,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
318
358
// ------------------------------------------------------------------------------
319
359
static int CeedOperatorBuildKernelBasis_Cuda_gen (std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
320
360
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
321
- bool use_3d_slices) {
361
+ bool is_at_points, bool use_3d_slices) {
322
362
std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
323
363
std::string P_name = " P_1d" + var_suffix, Q_name = " Q_1d" ;
324
364
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -421,7 +461,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
421
461
CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
422
462
CeedInt num_output_fields, CeedOperatorField *op_output_fields,
423
463
CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
424
- bool use_3d_slices) {
464
+ bool is_at_points, bool use_3d_slices) {
425
465
std::string Q_name = " Q_1d" ;
426
466
CeedEvalMode eval_mode = CEED_EVAL_NONE;
427
467
CeedElemRestriction elem_rstr;
@@ -636,9 +676,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
636
676
// Build single operator kernel
637
677
// ------------------------------------------------------------------------------
638
678
extern " C" int CeedOperatorBuildKernel_Cuda_gen (CeedOperator op) {
639
- bool is_tensor = true , use_3d_slices = false ;
679
+ bool is_tensor = true , is_at_points = false , use_3d_slices = false ;
640
680
Ceed ceed;
641
- CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1 ;
681
+ CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1 , max_num_points = 0 ;
642
682
CeedQFunctionField *qf_input_fields, *qf_output_fields;
643
683
CeedQFunction_Cuda_gen *qf_data;
644
684
CeedQFunction qf;
@@ -661,17 +701,23 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
661
701
CeedCallBackend (CeedQFunctionGetFields (qf, NULL , &qf_input_fields, NULL , &qf_output_fields));
662
702
663
703
// Get operator data
704
+ CeedCallBackend (CeedOperatorIsAtPoints (op, &is_at_points));
664
705
CeedCallBackend (CeedOperatorBuildKernelData_Cuda_gen (ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
665
706
qf_output_fields, &data->max_P_1d , &Q_1d, &dim, &is_tensor, &use_3d_slices));
666
707
if (dim == 0 ) dim = 1 ;
667
708
data->dim = dim;
668
709
if (Q_1d == 0 ) {
669
- CeedInt Q;
670
-
671
- CeedCallBackend (CeedOperatorGetNumQuadraturePoints (op, &Q));
672
- Q_1d = Q;
710
+ CeedCallBackend (CeedOperatorGetNumQuadraturePoints (op, &Q_1d));
673
711
}
674
712
data->Q_1d = Q_1d;
713
+ if (is_at_points) {
714
+ CeedElemRestriction rstr_points = NULL ;
715
+
716
+ CeedCallBackend (CeedOperatorAtPointsGetPoints (op, &rstr_points, NULL ));
717
+ CeedCallBackend (CeedElemRestrictionGetMaxPointsInElement (rstr_points, &max_num_points));
718
+ CeedCallBackend (CeedElemRestrictionDestroy (&rstr_points));
719
+ }
720
+ if (is_at_points) use_3d_slices = false ;
675
721
676
722
// Check for restriction only identity operator
677
723
{
@@ -705,6 +751,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
705
751
// TODO: Add non-tensor, AtPoints
706
752
code << " // Tensor basis source\n " ;
707
753
code << " #include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n " ;
754
+ code << " // AtPoints basis source\n " ;
755
+ code << " #include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n " ;
708
756
code << " // CodeGen operator source\n " ;
709
757
code << " #include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n " ;
710
758
@@ -746,7 +794,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
746
794
code << " // s_G_[in,out]_i: Gradient matrix, shared memory\n " ;
747
795
code << " // -----------------------------------------------------------------------------\n " ;
748
796
code << " extern \" C\" __global__ void " << operator_name
749
- << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n " ;
797
+ << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
798
+ " points) {\n " ;
750
799
751
800
// Scratch buffers
752
801
for (CeedInt i = 0 ; i < num_input_fields; i++) {
@@ -776,11 +825,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
776
825
// Initialize constants, and matrices B and G
777
826
code << " \n // Input field constants and basis data\n " ;
778
827
for (CeedInt i = 0 ; i < num_input_fields; i++) {
779
- CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true , use_3d_slices));
828
+ CeedCallBackend (
829
+ CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true , is_at_points, use_3d_slices));
780
830
}
781
831
code << " \n // Output field constants and basis data\n " ;
782
832
for (CeedInt i = 0 ; i < num_output_fields; i++) {
783
- CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false , use_3d_slices));
833
+ CeedCallBackend (
834
+ CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false , is_at_points, use_3d_slices));
784
835
}
785
836
786
837
// Loop over all elements
@@ -867,27 +918,29 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
867
918
868
919
// ---- Restriction
869
920
CeedCallBackend (CeedOperatorBuildKernelRestriction_Cuda_gen (code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
870
- Q_1d, true , use_3d_slices));
921
+ Q_1d, true , is_at_points, use_3d_slices));
871
922
872
923
// ---- Basis action
873
- CeedCallBackend (CeedOperatorBuildKernelBasis_Cuda_gen (code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true , use_3d_slices));
924
+ CeedCallBackend (
925
+ CeedOperatorBuildKernelBasis_Cuda_gen (code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true , is_at_points, use_3d_slices));
874
926
}
875
927
876
928
// -- Q function
877
929
CeedCallBackend (CeedOperatorBuildKernelQFunction_Cuda_gen (code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
878
- op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
930
+ op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, use_3d_slices));
879
931
880
932
// -- Output basis and restriction
881
933
code << " \n // -- Output field basis action and restrictions\n " ;
882
934
for (CeedInt i = 0 ; i < num_output_fields; i++) {
883
935
code << " // ---- Output field " << i << " \n " ;
884
936
885
937
// ---- Basis action
886
- CeedCallBackend (CeedOperatorBuildKernelBasis_Cuda_gen (code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false , use_3d_slices));
938
+ CeedCallBackend (CeedOperatorBuildKernelBasis_Cuda_gen (code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false , is_at_points,
939
+ use_3d_slices));
887
940
888
941
// ---- Restriction
889
- CeedCallBackend (
890
- CeedOperatorBuildKernelRestriction_Cuda_gen (code, data, i, dim, NULL , op_output_fields[i], qf_output_fields[i], Q_1d, false , use_3d_slices));
942
+ CeedCallBackend (CeedOperatorBuildKernelRestriction_Cuda_gen (code, data, i, dim, NULL , op_output_fields[i], qf_output_fields[i], Q_1d, false ,
943
+ is_at_points , use_3d_slices));
891
944
}
892
945
893
946
// Close loop and function
0 commit comments