@@ -124,7 +124,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
124
124
// Setup fields
125
125
// ------------------------------------------------------------------------------
126
126
static int CeedOperatorBuildKernelFieldData_Cuda_gen (std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
127
- CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
127
+ CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
128
+ bool use_3d_slices) {
128
129
std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
129
130
std::string P_name = " P_1d" + var_suffix, Q_name = " Q_1d" ;
130
131
std::string option_name = (is_input ? " inputs" : " outputs" );
@@ -163,16 +164,55 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
163
164
case CEED_EVAL_NONE:
164
165
break ;
165
166
case CEED_EVAL_INTERP:
166
- if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
167
- else data->B .outputs [i] = basis_data->d_interp_1d ;
167
+ if (is_at_points) {
168
+ // AtPoints
169
+ if (!basis_data->d_chebyshev_interp_1d ) {
170
+ CeedSize interp_bytes;
171
+ CeedScalar *chebyshev_interp_1d;
172
+
173
+ interp_bytes = P_1d * Q_1d * sizeof (CeedScalar);
174
+ CeedCallBackend (CeedCalloc (P_1d * Q_1d, &chebyshev_interp_1d));
175
+ CeedCallBackend (CeedBasisGetChebyshevInterp1D (basis, chebyshev_interp_1d));
176
+ CeedCallCuda (CeedBasisReturnCeed (basis), cudaMalloc ((void **)&basis_data->d_chebyshev_interp_1d , interp_bytes));
177
+ CeedCallCuda (CeedBasisReturnCeed (basis),
178
+ cudaMemcpy (basis_data->d_chebyshev_interp_1d , chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
179
+ CeedCallBackend (CeedFree (&chebyshev_interp_1d));
180
+ }
181
+ if (is_input) data->B .inputs [i] = basis_data->d_chebyshev_interp_1d ;
182
+ else data->B .outputs [i] = basis_data->d_chebyshev_interp_1d ;
183
+ } else {
184
+ // Standard quadrature
185
+ if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
186
+ else data->B .outputs [i] = basis_data->d_interp_1d ;
187
+ }
168
188
code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_1d * Q_1d << " ];\n " ;
169
189
code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
170
190
break ;
171
191
case CEED_EVAL_GRAD:
172
- if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
173
- else data->B .outputs [i] = basis_data->d_interp_1d ;
192
+ if (is_at_points) {
193
+ // AtPoints
194
+ if (!basis_data->d_chebyshev_interp_1d ) {
195
+ CeedSize interp_bytes;
196
+ CeedScalar *chebyshev_interp_1d;
197
+
198
+ interp_bytes = P_1d * Q_1d * sizeof (CeedScalar);
199
+ CeedCallBackend (CeedCalloc (P_1d * Q_1d, &chebyshev_interp_1d));
200
+ CeedCallBackend (CeedBasisGetChebyshevInterp1D (basis, chebyshev_interp_1d));
201
+ CeedCallCuda (CeedBasisReturnCeed (basis), cudaMalloc ((void **)&basis_data->d_chebyshev_interp_1d , interp_bytes));
202
+ CeedCallCuda (CeedBasisReturnCeed (basis),
203
+ cudaMemcpy (basis_data->d_chebyshev_interp_1d , chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
204
+ CeedCallBackend (CeedFree (&chebyshev_interp_1d));
205
+ }
206
+ if (is_input) data->B .inputs [i] = basis_data->d_chebyshev_interp_1d ;
207
+ else data->B .outputs [i] = basis_data->d_chebyshev_interp_1d ;
208
+ } else {
209
+ // Standard quadrature
210
+ if (is_input) data->B .inputs [i] = basis_data->d_interp_1d ;
211
+ else data->B .outputs [i] = basis_data->d_interp_1d ;
212
+ }
174
213
code << " __shared__ CeedScalar s_B" << var_suffix << " [" << P_1d * Q_1d << " ];\n " ;
175
214
code << " LoadMatrix<" << P_name << " , " << Q_name << " >(data, B." << option_name << " [" << i << " ], s_B" << var_suffix << " );\n " ;
215
+ if (is_at_points) break ; // No G mat for AtPoints
176
216
if (use_3d_slices) {
177
217
if (is_input) data->G .inputs [i] = basis_data->d_collo_grad_1d ;
178
218
else data->G .outputs [i] = basis_data->d_collo_grad_1d ;
@@ -208,7 +248,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
208
248
// ------------------------------------------------------------------------------
209
249
static int CeedOperatorBuildKernelRestriction_Cuda_gen (std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
210
250
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
211
- CeedInt Q_1d, bool is_input, bool use_3d_slices) {
251
+ CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
212
252
std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
213
253
std::string P_name = " P_1d" + var_suffix;
214
254
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -335,7 +375,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
335
375
// ------------------------------------------------------------------------------
336
376
static int CeedOperatorBuildKernelBasis_Cuda_gen (std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
337
377
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
338
- bool use_3d_slices) {
378
+ bool is_at_points, bool use_3d_slices) {
339
379
std::string var_suffix = (is_input ? " _in_" : " _out_" ) + std::to_string (i);
340
380
std::string P_name = " P_1d" + var_suffix, Q_name = " Q_1d" ;
341
381
CeedEvalMode eval_mode = CEED_EVAL_NONE;
@@ -438,7 +478,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
438
478
CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
439
479
CeedInt num_output_fields, CeedOperatorField *op_output_fields,
440
480
CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
441
- bool use_3d_slices) {
481
+ bool is_at_points, bool use_3d_slices) {
442
482
std::string Q_name = " Q_1d" ;
443
483
CeedEvalMode eval_mode = CEED_EVAL_NONE;
444
484
CeedElemRestriction elem_rstr;
@@ -653,9 +693,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
653
693
// Build single operator kernel
654
694
// ------------------------------------------------------------------------------
655
695
extern " C" int CeedOperatorBuildKernel_Cuda_gen (CeedOperator op) {
656
- bool is_tensor = true , use_3d_slices = false ;
696
+ bool is_tensor = true , is_at_points = false , use_3d_slices = false ;
657
697
Ceed ceed;
658
- CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1 ;
698
+ CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1 , max_num_points = 0 ;
659
699
CeedQFunctionField *qf_input_fields, *qf_output_fields;
660
700
CeedQFunction_Cuda_gen *qf_data;
661
701
CeedQFunction qf;
@@ -678,17 +718,23 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
678
718
CeedCallBackend (CeedQFunctionGetFields (qf, NULL , &qf_input_fields, NULL , &qf_output_fields));
679
719
680
720
// Get operator data
721
+ CeedCallBackend (CeedOperatorIsAtPoints (op, &is_at_points));
681
722
CeedCallBackend (CeedOperatorBuildKernelData_Cuda_gen (ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
682
723
qf_output_fields, &data->max_P_1d , &Q_1d, &dim, &is_tensor, &use_3d_slices));
683
724
if (dim == 0 ) dim = 1 ;
684
725
data->dim = dim;
685
726
if (Q_1d == 0 ) {
686
- CeedInt Q;
687
-
688
- CeedCallBackend (CeedOperatorGetNumQuadraturePoints (op, &Q));
689
- Q_1d = Q;
727
+ CeedCallBackend (CeedOperatorGetNumQuadraturePoints (op, &Q_1d));
690
728
}
691
729
data->Q_1d = Q_1d;
730
+ if (is_at_points) {
731
+ CeedElemRestriction rstr_points = NULL ;
732
+
733
+ CeedCallBackend (CeedOperatorAtPointsGetPoints (op, &rstr_points, NULL ));
734
+ CeedCallBackend (CeedElemRestrictionGetMaxPointsInElement (rstr_points, &max_num_points));
735
+ CeedCallBackend (CeedElemRestrictionDestroy (&rstr_points));
736
+ }
737
+ if (is_at_points) use_3d_slices = false ;
692
738
693
739
// Check for restriction only identity operator
694
740
{
@@ -722,6 +768,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
722
768
// TODO: Add non-tensor, AtPoints
723
769
code << " // Tensor basis source\n " ;
724
770
code << " #include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n " ;
771
+ code << " // AtPoints basis source\n " ;
772
+ code << " #include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n " ;
725
773
code << " // CodeGen operator source\n " ;
726
774
code << " #include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n " ;
727
775
@@ -763,7 +811,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
763
811
code << " // s_G_[in,out]_i: Gradient matrix, shared memory\n " ;
764
812
code << " // -----------------------------------------------------------------------------\n " ;
765
813
code << " extern \" C\" __global__ void " << operator_name
766
- << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n " ;
814
+ << " (CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
815
+ " points) {\n " ;
767
816
768
817
// Scratch buffers
769
818
for (CeedInt i = 0 ; i < num_input_fields; i++) {
@@ -793,11 +842,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
793
842
// Initialize constants, and matrices B and G
794
843
code << " \n // Input field constants and basis data\n " ;
795
844
for (CeedInt i = 0 ; i < num_input_fields; i++) {
796
- CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true , use_3d_slices));
845
+ CeedCallBackend (
846
+ CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true , is_at_points, use_3d_slices));
797
847
}
798
848
code << " \n // Output field constants and basis data\n " ;
799
849
for (CeedInt i = 0 ; i < num_output_fields; i++) {
800
- CeedCallBackend (CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false , use_3d_slices));
850
+ CeedCallBackend (
851
+ CeedOperatorBuildKernelFieldData_Cuda_gen (code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false , is_at_points, use_3d_slices));
801
852
}
802
853
803
854
// Loop over all elements
@@ -884,27 +935,29 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
884
935
885
936
// ---- Restriction
886
937
CeedCallBackend (CeedOperatorBuildKernelRestriction_Cuda_gen (code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
887
- Q_1d, true , use_3d_slices));
938
+ Q_1d, true , is_at_points, use_3d_slices));
888
939
889
940
// ---- Basis action
890
- CeedCallBackend (CeedOperatorBuildKernelBasis_Cuda_gen (code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true , use_3d_slices));
941
+ CeedCallBackend (
942
+ CeedOperatorBuildKernelBasis_Cuda_gen (code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true , is_at_points, use_3d_slices));
891
943
}
892
944
893
945
// -- Q function
894
946
CeedCallBackend (CeedOperatorBuildKernelQFunction_Cuda_gen (code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
895
- op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
947
+ op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, use_3d_slices));
896
948
897
949
// -- Output basis and restriction
898
950
code << " \n // -- Output field basis action and restrictions\n " ;
899
951
for (CeedInt i = 0 ; i < num_output_fields; i++) {
900
952
code << " // ---- Output field " << i << " \n " ;
901
953
902
954
// ---- Basis action
903
- CeedCallBackend (CeedOperatorBuildKernelBasis_Cuda_gen (code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false , use_3d_slices));
955
+ CeedCallBackend (CeedOperatorBuildKernelBasis_Cuda_gen (code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false , is_at_points,
956
+ use_3d_slices));
904
957
905
958
// ---- Restriction
906
- CeedCallBackend (
907
- CeedOperatorBuildKernelRestriction_Cuda_gen (code, data, i, dim, NULL , op_output_fields[i], qf_output_fields[i], Q_1d, false , use_3d_slices));
959
+ CeedCallBackend (CeedOperatorBuildKernelRestriction_Cuda_gen (code, data, i, dim, NULL , op_output_fields[i], qf_output_fields[i], Q_1d, false ,
960
+ is_at_points , use_3d_slices));
908
961
}
909
962
910
963
// Close loop and function
0 commit comments