@@ -64,7 +64,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
64
64
if (dim == 1 ) {
65
65
CeedInt elems_per_block = CeedIntMin (ceed_Cuda -> device_prop .maxThreadsDim [2 ], CeedIntMax (512 / thread_1d ,
66
66
1 )); // avoid >512 total threads
67
- CeedInt grid = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
67
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
68
68
CeedInt shared_mem = elems_per_block * thread_1d * sizeof (CeedScalar );
69
69
70
70
if (t_mode == CEED_TRANSPOSE ) {
@@ -77,7 +77,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
77
77
const CeedInt opt_elems [7 ] = {0 , 32 , 8 , 6 , 4 , 2 , 8 };
78
78
// elems_per_block must be at least 1
79
79
CeedInt elems_per_block = CeedIntMax (thread_1d < 7 ? opt_elems [thread_1d ] / num_comp : 1 , 1 );
80
- CeedInt grid = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
80
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
81
81
CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
82
82
83
83
if (t_mode == CEED_TRANSPOSE ) {
@@ -88,7 +88,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
88
88
}
89
89
} else if (dim == 3 ) {
90
90
CeedInt elems_per_block = 1 ;
91
- CeedInt grid = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
91
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
92
92
CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
93
93
94
94
if (t_mode == CEED_TRANSPOSE ) {
@@ -115,7 +115,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
115
115
if (dim == 1 ) {
116
116
CeedInt elems_per_block = CeedIntMin (ceed_Cuda -> device_prop .maxThreadsDim [2 ], CeedIntMax (512 / thread_1d ,
117
117
1 )); // avoid >512 total threads
118
- CeedInt grid = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
118
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
119
119
CeedInt shared_mem = elems_per_block * thread_1d * sizeof (CeedScalar );
120
120
121
121
if (t_mode == CEED_TRANSPOSE ) {
@@ -128,7 +128,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
128
128
const CeedInt opt_elems [7 ] = {0 , 32 , 8 , 6 , 4 , 2 , 8 };
129
129
// elems_per_block must be at least 1
130
130
CeedInt elems_per_block = CeedIntMax (thread_1d < 7 ? opt_elems [thread_1d ] / num_comp : 1 , 1 );
131
- CeedInt grid = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
131
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
132
132
CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
133
133
134
134
if (t_mode == CEED_TRANSPOSE ) {
@@ -139,7 +139,7 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
139
139
}
140
140
} else if (dim == 3 ) {
141
141
CeedInt elems_per_block = 1 ;
142
- CeedInt grid = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
142
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
143
143
CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
144
144
145
145
if (t_mode == CEED_TRANSPOSE ) {
@@ -159,19 +159,19 @@ static int CeedBasisApplyTensorCore_Cuda_shared(CeedBasis basis, bool apply_add,
159
159
void * weight_args [] = {(void * )& num_elem , (void * )& data -> d_q_weight_1d , & d_v };
160
160
if (dim == 1 ) {
161
161
const CeedInt elems_per_block = block_size / Q_1d ;
162
- const CeedInt grid_size = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
162
+ const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
163
163
164
164
CeedCallBackend (CeedRunKernelDim_Cuda (ceed , data -> Weight , grid_size , Q_1d , elems_per_block , 1 , weight_args ));
165
165
} else if (dim == 2 ) {
166
166
const CeedInt opt_elems = block_size / (Q_1d * Q_1d );
167
167
const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1 ;
168
- const CeedInt grid_size = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
168
+ const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
169
169
170
170
CeedCallBackend (CeedRunKernelDim_Cuda (ceed , data -> Weight , grid_size , Q_1d , Q_1d , elems_per_block , weight_args ));
171
171
} else if (dim == 3 ) {
172
172
const CeedInt opt_elems = block_size / (Q_1d * Q_1d );
173
173
const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1 ;
174
- const CeedInt grid_size = num_elem / elems_per_block + (( num_elem / elems_per_block * elems_per_block < num_elem ) ? 1 : 0 );
174
+ const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
175
175
176
176
CeedCallBackend (CeedRunKernelDim_Cuda (ceed , data -> Weight , grid_size , Q_1d , Q_1d , elems_per_block , weight_args ));
177
177
}
@@ -211,16 +211,17 @@ static int CeedBasisApplyAddTensor_Cuda_shared(CeedBasis basis, const CeedInt nu
211
211
static int CeedBasisApplyAtPointsCore_Cuda_shared (CeedBasis basis , bool apply_add , const CeedInt num_elem , const CeedInt * num_points ,
212
212
CeedTransposeMode t_mode , CeedEvalMode eval_mode , CeedVector x_ref , CeedVector u , CeedVector v ) {
213
213
Ceed ceed ;
214
- CeedInt Q_1d , dim , max_num_points = num_points [ 0 ] ;
215
- const CeedInt is_transpose = t_mode == CEED_TRANSPOSE ;
216
- const int max_block_size = 32 ;
214
+ Ceed_Cuda * ceed_Cuda ;
215
+ CeedInt Q_1d , dim , num_comp , max_num_points = num_points [ 0 ] ;
216
+ const CeedInt is_transpose = t_mode == CEED_TRANSPOSE ;
217
217
const CeedScalar * d_x , * d_u ;
218
218
CeedScalar * d_v ;
219
219
CeedBasis_Cuda_shared * data ;
220
220
221
221
CeedCallBackend (CeedBasisGetData (basis , & data ));
222
222
CeedCallBackend (CeedBasisGetNumQuadraturePoints1D (basis , & Q_1d ));
223
223
CeedCallBackend (CeedBasisGetDimension (basis , & dim ));
224
+ CeedCallBackend (CeedBasisGetNumComponents (basis , & num_comp ));
224
225
225
226
// Weight handled separately
226
227
if (eval_mode == CEED_EVAL_WEIGHT ) {
@@ -229,14 +230,13 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
229
230
}
230
231
231
232
CeedCallBackend (CeedBasisGetCeed (basis , & ceed ));
233
+ CeedCallBackend (CeedGetData (ceed , & ceed_Cuda ));
232
234
233
235
// Check padded to uniform number of points per elem
234
236
for (CeedInt i = 1 ; i < num_elem ; i ++ ) max_num_points = CeedIntMax (max_num_points , num_points [i ]);
235
237
{
236
- CeedInt num_comp , q_comp ;
238
+ CeedInt q_comp ;
237
239
CeedSize len , len_required ;
238
-
239
- CeedCallBackend (CeedBasisGetNumComponents (basis , & num_comp ));
240
240
CeedCallBackend (CeedBasisGetNumQuadratureComponents (basis , eval_mode , & q_comp ));
241
241
CeedCallBackend (CeedVectorGetLength (is_transpose ? u : v , & len ));
242
242
len_required = (CeedSize )num_comp * (CeedSize )q_comp * (CeedSize )num_elem * (CeedSize )max_num_points ;
@@ -285,15 +285,14 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
285
285
}
286
286
287
287
// -- Compile kernels
288
- const char basis_kernel_source [] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-ref -basis-tensor-at-points.h>\n" ;
288
+ const char basis_kernel_source [] = "// AtPoints basis source\n#include <ceed/jit-source/cuda/cuda-shared -basis-tensor-at-points.h>\n" ;
289
289
CeedInt num_comp ;
290
290
291
291
if (data -> moduleAtPoints ) CeedCallCuda (ceed , cuModuleUnload (data -> moduleAtPoints ));
292
292
CeedCallBackend (CeedBasisGetNumComponents (basis , & num_comp ));
293
- CeedCallBackend (CeedCompile_Cuda (ceed , basis_kernel_source , & data -> moduleAtPoints , 9 , "BASIS_Q_1D" , Q_1d , "BASIS_P_1D" , P_1d , "BASIS_BUF_LEN" ,
294
- Q_1d * CeedIntPow (Q_1d > P_1d ? Q_1d : P_1d , dim - 1 ), "BASIS_DIM" , dim , "BASIS_NUM_COMP" , num_comp ,
295
- "BASIS_NUM_NODES" , CeedIntPow (P_1d , dim ), "BASIS_NUM_QPTS" , CeedIntPow (Q_1d , dim ), "BASIS_NUM_PTS" ,
296
- max_num_points , "POINTS_BUFF_LEN" , CeedIntPow (Q_1d , dim - 1 )));
293
+ CeedCallBackend (CeedCompile_Cuda (ceed , basis_kernel_source , & data -> moduleAtPoints , 8 , "BASIS_Q_1D" , Q_1d , "BASIS_P_1D" , P_1d , "T_1D" ,
294
+ CeedIntMax (Q_1d , P_1d ), "BASIS_DIM" , dim , "BASIS_NUM_COMP" , num_comp , "BASIS_NUM_NODES" , CeedIntPow (P_1d , dim ),
295
+ "BASIS_NUM_QPTS" , CeedIntPow (Q_1d , dim ), "BASIS_NUM_PTS" , max_num_points ));
297
296
CeedCallBackend (CeedGetKernel_Cuda (ceed , data -> moduleAtPoints , "InterpAtPoints" , & data -> InterpAtPoints ));
298
297
CeedCallBackend (CeedGetKernel_Cuda (ceed , data -> moduleAtPoints , "InterpTransposeAtPoints" , & data -> InterpTransposeAtPoints ));
299
298
CeedCallBackend (CeedGetKernel_Cuda (ceed , data -> moduleAtPoints , "GradAtPoints" , & data -> GradAtPoints ));
@@ -323,17 +322,76 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
323
322
// Basis action
324
323
switch (eval_mode ) {
325
324
case CEED_EVAL_INTERP : {
326
- void * interp_args [] = {(void * )& num_elem , & data -> d_chebyshev_interp_1d , & data -> d_points_per_elem , & d_x , & d_u , & d_v };
327
- const CeedInt block_size = CeedIntMin (CeedIntPow (Q_1d , dim ), max_block_size );
325
+ CeedInt P_1d , Q_1d ;
328
326
329
- CeedCallBackend (
330
- CeedRunKernel_Cuda (ceed , is_transpose ? data -> InterpTransposeAtPoints : data -> InterpAtPoints , num_elem , block_size , interp_args ));
327
+ CeedCallBackend (CeedBasisGetNumNodes1D (basis , & P_1d ));
328
+ CeedCallBackend (CeedBasisGetNumQuadraturePoints1D (basis , & Q_1d ));
329
+ CeedInt thread_1d = CeedIntMax (Q_1d , P_1d );
330
+
331
+ CeedCallBackend (CeedInit_CudaInterp (data -> d_chebyshev_interp_1d , P_1d , Q_1d , & data -> c_B ));
332
+ void * interp_args [] = {(void * )& num_elem , & data -> c_B , & data -> d_points_per_elem , & d_x , & d_u , & d_v };
333
+
334
+ if (dim == 1 ) {
335
+ CeedInt elems_per_block = CeedIntMin (ceed_Cuda -> device_prop .maxThreadsDim [2 ], CeedIntMax (512 / thread_1d ,
336
+ 1 )); // avoid >512 total threads
337
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
338
+ CeedInt shared_mem = elems_per_block * thread_1d * sizeof (CeedScalar );
339
+
340
+ CeedCallBackend (CeedRunKernelDimShared_Cuda (ceed , is_transpose ? data -> InterpTransposeAtPoints : data -> InterpAtPoints , grid , thread_1d , 1 ,
341
+ elems_per_block , shared_mem , interp_args ));
342
+ } else if (dim == 2 ) {
343
+ const CeedInt opt_elems [7 ] = {0 , 32 , 8 , 6 , 4 , 2 , 8 };
344
+ // elems_per_block must be at least 1
345
+ CeedInt elems_per_block = CeedIntMax (thread_1d < 7 ? opt_elems [thread_1d ] / num_comp : 1 , 1 );
346
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
347
+ CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
348
+
349
+ CeedCallBackend (CeedRunKernelDimShared_Cuda (ceed , is_transpose ? data -> InterpTransposeAtPoints : data -> InterpAtPoints , grid , thread_1d ,
350
+ thread_1d , elems_per_block , shared_mem , interp_args ));
351
+ } else if (dim == 3 ) {
352
+ CeedInt elems_per_block = 1 ;
353
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
354
+ CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
355
+
356
+ CeedCallBackend (CeedRunKernelDimShared_Cuda (ceed , is_transpose ? data -> InterpTransposeAtPoints : data -> InterpAtPoints , grid , thread_1d ,
357
+ thread_1d , elems_per_block , shared_mem , interp_args ));
358
+ }
331
359
} break ;
332
360
case CEED_EVAL_GRAD : {
333
- void * grad_args [] = {(void * )& num_elem , & data -> d_chebyshev_interp_1d , & data -> d_points_per_elem , & d_x , & d_u , & d_v };
334
- const CeedInt block_size = CeedIntMin (CeedIntPow (Q_1d , dim ), max_block_size );
361
+ CeedInt P_1d , Q_1d ;
362
+
363
+ CeedCallBackend (CeedBasisGetNumNodes1D (basis , & P_1d ));
364
+ CeedCallBackend (CeedBasisGetNumQuadraturePoints1D (basis , & Q_1d ));
365
+ CeedInt thread_1d = CeedIntMax (Q_1d , P_1d );
366
+
367
+ CeedCallBackend (CeedInit_CudaInterp (data -> d_chebyshev_interp_1d , P_1d , Q_1d , & data -> c_B ));
368
+ void * grad_args [] = {(void * )& num_elem , & data -> d_chebyshev_interp_1d , & data -> d_points_per_elem , & d_x , & d_u , & d_v };
369
+
370
+ if (dim == 1 ) {
371
+ CeedInt elems_per_block = CeedIntMin (ceed_Cuda -> device_prop .maxThreadsDim [2 ], CeedIntMax (512 / thread_1d ,
372
+ 1 )); // avoid >512 total threads
373
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
374
+ CeedInt shared_mem = elems_per_block * thread_1d * sizeof (CeedScalar );
375
+
376
+ CeedCallBackend (CeedRunKernelDimShared_Cuda (ceed , is_transpose ? data -> GradTransposeAtPoints : data -> GradAtPoints , grid , thread_1d , 1 ,
377
+ elems_per_block , shared_mem , grad_args ));
378
+ } else if (dim == 2 ) {
379
+ const CeedInt opt_elems [7 ] = {0 , 32 , 8 , 6 , 4 , 2 , 8 };
380
+ // elems_per_block must be at least 1
381
+ CeedInt elems_per_block = CeedIntMax (thread_1d < 7 ? opt_elems [thread_1d ] / num_comp : 1 , 1 );
382
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
383
+ CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
384
+
385
+ CeedCallBackend (CeedRunKernelDimShared_Cuda (ceed , is_transpose ? data -> GradTransposeAtPoints : data -> GradAtPoints , grid , thread_1d , thread_1d ,
386
+ elems_per_block , shared_mem , grad_args ));
387
+ } else if (dim == 3 ) {
388
+ CeedInt elems_per_block = 1 ;
389
+ CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0 );
390
+ CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof (CeedScalar );
335
391
336
- CeedCallBackend (CeedRunKernel_Cuda (ceed , is_transpose ? data -> GradTransposeAtPoints : data -> GradAtPoints , num_elem , block_size , grad_args ));
392
+ CeedCallBackend (CeedRunKernelDimShared_Cuda (ceed , is_transpose ? data -> GradTransposeAtPoints : data -> GradAtPoints , grid , thread_1d , thread_1d ,
393
+ elems_per_block , shared_mem , grad_args ));
394
+ }
337
395
} break ;
338
396
case CEED_EVAL_WEIGHT :
339
397
case CEED_EVAL_NONE : /* handled separately below */
0 commit comments