Skip to content

Commit be8d6f5

Browse files
authored
Merge pull request #1710 from CEED/jeremy/split-at-points
Split AtPoints basis kernels into transpose and non-transpose variants
2 parents bc3a688 + 81ae615 commit be8d6f5

10 files changed

+586
-470
lines changed

backends/cuda-ref/ceed-cuda-ref-basis.c

+9-6
Original file line number | Diff line number | Diff line change
@@ -194,7 +194,9 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
194194
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
195195
max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
196196
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
197+
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
197198
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
199+
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
198200
}
199201

200202
// Get read/write access to u, v
@@ -220,16 +222,17 @@ static int CeedBasisApplyAtPointsCore_Cuda(CeedBasis basis, bool apply_add, cons
220222
// Basis action
221223
switch (eval_mode) {
222224
case CEED_EVAL_INTERP: {
223-
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
224-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
225+
void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
226+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
225227

226-
CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
228+
CeedCallBackend(
229+
CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
227230
} break;
228231
case CEED_EVAL_GRAD: {
229-
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
230-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
232+
void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
233+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
231234

232-
CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
235+
CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
233236
} break;
234237
case CEED_EVAL_WEIGHT:
235238
case CEED_EVAL_NONE: /* handled separately below */

backends/cuda-ref/ceed-cuda-ref.h

+2
Original file line number | Diff line number | Diff line change
@@ -70,7 +70,9 @@ typedef struct {
7070
CUmodule moduleAtPoints;
7171
CeedInt num_points;
7272
CUfunction InterpAtPoints;
73+
CUfunction InterpTransposeAtPoints;
7374
CUfunction GradAtPoints;
75+
CUfunction GradTransposeAtPoints;
7476
CeedScalar *d_interp_1d;
7577
CeedScalar *d_grad_1d;
7678
CeedScalar *d_q_weight_1d;

backends/cuda-shared/ceed-cuda-shared-basis.c

+9-6
Original file line number | Diff line number | Diff line change
@@ -295,7 +295,9 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
295295
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
296296
max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
297297
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
298+
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
298299
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
300+
CeedCallBackend(CeedGetKernel_Cuda(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
299301
}
300302

301303
// Get read/write access to u, v
@@ -321,16 +323,17 @@ static int CeedBasisApplyAtPointsCore_Cuda_shared(CeedBasis basis, bool apply_ad
321323
// Basis action
322324
switch (eval_mode) {
323325
case CEED_EVAL_INTERP: {
324-
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
325-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
326+
void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
327+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
326328

327-
CeedCallBackend(CeedRunKernel_Cuda(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
329+
CeedCallBackend(
330+
CeedRunKernel_Cuda(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
328331
} break;
329332
case CEED_EVAL_GRAD: {
330-
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
331-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
333+
void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
334+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
332335

333-
CeedCallBackend(CeedRunKernel_Cuda(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
336+
CeedCallBackend(CeedRunKernel_Cuda(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
334337
} break;
335338
case CEED_EVAL_WEIGHT:
336339
case CEED_EVAL_NONE: /* handled separately below */

backends/cuda-shared/ceed-cuda-shared.h

+2
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,9 @@ typedef struct {
2222
CUmodule moduleAtPoints;
2323
CeedInt num_points;
2424
CUfunction InterpAtPoints;
25+
CUfunction InterpTransposeAtPoints;
2526
CUfunction GradAtPoints;
27+
CUfunction GradTransposeAtPoints;
2628
CeedScalar *d_interp_1d;
2729
CeedScalar *d_grad_1d;
2830
CeedScalar *d_collo_grad_1d;

backends/hip-ref/ceed-hip-ref-basis.c

+9-6
Original file line number | Diff line number | Diff line change
@@ -192,7 +192,9 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
192192
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
193193
max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
194194
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
195+
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
195196
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
197+
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
196198
}
197199

198200
// Get read/write access to u, v
@@ -218,16 +220,17 @@ static int CeedBasisApplyAtPointsCore_Hip(CeedBasis basis, bool apply_add, const
218220
// Basis action
219221
switch (eval_mode) {
220222
case CEED_EVAL_INTERP: {
221-
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
222-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
223+
void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
224+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
223225

224-
CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
226+
CeedCallBackend(
227+
CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
225228
} break;
226229
case CEED_EVAL_GRAD: {
227-
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
228-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
230+
void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
231+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
229232

230-
CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
233+
CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
231234
} break;
232235
case CEED_EVAL_WEIGHT:
233236
case CEED_EVAL_NONE: /* handled separately below */

backends/hip-ref/ceed-hip-ref.h

+2
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,9 @@ typedef struct {
7474
hipModule_t moduleAtPoints;
7575
CeedInt num_points;
7676
hipFunction_t InterpAtPoints;
77+
hipFunction_t InterpTransposeAtPoints;
7778
hipFunction_t GradAtPoints;
79+
hipFunction_t GradTransposeAtPoints;
7880
CeedScalar *d_interp_1d;
7981
CeedScalar *d_grad_1d;
8082
CeedScalar *d_q_weight_1d;

backends/hip-shared/ceed-hip-shared-basis.c

+9-6
Original file line number | Diff line number | Diff line change
@@ -354,7 +354,9 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
354354
"BASIS_NUM_NODES", CeedIntPow(P_1d, dim), "BASIS_NUM_QPTS", CeedIntPow(Q_1d, dim), "BASIS_NUM_PTS",
355355
max_num_points, "POINTS_BUFF_LEN", CeedIntPow(Q_1d, dim - 1)));
356356
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpAtPoints", &data->InterpAtPoints));
357+
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "InterpTransposeAtPoints", &data->InterpTransposeAtPoints));
357358
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradAtPoints", &data->GradAtPoints));
359+
CeedCallBackend(CeedGetKernel_Hip(ceed, data->moduleAtPoints, "GradTransposeAtPoints", &data->GradTransposeAtPoints));
358360
}
359361

360362
// Get read/write access to u, v
@@ -380,16 +382,17 @@ static int CeedBasisApplyAtPointsCore_Hip_shared(CeedBasis basis, bool apply_add
380382
// Basis action
381383
switch (eval_mode) {
382384
case CEED_EVAL_INTERP: {
383-
void *interp_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
384-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
385+
void *interp_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
386+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
385387

386-
CeedCallBackend(CeedRunKernel_Hip(ceed, data->InterpAtPoints, num_elem, block_size, interp_args));
388+
CeedCallBackend(
389+
CeedRunKernel_Hip(ceed, is_transpose ? data->InterpTransposeAtPoints : data->InterpAtPoints, num_elem, block_size, interp_args));
387390
} break;
388391
case CEED_EVAL_GRAD: {
389-
void *grad_args[] = {(void *)&num_elem, (void *)&is_transpose, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
390-
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
392+
void *grad_args[] = {(void *)&num_elem, &data->d_chebyshev_interp_1d, &data->d_points_per_elem, &d_x, &d_u, &d_v};
393+
const CeedInt block_size = CeedIntMin(CeedIntPow(Q_1d, dim), max_block_size);
391394

392-
CeedCallBackend(CeedRunKernel_Hip(ceed, data->GradAtPoints, num_elem, block_size, grad_args));
395+
CeedCallBackend(CeedRunKernel_Hip(ceed, is_transpose ? data->GradTransposeAtPoints : data->GradAtPoints, num_elem, block_size, grad_args));
393396
} break;
394397
case CEED_EVAL_WEIGHT:
395398
case CEED_EVAL_NONE: /* handled separately below */

backends/hip-shared/ceed-hip-shared.h

+2
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,9 @@ typedef struct {
2222
hipModule_t moduleAtPoints;
2323
CeedInt num_points;
2424
hipFunction_t InterpAtPoints;
25+
hipFunction_t InterpTransposeAtPoints;
2526
hipFunction_t GradAtPoints;
27+
hipFunction_t GradTransposeAtPoints;
2628
CeedInt block_sizes[3]; // interp, grad, weight thread block sizes
2729
CeedScalar *d_interp_1d;
2830
CeedScalar *d_grad_1d;

0 commit comments

Comments
 (0)