Skip to content

Commit 5f954c1

Browse files
committed
gpu - template loop over points for basis action
1 parent 1b3d9bd commit 5f954c1

File tree

4 files changed

+256
-180
lines changed

4 files changed

+256
-180
lines changed

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,3 +446,123 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
446446
}
447447
}
448448
}
449+
450+
//------------------------------------------------------------------------------
451+
// Loops over points
452+
//------------------------------------------------------------------------------
453+
454+
//------------------------------------------------------------------------------
455+
// Interpolate to points
456+
//------------------------------------------------------------------------------
457+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
458+
inline __device__ void InterpAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
459+
const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
460+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
461+
462+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
463+
const CeedInt p = i % NUM_PTS;
464+
CeedScalar r_X[DIM];
465+
466+
for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
467+
if (DIM == 1) {
468+
InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
469+
} else if (DIM == 2) {
470+
InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
471+
} else if (DIM == 3) {
472+
InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
473+
}
474+
if (i < NUM_PTS) {
475+
for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
476+
}
477+
}
478+
}
479+
480+
//------------------------------------------------------------------------------
481+
// Interpolate from points
482+
//------------------------------------------------------------------------------
483+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
484+
inline __device__ void InterpTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
485+
const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
486+
CeedScalar *__restrict__ r_C) {
487+
// Clear register
488+
for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
489+
490+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
491+
492+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
493+
const CeedInt p = i % NUM_PTS;
494+
CeedScalar r_X[DIM];
495+
496+
for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
497+
for (CeedInt j = 0; j < NUM_COMP; j++) {
498+
if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
499+
else r_U[j] = 0.0;
500+
}
501+
if (BASIS_DIM == 1) {
502+
InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
503+
} else if (BASIS_DIM == 2) {
504+
InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
505+
} else if (BASIS_DIM == 3) {
506+
InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
507+
}
508+
}
509+
__syncthreads();
510+
}
511+
512+
//------------------------------------------------------------------------------
513+
// Gradient at points
514+
//------------------------------------------------------------------------------
515+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
516+
inline __device__ void GradAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
517+
const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
518+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
519+
520+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
521+
const CeedInt p = i % NUM_PTS;
522+
CeedScalar r_X[DIM];
523+
524+
for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
525+
if (DIM == 1) {
526+
GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
527+
} else if (DIM == 2) {
528+
GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
529+
} else if (DIM == 3) {
530+
GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
531+
}
532+
if (i < NUM_PTS) {
533+
for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
534+
}
535+
}
536+
}
537+
538+
//------------------------------------------------------------------------------
539+
// Grad from points
540+
//------------------------------------------------------------------------------
541+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
542+
inline __device__ void GradTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
543+
const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
544+
CeedScalar *__restrict__ r_C) {
545+
// Clear register
546+
for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
547+
548+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
549+
550+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
551+
const CeedInt p = i % NUM_PTS;
552+
CeedScalar r_X[DIM];
553+
554+
for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
555+
for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
556+
if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
557+
else r_U[j] = 0.0;
558+
}
559+
if (BASIS_DIM == 1) {
560+
GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
561+
} else if (BASIS_DIM == 2) {
562+
GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
563+
} else if (BASIS_DIM == 3) {
564+
GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
565+
}
566+
}
567+
__syncthreads();
568+
}

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h

Lines changed: 8 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -51,26 +51,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
5151
}
5252

5353
// Map to points
54-
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
55-
56-
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
57-
const CeedInt p = i % BASIS_NUM_PTS;
58-
CeedScalar r_X[BASIS_DIM];
59-
60-
for (CeedInt d = 0; d < BASIS_DIM; d++) {
61-
r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
62-
}
63-
if (BASIS_DIM == 1) {
64-
InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
65-
} else if (BASIS_DIM == 2) {
66-
InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
67-
} else if (BASIS_DIM == 3) {
68-
InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
69-
}
70-
for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
71-
if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
72-
}
73-
}
54+
InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
55+
&d_V[elem * BASIS_NUM_PTS]);
7456
}
7557
}
7658

@@ -92,32 +74,9 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
9274

9375
// Apply basis element by element
9476
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
95-
// Clear register
96-
for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
97-
9877
// Map from points
99-
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
100-
101-
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
102-
const CeedInt p = i % BASIS_NUM_PTS;
103-
CeedScalar r_X[BASIS_DIM];
104-
105-
for (CeedInt d = 0; d < BASIS_DIM; d++) {
106-
r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
107-
}
108-
for (CeedInt j = 0; j < BASIS_NUM_COMP; j++) {
109-
if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
110-
else r_U[j] = 0.0;
111-
}
112-
if (BASIS_DIM == 1) {
113-
InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
114-
} else if (BASIS_DIM == 2) {
115-
InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
116-
} else if (BASIS_DIM == 3) {
117-
InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
118-
}
119-
}
120-
__syncthreads();
78+
InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
79+
&d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
12180

12281
// Map from coefficients
12382
if (BASIS_DIM == 1) {
@@ -168,26 +127,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
168127
}
169128

170129
// Map to points
171-
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
172-
173-
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
174-
const CeedInt p = i % BASIS_NUM_PTS;
175-
CeedScalar r_X[BASIS_DIM];
176-
177-
for (CeedInt d = 0; d < BASIS_DIM; d++) {
178-
r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
179-
}
180-
if (BASIS_DIM == 1) {
181-
GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
182-
} else if (BASIS_DIM == 2) {
183-
GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
184-
} else if (BASIS_DIM == 3) {
185-
GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
186-
}
187-
for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
188-
if (i < BASIS_NUM_PTS) d_V[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i] = r_V[j];
189-
}
190-
}
130+
GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
131+
&d_V[elem * BASIS_NUM_PTS]);
191132
}
192133
}
193134

@@ -209,32 +150,9 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
209150

210151
// Apply basis element by element
211152
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
212-
// Clear register
213-
for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;
214-
215153
// Map from points
216-
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));
217-
218-
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
219-
const CeedInt p = i % BASIS_NUM_PTS;
220-
CeedScalar r_X[BASIS_DIM];
221-
222-
for (CeedInt d = 0; d < BASIS_DIM; d++) {
223-
r_X[d] = d_X[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p];
224-
}
225-
for (CeedInt j = 0; j < BASIS_NUM_COMP * BASIS_DIM; j++) {
226-
if (i < points_per_elem[elem]) r_U[j] = d_U[elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p];
227-
else r_U[j] = 0.0;
228-
}
229-
if (BASIS_DIM == 1) {
230-
GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
231-
} else if (BASIS_DIM == 2) {
232-
GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
233-
} else if (BASIS_DIM == 3) {
234-
GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
235-
}
236-
}
237-
__syncthreads();
154+
GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
155+
&d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
238156

239157
// Map from coefficients
240158
if (BASIS_DIM == 1) {

include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -446,3 +446,123 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI
446446
}
447447
}
448448
}
449+
450+
//------------------------------------------------------------------------------
451+
// Loops over points
452+
//------------------------------------------------------------------------------
453+
454+
//------------------------------------------------------------------------------
455+
// Interpolate to points
456+
//------------------------------------------------------------------------------
457+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
458+
inline __device__ void InterpAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
459+
const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
460+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
461+
462+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
463+
const CeedInt p = i % NUM_PTS;
464+
CeedScalar r_X[DIM];
465+
466+
for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
467+
if (DIM == 1) {
468+
InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
469+
} else if (DIM == 2) {
470+
InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
471+
} else if (DIM == 3) {
472+
InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
473+
}
474+
if (i < NUM_PTS) {
475+
for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
476+
}
477+
}
478+
}
479+
480+
//------------------------------------------------------------------------------
481+
// Interpolate from points
482+
//------------------------------------------------------------------------------
483+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
484+
inline __device__ void InterpTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
485+
const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
486+
CeedScalar *__restrict__ r_C) {
487+
// Clear register
488+
for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
489+
490+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
491+
492+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
493+
const CeedInt p = i % NUM_PTS;
494+
CeedScalar r_X[DIM];
495+
496+
for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
497+
for (CeedInt j = 0; j < NUM_COMP; j++) {
498+
if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
499+
else r_U[j] = 0.0;
500+
}
501+
if (BASIS_DIM == 1) {
502+
InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
503+
} else if (BASIS_DIM == 2) {
504+
InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
505+
} else if (BASIS_DIM == 3) {
506+
InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
507+
}
508+
}
509+
__syncthreads();
510+
}
511+
512+
//------------------------------------------------------------------------------
513+
// Gradient at points
514+
//------------------------------------------------------------------------------
515+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
516+
inline __device__ void GradAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
517+
const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
518+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
519+
520+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
521+
const CeedInt p = i % NUM_PTS;
522+
CeedScalar r_X[DIM];
523+
524+
for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
525+
if (DIM == 1) {
526+
GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
527+
} else if (DIM == 2) {
528+
GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
529+
} else if (DIM == 3) {
530+
GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
531+
}
532+
if (i < NUM_PTS) {
533+
for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
534+
}
535+
}
536+
}
537+
538+
//------------------------------------------------------------------------------
539+
// Grad from points
540+
//------------------------------------------------------------------------------
541+
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
542+
inline __device__ void GradTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem,
543+
const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
544+
CeedScalar *__restrict__ r_C) {
545+
// Clear register
546+
for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;
547+
548+
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));
549+
550+
for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
551+
const CeedInt p = i % NUM_PTS;
552+
CeedScalar r_X[DIM];
553+
554+
for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
555+
for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
556+
if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
557+
else r_U[j] = 0.0;
558+
}
559+
if (BASIS_DIM == 1) {
560+
GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
561+
} else if (BASIS_DIM == 2) {
562+
GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
563+
} else if (BASIS_DIM == 3) {
564+
GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
565+
}
566+
}
567+
__syncthreads();
568+
}

0 commit comments

Comments
 (0)