@@ -51,26 +51,8 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
51
51
}
52
52
53
53
// Map to points
54
- const CeedInt bound = (blockDim .x * blockDim .y ) * ceil (1.0 * BASIS_NUM_PTS / (blockDim .x * blockDim .y ));
55
-
56
- for (CeedInt i = threadIdx .x + threadIdx .y * blockDim .x ; i < bound ; i += blockDim .x * blockDim .y ) {
57
- const CeedInt p = i % BASIS_NUM_PTS ;
58
- CeedScalar r_X [BASIS_DIM ];
59
-
60
- for (CeedInt d = 0 ; d < BASIS_DIM ; d ++ ) {
61
- r_X [d ] = d_X [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p ];
62
- }
63
- if (BASIS_DIM == 1 ) {
64
- InterpAtPoints1d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_C , r_X , r_V );
65
- } else if (BASIS_DIM == 2 ) {
66
- InterpAtPoints2d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_C , r_X , r_V );
67
- } else if (BASIS_DIM == 3 ) {
68
- InterpAtPoints3d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_C , r_X , r_V );
69
- }
70
- for (CeedInt j = 0 ; j < BASIS_NUM_COMP ; j ++ ) {
71
- if (i < BASIS_NUM_PTS ) d_V [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i ] = r_V [j ];
72
- }
73
- }
54
+ InterpAtPoints < BASIS_DIM , BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , num_elem * BASIS_NUM_PTS , r_C , & d_X [elem * BASIS_NUM_PTS ], r_V ,
55
+ & d_V [elem * BASIS_NUM_PTS ]);
74
56
}
75
57
}
76
58
@@ -92,32 +74,9 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
92
74
93
75
// Apply basis element by element
94
76
for (CeedInt elem = blockIdx .x * blockDim .z + threadIdx .z ; elem < num_elem ; elem += gridDim .x * blockDim .z ) {
95
- // Clear register
96
- for (CeedInt i = 0 ; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1 ); i ++ ) r_C [i ] = 0.0 ;
97
-
98
77
// Map from points
99
- const CeedInt bound = (blockDim .x * blockDim .y ) * ceil (1.0 * BASIS_NUM_PTS / (blockDim .x * blockDim .y ));
100
-
101
- for (CeedInt i = threadIdx .x + threadIdx .y * blockDim .x ; i < bound ; i += blockDim .x * blockDim .y ) {
102
- const CeedInt p = i % BASIS_NUM_PTS ;
103
- CeedScalar r_X [BASIS_DIM ];
104
-
105
- for (CeedInt d = 0 ; d < BASIS_DIM ; d ++ ) {
106
- r_X [d ] = d_X [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p ];
107
- }
108
- for (CeedInt j = 0 ; j < BASIS_NUM_COMP ; j ++ ) {
109
- if (i < points_per_elem [elem ]) r_U [j ] = d_U [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p ];
110
- else r_U [j ] = 0.0 ;
111
- }
112
- if (BASIS_DIM == 1 ) {
113
- InterpTransposeAtPoints1d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_U , r_X , r_C );
114
- } else if (BASIS_DIM == 2 ) {
115
- InterpTransposeAtPoints2d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_U , r_X , r_C );
116
- } else if (BASIS_DIM == 3 ) {
117
- InterpTransposeAtPoints3d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_U , r_X , r_C );
118
- }
119
- }
120
- __syncthreads ();
78
+ InterpTransposeAtPoints < BASIS_DIM , BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , num_elem * BASIS_NUM_PTS , points_per_elem [elem ],
79
+ & d_U [elem * BASIS_NUM_PTS ], r_U , & d_X [elem * BASIS_NUM_PTS ], r_C );
121
80
122
81
// Map from coefficients
123
82
if (BASIS_DIM == 1 ) {
@@ -168,26 +127,8 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
168
127
}
169
128
170
129
// Map to points
171
- const CeedInt bound = (blockDim .x * blockDim .y ) * ceil (1.0 * BASIS_NUM_PTS / (blockDim .x * blockDim .y ));
172
-
173
- for (CeedInt i = threadIdx .x + threadIdx .y * blockDim .x ; i < bound ; i += blockDim .x * blockDim .y ) {
174
- const CeedInt p = i % BASIS_NUM_PTS ;
175
- CeedScalar r_X [BASIS_DIM ];
176
-
177
- for (CeedInt d = 0 ; d < BASIS_DIM ; d ++ ) {
178
- r_X [d ] = d_X [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p ];
179
- }
180
- if (BASIS_DIM == 1 ) {
181
- GradAtPoints1d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_C , r_X , r_V );
182
- } else if (BASIS_DIM == 2 ) {
183
- GradAtPoints2d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_C , r_X , r_V );
184
- } else if (BASIS_DIM == 3 ) {
185
- GradAtPoints3d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_C , r_X , r_V );
186
- }
187
- for (CeedInt j = 0 ; j < BASIS_NUM_COMP * BASIS_DIM ; j ++ ) {
188
- if (i < BASIS_NUM_PTS ) d_V [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + i ] = r_V [j ];
189
- }
190
- }
130
+ GradAtPoints < BASIS_DIM , BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , num_elem * BASIS_NUM_PTS , r_C , & d_X [elem * BASIS_NUM_PTS ], r_V ,
131
+ & d_V [elem * BASIS_NUM_PTS ]);
191
132
}
192
133
}
193
134
@@ -209,32 +150,9 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
209
150
210
151
// Apply basis element by element
211
152
for (CeedInt elem = blockIdx .x * blockDim .z + threadIdx .z ; elem < num_elem ; elem += gridDim .x * blockDim .z ) {
212
- // Clear register
213
- for (CeedInt i = 0 ; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1 ); i ++ ) r_C [i ] = 0.0 ;
214
-
215
153
// Map from points
216
- const CeedInt bound = (blockDim .x * blockDim .y ) * ceil (1.0 * BASIS_NUM_PTS / (blockDim .x * blockDim .y ));
217
-
218
- for (CeedInt i = threadIdx .x + threadIdx .y * blockDim .x ; i < bound ; i += blockDim .x * blockDim .y ) {
219
- const CeedInt p = i % BASIS_NUM_PTS ;
220
- CeedScalar r_X [BASIS_DIM ];
221
-
222
- for (CeedInt d = 0 ; d < BASIS_DIM ; d ++ ) {
223
- r_X [d ] = d_X [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * d + p ];
224
- }
225
- for (CeedInt j = 0 ; j < BASIS_NUM_COMP * BASIS_DIM ; j ++ ) {
226
- if (i < points_per_elem [elem ]) r_U [j ] = d_U [elem * BASIS_NUM_PTS + num_elem * BASIS_NUM_PTS * j + p ];
227
- else r_U [j ] = 0.0 ;
228
- }
229
- if (BASIS_DIM == 1 ) {
230
- GradTransposeAtPoints1d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_U , r_X , r_C );
231
- } else if (BASIS_DIM == 2 ) {
232
- GradTransposeAtPoints2d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_U , r_X , r_C );
233
- } else if (BASIS_DIM == 3 ) {
234
- GradTransposeAtPoints3d < BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , i , r_U , r_X , r_C );
235
- }
236
- }
237
- __syncthreads ();
154
+ GradTransposeAtPoints < BASIS_DIM , BASIS_NUM_COMP , BASIS_NUM_PTS , BASIS_Q_1D > (data , num_elem * BASIS_NUM_PTS , points_per_elem [elem ],
155
+ & d_U [elem * BASIS_NUM_PTS ], r_U , & d_X [elem * BASIS_NUM_PTS ], r_C );
238
156
239
157
// Map from coefficients
240
158
if (BASIS_DIM == 1 ) {
0 commit comments