@@ -179,13 +179,14 @@ inline __device__ void WriteLVecStrided2d(SharedData_Cuda &data, const CeedInt e
179
179
// Gather one element's nodal values from d_u into r_u, looking up each node through the indices array.
// Only threads with t_id_x < P_1d and t_id_y < P_1d take part; each such thread visits the P_1d nodes
// that share its (x, y) position and stores component c of node z at r_u[z + c * P_1d].
// Component c of a node is read COMP_STRIDE * c entries past the node's looked-up offset in d_u.
// num_nodes is not read in this function.
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                          const CeedScalar *__restrict__ d_u, CeedScalar *__restrict__ r_u) {
  // Threads outside the P_1d x P_1d footprint have nothing to load
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Position of this element's first entry in the indices array
  const CeedInt elem_start = elem * P_1d * P_1d * P_1d;

  for (CeedInt layer = 0; layer < P_1d; layer++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d + layer * P_1d * P_1d;
    const CeedInt offset     = indices[local_node + elem_start];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[layer + c * P_1d] = d_u[offset + COMP_STRIDE * c];
    }
  }
}
190
191
191
192
// ------------------------------------------------------------------------------
@@ -194,13 +195,14 @@ inline __device__ void ReadLVecStandard3d(SharedData_Cuda &data, const CeedInt n
194
195
// Load one element's nodal values from d_u into r_u using compile-time strides instead of an index array.
// The entry for (node, comp, elem) is read from d_u[node * STRIDES_NODE + comp * STRIDES_COMP + elem * STRIDES_ELEM].
// Only threads with t_id_x < P_1d and t_id_y < P_1d take part; each such thread visits the P_1d nodes
// that share its (x, y) position and stores component c of node z at r_u[z + c * P_1d].
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void ReadLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ d_u,
                                         CeedScalar *__restrict__ r_u) {
  // Threads outside the P_1d x P_1d footprint have nothing to load
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Contribution of the element number to every offset computed below
  const CeedInt elem_shift = elem * STRIDES_ELEM;

  for (CeedInt layer = 0; layer < P_1d; layer++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d + layer * P_1d * P_1d;
    const CeedInt offset     = local_node * STRIDES_NODE + elem_shift;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      r_u[layer + c * P_1d] = d_u[offset + c * STRIDES_COMP];
    }
  }
}
205
207
206
208
// ------------------------------------------------------------------------------
@@ -238,13 +240,14 @@ inline __device__ void ReadEVecSliceStrided3d(SharedData_Cuda &data, const CeedI
238
240
// Accumulate one element's values from r_v into d_v, looking up each node through the indices array.
// Only threads with t_id_x < P_1d and t_id_y < P_1d take part; each such thread visits the P_1d nodes
// that share its (x, y) position and adds r_v[z + c * P_1d] to component c of node z.
// The addition goes through atomicAdd, so concurrent contributions to the same d_v entry are summed.
// num_nodes is not read in this function.
template <int NUM_COMP, int COMP_STRIDE, int P_1d>
inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt num_nodes, const CeedInt elem, const CeedInt *__restrict__ indices,
                                           const CeedScalar *__restrict__ r_v, CeedScalar *__restrict__ d_v) {
  // Threads outside the P_1d x P_1d footprint have nothing to store
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Position of this element's first entry in the indices array
  const CeedInt elem_start = elem * P_1d * P_1d * P_1d;

  for (CeedInt layer = 0; layer < P_1d; layer++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d + layer * P_1d * P_1d;
    const CeedInt offset     = indices[local_node + elem_start];

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      atomicAdd(&d_v[offset + COMP_STRIDE * c], r_v[layer + c * P_1d]);
    }
  }
}
249
252
250
253
// ------------------------------------------------------------------------------
@@ -253,13 +256,14 @@ inline __device__ void WriteLVecStandard3d(SharedData_Cuda &data, const CeedInt
253
256
// Accumulate one element's values from r_v into d_v using compile-time strides instead of an index array.
// The entry for (node, comp, elem) lives at d_v[node * STRIDES_NODE + comp * STRIDES_COMP + elem * STRIDES_ELEM].
// Only threads with t_id_x < P_1d and t_id_y < P_1d take part; each such thread visits the P_1d nodes
// that share its (x, y) position and adds r_v[z + c * P_1d] to component c of node z.
// The update is a plain (non-atomic) += on d_v.
template <int NUM_COMP, int P_1d, int STRIDES_NODE, int STRIDES_COMP, int STRIDES_ELEM>
inline __device__ void WriteLVecStrided3d(SharedData_Cuda &data, const CeedInt elem, const CeedScalar *__restrict__ r_v,
                                          CeedScalar *__restrict__ d_v) {
  // Threads outside the P_1d x P_1d footprint have nothing to store
  if (data.t_id_x >= P_1d || data.t_id_y >= P_1d) return;

  // Contribution of the element number to every offset computed below
  const CeedInt elem_shift = elem * STRIDES_ELEM;

  for (CeedInt layer = 0; layer < P_1d; layer++) {
    const CeedInt local_node = data.t_id_x + data.t_id_y * P_1d + layer * P_1d * P_1d;
    const CeedInt offset     = local_node * STRIDES_NODE + elem_shift;

    for (CeedInt c = 0; c < NUM_COMP; c++) {
      d_v[offset + c * STRIDES_COMP] += r_v[layer + c * P_1d];
    }
  }
}
264
268
265
269
// ------------------------------------------------------------------------------
@@ -274,15 +278,19 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
274
278
__syncthreads ();
275
279
// X derivative
276
280
r_V[comp + 0 * NUM_COMP] = 0.0 ;
277
- for (CeedInt i = 0 ; i < Q_1d; i++)
278
- r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice [i + data.t_id_y * T_1D]; // Contract x direction (X derivative)
281
+ for (CeedInt i = 0 ; i < Q_1d; i++) {
282
+ r_V[comp + 0 * NUM_COMP] += c_G[i + data.t_id_x * Q_1d] * data.slice [i + data.t_id_y * T_1D];
283
+ }
279
284
// Y derivative
280
285
r_V[comp + 1 * NUM_COMP] = 0.0 ;
281
- for (CeedInt i = 0 ; i < Q_1d; i++)
282
- r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice [data.t_id_x + i * T_1D]; // Contract y direction (Y derivative)
286
+ for (CeedInt i = 0 ; i < Q_1d; i++) {
287
+ r_V[comp + 1 * NUM_COMP] += c_G[i + data.t_id_y * Q_1d] * data.slice [data.t_id_x + i * T_1D];
288
+ }
283
289
// Z derivative
284
290
r_V[comp + 2 * NUM_COMP] = 0.0 ;
285
- for (CeedInt i = 0 ; i < Q_1d; i++) r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d]; // Contract z direction (Z derivative)
291
+ for (CeedInt i = 0 ; i < Q_1d; i++) {
292
+ r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1d] * r_U[i + comp * Q_1d];
293
+ }
286
294
__syncthreads ();
287
295
}
288
296
}
@@ -296,21 +304,24 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const Ce
296
304
CeedScalar *__restrict__ r_V) {
297
305
if (data.t_id_x < Q_1d && data.t_id_y < Q_1d) {
298
306
for (CeedInt comp = 0 ; comp < NUM_COMP; comp++) {
299
- // X derivative
300
307
data.slice [data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
301
308
__syncthreads ();
302
- for (CeedInt i = 0 ; i < Q_1d; i++)
303
- r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice [i + data.t_id_y * T_1D]; // Contract x direction (X derivative)
309
+ // X derivative
310
+ for (CeedInt i = 0 ; i < Q_1d; i++) {
311
+ r_V[q + comp * Q_1d] += c_G[data.t_id_x + i * Q_1d] * data.slice [i + data.t_id_y * T_1D];
312
+ }
304
313
__syncthreads ();
305
314
// Y derivative
306
315
data.slice [data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
307
316
__syncthreads ();
308
- for (CeedInt i = 0 ; i < Q_1d; i++)
309
- r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice [data.t_id_x + i * T_1D]; // Contract y direction (Y derivative)
317
+ for (CeedInt i = 0 ; i < Q_1d; i++) {
318
+ r_V[q + comp * Q_1d] += c_G[data.t_id_y + i * Q_1d] * data.slice [data.t_id_x + i * T_1D];
319
+ }
310
320
__syncthreads ();
311
321
// Z derivative
312
- for (CeedInt i = 0 ; i < Q_1d; i++)
313
- r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP]; // PARTIAL contract z direction (Z derivative)
322
+ for (CeedInt i = 0 ; i < Q_1d; i++) {
323
+ r_V[i + comp * Q_1d] += c_G[i + q * Q_1d] * r_U[comp + 2 * NUM_COMP];
324
+ }
314
325
}
315
326
}
316
327
}
0 commit comments