diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h index 8671dc6423..49e7eca873 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-read-write-templates.h @@ -161,3 +161,43 @@ inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt } } } + +//------------------------------------------------------------------------------ +// AtPoints +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +// E-vector -> single point +//------------------------------------------------------------------------------ +template +inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, + const CeedScalar *__restrict__ d_u, CeedScalar *r_u) { + const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; + + if (p < points_in_elem) { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = d_u[ind + comp * strides_comp]; + } + } else { + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + r_u[comp] = 0.0; + } + } +} + +//------------------------------------------------------------------------------ +// Single point -> E-vector +//------------------------------------------------------------------------------ +template +inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem, + const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v, + CeedScalar *d_v) { + if (p < points_in_elem) { + const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem; + + for (CeedInt comp = 0; comp < NUM_COMP; comp++) { + d_v[ind + comp * strides_comp] = r_v[comp]; + } + } +} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h index 35681df470..acf35a0dd7 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h @@ -446,123 +446,3 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed } } } - -//------------------------------------------------------------------------------ -// Loops over points -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -// Interpolate to points -//------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, - const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - if (DIM == 1) { - InterpAtPoints1d(data, i, r_C, r_X, r_V); - } else if (DIM == 2) { - InterpAtPoints2d(data, i, r_C, r_X, r_V); - } else if (DIM == 3) { - InterpAtPoints3d(data, i, r_C, r_X, r_V); - } - if (i < NUM_PTS) { - for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j]; - } - } -} - -//------------------------------------------------------------------------------ -// Interpolate from points -//------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, - CeedScalar *__restrict__ r_C) { - // Clear register - for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0; - - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - for (CeedInt j = 0; j < NUM_COMP; j++) { - if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; - else r_U[j] = 0.0; - } - if (BASIS_DIM == 1) { - InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 2) { - InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 3) { - InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); - } - } - __syncthreads(); -} - -//------------------------------------------------------------------------------ -// Gradient at points -//------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, - const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - if (DIM == 1) { - GradAtPoints1d(data, i, r_C, r_X, r_V); - } else if (DIM == 2) { - GradAtPoints2d(data, i, r_C, r_X, r_V); - } else if (DIM == 3) { - GradAtPoints3d(data, i, r_C, r_X, r_V); - } - if (i < NUM_PTS) { - for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j]; - } - } -} - -//------------------------------------------------------------------------------ -// Grad from points -//------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, - CeedScalar *__restrict__ r_C) { - // Clear register - for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0; - - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - for (CeedInt j = 0; j < NUM_COMP * DIM; j++) { - if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; - else r_U[j] = 0.0; - } - if (BASIS_DIM == 1) { - GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 2) { - GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 3) { - GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); - } - } - __syncthreads(); -} diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h index cfc6899476..cd9021611a 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h @@ -31,6 +31,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP]; @@ -51,8 +52,21 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal } // Map to points - InterpAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, - &d_V[elem * BASIS_NUM_PTS]); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + InterpAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + InterpAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + InterpAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } } } @@ -68,15 +82,33 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + // Map from points - InterpTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], - &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); // Map from coefficients if (BASIS_DIM == 1) { @@ -107,6 +139,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; @@ -127,8 +160,21 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar } // Map to points - GradAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, - &d_V[elem * BASIS_NUM_PTS]); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + GradAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + GradAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + GradAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } } } @@ -144,15 +190,34 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + // Map from points - GradTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], - &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, + r_U); + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); // Map from coefficients if (BASIS_DIM == 1) { diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h index 7844810c2d..73d8cba91b 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h @@ -446,123 +446,3 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI } } } - -//------------------------------------------------------------------------------ -// Loops over points -//------------------------------------------------------------------------------ - -//------------------------------------------------------------------------------ -// Interpolate to points -//------------------------------------------------------------------------------ -template -inline __device__ void InterpAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, - const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - if (DIM == 1) { - InterpAtPoints1d(data, i, r_C, r_X, r_V); - } else if (DIM == 2) { - InterpAtPoints2d(data, i, r_C, r_X, r_V); - } else if (DIM == 3) { - InterpAtPoints3d(data, i, r_C, r_X, r_V); - } - if (i < NUM_PTS) { - for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j]; - } - } -} - -//------------------------------------------------------------------------------ -// Interpolate from points -//------------------------------------------------------------------------------ -template -inline __device__ void InterpTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, - CeedScalar *__restrict__ r_C) { - // Clear register - for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0; - - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - for (CeedInt j = 0; j < NUM_COMP; j++) { - if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; - else r_U[j] = 0.0; - } - if (BASIS_DIM == 1) { - InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 2) { - InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 3) { - InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); - } - } - __syncthreads(); -} - -//------------------------------------------------------------------------------ -// Gradient at points -//------------------------------------------------------------------------------ -template -inline __device__ void GradAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C, - const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) { - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - if (DIM == 1) { - GradAtPoints1d(data, i, r_C, r_X, r_V); - } else if (DIM == 2) { - GradAtPoints2d(data, i, r_C, r_X, r_V); - } else if (DIM == 3) { - GradAtPoints3d(data, i, r_C, r_X, r_V); - } - if (i < NUM_PTS) { - for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j]; - } - } -} - -//------------------------------------------------------------------------------ -// Grad from points -//------------------------------------------------------------------------------ -template -inline __device__ void GradTransposeAtPoints(SharedData_Hip &data, const CeedInt comp_stride, const CeedInt points_per_elem, - const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X, - CeedScalar *__restrict__ r_C) { - // Clear register - for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0; - - const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y)); - - for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) { - const CeedInt p = i % NUM_PTS; - CeedScalar r_X[DIM]; - - for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p]; - for (CeedInt j = 0; j < NUM_COMP * DIM; j++) { - if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p]; - else r_U[j] = 0.0; - } - if (BASIS_DIM == 1) { - GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 2) { - GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); - } else if (BASIS_DIM == 3) { - GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); - } - } - __syncthreads(); -} diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h index 355f53d0f4..9f5e947a07 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points.h @@ -37,6 +37,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP]; @@ -57,8 +58,21 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ } // Map to points - InterpAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, - &d_V[elem * BASIS_NUM_PTS]); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + InterpAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + InterpAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + InterpAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } } } @@ -79,15 +93,33 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + // Map from points - InterpTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], - &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U); + if (BASIS_DIM == 1) { + InterpTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + InterpTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + InterpTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); // Map from coefficients if (BASIS_DIM == 1) { @@ -124,6 +156,7 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM]; @@ -144,8 +177,21 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ } // Map to points - GradAtPoints(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V, - &d_V[elem * BASIS_NUM_PTS]); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + if (BASIS_DIM == 1) { + GradAtPoints1d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 2) { + GradAtPoints2d(data, i, r_C, r_X, r_V); + } else if (BASIS_DIM == 3) { + GradAtPoints3d(data, i, r_C, r_X, r_V); + } + WritePoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V); + } } } @@ -166,15 +212,34 @@ extern "C" __launch_bounds__(BASIS_INTERP_BLOCK_SIZE) __global__ data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1); + CeedScalar r_X[BASIS_DIM]; CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM]; CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)]; // Apply basis element by element for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) { + // Clear register + for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0; + // Map from points - GradTransposeAtPoints(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem], - &d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C); + const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y)); + + for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) { + const CeedInt p = i % BASIS_NUM_PTS; + + ReadPoint(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X); + ReadPoint(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, + r_U); + if (BASIS_DIM == 1) { + GradTransposeAtPoints1d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 2) { + GradTransposeAtPoints2d(data, i, r_U, r_X, r_C); + } else if (BASIS_DIM == 3) { + GradTransposeAtPoints3d(data, i, r_U, r_X, r_C); + } + } + __syncthreads(); // Map from coefficients if (BASIS_DIM == 1) {