Skip to content

Commit

Permalink
shared - AtPoints template changes for gen
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Dec 9, 2024
1 parent e0b3ede commit 889b824
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 256 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,43 @@ inline __device__ void SumElementStrided3d(SharedData_Cuda &data, const CeedInt
}
}
}

//------------------------------------------------------------------------------
// AtPoints
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// E-vector -> single point
//------------------------------------------------------------------------------
template <int NUM_COMP, int NUM_PTS>
inline __device__ void ReadPoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem,
const CeedScalar *__restrict__ d_u, CeedScalar *r_u) {
const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;

if (p < points_in_elem) {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
r_u[comp] = d_u[ind + comp * strides_comp];
}
} else {
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
r_u[comp] = 0.0;
}
}
}

//------------------------------------------------------------------------------
// Single point -> E-vector
//------------------------------------------------------------------------------
template <int NUM_COMP, int NUM_PTS>
inline __device__ void WritePoint(SharedData_Cuda &data, const CeedInt elem, const CeedInt p, const CeedInt points_in_elem,
const CeedInt strides_point, const CeedInt strides_comp, const CeedInt strides_elem, const CeedScalar *r_v,
CeedScalar *d_v) {
if (p < points_in_elem) {
const CeedInt ind = (p % NUM_PTS) * strides_point + elem * strides_elem;

for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
d_v[ind + comp * strides_comp] = r_v[comp];
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -446,123 +446,3 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed
}
}
}

//------------------------------------------------------------------------------
// Loops over points
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// Interpolate to points
//------------------------------------------------------------------------------
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void InterpAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % NUM_PTS;
CeedScalar r_X[DIM];

for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
if (DIM == 1) {
InterpAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
} else if (DIM == 2) {
InterpAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
} else if (DIM == 3) {
InterpAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
}
if (i < NUM_PTS) {
for (CeedInt j = 0; j < NUM_COMP; j++) d_V[comp_stride * j + p] = r_V[j];
}
}
}

//------------------------------------------------------------------------------
// Interpolate from points
//------------------------------------------------------------------------------
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void InterpTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
CeedScalar *__restrict__ r_C) {
// Clear register
for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;

const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % NUM_PTS;
CeedScalar r_X[DIM];

for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
for (CeedInt j = 0; j < NUM_COMP; j++) {
if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
else r_U[j] = 0.0;
}
if (BASIS_DIM == 1) {
InterpTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 2) {
InterpTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 3) {
InterpTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
}
}
__syncthreads();
}

//------------------------------------------------------------------------------
// Gradient at points
//------------------------------------------------------------------------------
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void GradAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedScalar *__restrict__ r_C,
const CeedScalar *__restrict__ d_X, CeedScalar *__restrict__ r_V, CeedScalar *__restrict__ d_V) {
const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % NUM_PTS;
CeedScalar r_X[DIM];

for (CeedInt d = 0; d < BASIS_DIM; d++) r_X[d] = d_X[comp_stride * d + p];
if (DIM == 1) {
GradAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
} else if (DIM == 2) {
GradAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
} else if (DIM == 3) {
GradAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_C, r_X, r_V);
}
if (i < NUM_PTS) {
for (CeedInt j = 0; j < NUM_COMP * DIM; j++) d_V[comp_stride * j + p] = r_V[j];
}
}
}

//------------------------------------------------------------------------------
// Grad from points
//------------------------------------------------------------------------------
template <int DIM, int NUM_COMP, int NUM_PTS, int Q_1D>
inline __device__ void GradTransposeAtPoints(SharedData_Cuda &data, const CeedInt comp_stride, const CeedInt points_per_elem,
const CeedScalar *__restrict__ d_U, CeedScalar *__restrict__ r_U, const CeedScalar *__restrict__ d_X,
CeedScalar *__restrict__ r_C) {
// Clear register
for (CeedInt i = 0; i < NUM_COMP * (DIM > 2 ? Q_1D : 1); i++) r_C[i] = 0.0;

const CeedInt bound = (blockDim.x * blockDim.y) * ceil(1.0 * NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % NUM_PTS;
CeedScalar r_X[DIM];

for (CeedInt d = 0; d < DIM; d++) r_X[d] = d_X[comp_stride * d + p];
for (CeedInt j = 0; j < NUM_COMP * DIM; j++) {
if (i < points_per_elem) r_U[j] = d_U[comp_stride * j + p];
else r_U[j] = 0.0;
}
if (BASIS_DIM == 1) {
GradTransposeAtPoints1d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 2) {
GradTransposeAtPoints2d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 3) {
GradTransposeAtPoints3d<NUM_COMP, NUM_PTS, Q_1D>(data, i, r_U, r_X, r_C);
}
}
__syncthreads();
}
81 changes: 73 additions & 8 deletions include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

CeedScalar r_X[BASIS_DIM];
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
CeedScalar r_V[BASIS_NUM_COMP];
Expand All @@ -51,8 +52,21 @@ extern "C" __global__ void InterpAtPoints(const CeedInt num_elem, const CeedScal
}

// Map to points
InterpAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
&d_V[elem * BASIS_NUM_PTS]);
const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % BASIS_NUM_PTS;

ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
if (BASIS_DIM == 1) {
InterpAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
} else if (BASIS_DIM == 2) {
InterpAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
} else if (BASIS_DIM == 3) {
InterpAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
}
WritePoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
}
}
}

Expand All @@ -68,15 +82,33 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

CeedScalar r_X[BASIS_DIM];
CeedScalar r_U[BASIS_NUM_COMP];
CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
// Clear register
for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

// Map from points
InterpTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
&d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % BASIS_NUM_PTS;

ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
ReadPoint<BASIS_NUM_COMP, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U, r_U);
if (BASIS_DIM == 1) {
InterpTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 2) {
InterpTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 3) {
InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
}
}
__syncthreads();

// Map from coefficients
if (BASIS_DIM == 1) {
Expand Down Expand Up @@ -107,6 +139,7 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

CeedScalar r_X[BASIS_DIM];
CeedScalar r_U[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_P_1D : 1)];
CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
CeedScalar r_V[BASIS_NUM_COMP * BASIS_DIM];
Expand All @@ -127,8 +160,21 @@ extern "C" __global__ void GradAtPoints(const CeedInt num_elem, const CeedScalar
}

// Map to points
GradAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, r_C, &d_X[elem * BASIS_NUM_PTS], r_V,
&d_V[elem * BASIS_NUM_PTS]);
const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % BASIS_NUM_PTS;

ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
if (BASIS_DIM == 1) {
GradAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
} else if (BASIS_DIM == 2) {
GradAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
} else if (BASIS_DIM == 3) {
GradAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_C, r_X, r_V);
}
WritePoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, r_V, d_V);
}
}
}

Expand All @@ -144,15 +190,34 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x;
data.slice = slice + data.t_id_z * T_1D * (BASIS_DIM > 1 ? T_1D : 1);

CeedScalar r_X[BASIS_DIM];
CeedScalar r_U[BASIS_NUM_COMP * BASIS_DIM];
CeedScalar r_C[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];
CeedScalar r_V[BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1)];

// Apply basis element by element
for (CeedInt elem = blockIdx.x * blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x * blockDim.z) {
// Clear register
for (CeedInt i = 0; i < BASIS_NUM_COMP * (BASIS_DIM > 2 ? BASIS_Q_1D : 1); i++) r_C[i] = 0.0;

// Map from points
GradTransposeAtPoints<BASIS_DIM, BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, num_elem * BASIS_NUM_PTS, points_per_elem[elem],
&d_U[elem * BASIS_NUM_PTS], r_U, &d_X[elem * BASIS_NUM_PTS], r_C);
const CeedInt point_loop_bound = (blockDim.x * blockDim.y) * ceil(1.0 * BASIS_NUM_PTS / (blockDim.x * blockDim.y));

for (CeedInt i = threadIdx.x + threadIdx.y * blockDim.x; i < point_loop_bound; i += blockDim.x * blockDim.y) {
const CeedInt p = i % BASIS_NUM_PTS;

ReadPoint<BASIS_DIM, BASIS_NUM_PTS>(data, elem, p, BASIS_NUM_PTS, 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_X, r_X);
ReadPoint<BASIS_NUM_COMP * BASIS_DIM, BASIS_NUM_PTS>(data, elem, i, points_per_elem[elem], 1, num_elem * BASIS_NUM_PTS, BASIS_NUM_PTS, d_U,
r_U);
if (BASIS_DIM == 1) {
GradTransposeAtPoints1d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 2) {
GradTransposeAtPoints2d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
} else if (BASIS_DIM == 3) {
GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
}
}
__syncthreads();

// Map from coefficients
if (BASIS_DIM == 1) {
Expand Down
Loading

0 comments on commit 889b824

Please sign in to comment.