Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
jeremylt committed Dec 5, 2024
1 parent 290fc47 commit bc086f0
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 32 deletions.
99 changes: 76 additions & 23 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
// Setup fields
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
std::string option_name = (is_input ? "inputs" : "outputs");
Expand Down Expand Up @@ -163,16 +164,55 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
case CEED_EVAL_NONE:
break;
case CEED_EVAL_INTERP:
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
else data->B.outputs[i] = basis_data->d_interp_1d;
if (is_at_points) {
// AtPoints
if (!basis_data->d_chebyshev_interp_1d) {
CeedSize interp_bytes;
CeedScalar *chebyshev_interp_1d;

interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
CeedCallCuda(CeedBasisReturnCeed(basis),
cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
CeedCallBackend(CeedFree(&chebyshev_interp_1d));
}
if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
} else {
// Standard quadrature
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
else data->B.outputs[i] = basis_data->d_interp_1d;
}
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
code << " loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
break;
case CEED_EVAL_GRAD:
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
else data->B.outputs[i] = basis_data->d_interp_1d;
if (is_at_points) {
// AtPoints
if (!basis_data->d_chebyshev_interp_1d) {
CeedSize interp_bytes;
CeedScalar *chebyshev_interp_1d;

interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
CeedCallCuda(CeedBasisReturnCeed(basis),
cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
CeedCallBackend(CeedFree(&chebyshev_interp_1d));
}
if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
} else {
// Standard quadrature
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
else data->B.outputs[i] = basis_data->d_interp_1d;
}
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
code << " loadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
if (is_at_points) break; // No G mat for AtPoints
if (use_3d_slices) {
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
Expand Down Expand Up @@ -209,7 +249,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
CeedInt Q_1d, bool is_input, bool use_3d_slices) {
CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix;
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand Down Expand Up @@ -318,7 +358,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
bool use_3d_slices) {
bool is_at_points, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
CeedEvalMode eval_mode = CEED_EVAL_NONE;
Expand Down Expand Up @@ -421,7 +461,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
CeedInt num_output_fields, CeedOperatorField *op_output_fields,
CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
bool use_3d_slices) {
bool is_at_points, bool use_3d_slices) {
std::string Q_name = "Q_1d";
CeedEvalMode eval_mode = CEED_EVAL_NONE;
CeedElemRestriction elem_rstr;
Expand Down Expand Up @@ -636,9 +676,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
// Build single operator kernel
//------------------------------------------------------------------------------
extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
bool is_tensor = true, use_3d_slices = false;
bool is_tensor = true, is_at_points = false, use_3d_slices = false;
Ceed ceed;
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1;
CeedInt Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0;
CeedQFunctionField *qf_input_fields, *qf_output_fields;
CeedQFunction_Cuda_gen *qf_data;
CeedQFunction qf;
Expand All @@ -661,17 +701,23 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));

// Get operator data
CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
if (dim == 0) dim = 1;
data->dim = dim;
if (Q_1d == 0) {
CeedInt Q;

CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
Q_1d = Q;
CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
}
data->Q_1d = Q_1d;
if (is_at_points) {
CeedElemRestriction rstr_points = NULL;

CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
}
if (is_at_points) use_3d_slices = false;

// Check for restriction only identity operator
{
Expand Down Expand Up @@ -705,6 +751,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
// TODO: Add non-tensor, AtPoints
code << "// Tensor basis source\n";
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
code << "// AtPoints basis source\n";
code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
code << "// CodeGen operator source\n";
code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";

Expand Down Expand Up @@ -746,7 +794,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
code << "// -----------------------------------------------------------------------------\n";
code << "extern \"C\" __global__ void " << operator_name
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n";
<< "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
"points) {\n";

// Scratch buffers
for (CeedInt i = 0; i < num_input_fields; i++) {
Expand Down Expand Up @@ -776,11 +825,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
// Initialize constants, and matrices B and G
code << "\n // Input field constants and basis data\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
CeedCallBackend(
CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
}
code << "\n // Output field constants and basis data\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedCallBackend(
CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
}

// Loop over all elements
Expand Down Expand Up @@ -867,27 +918,29 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {

// ---- Restriction
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
Q_1d, true, use_3d_slices));
Q_1d, true, is_at_points, use_3d_slices));

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
CeedCallBackend(
CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
}

// -- Q function
CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, use_3d_slices));

// -- Output basis and restriction
code << "\n // -- Output field basis action and restrictions\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
code << " // ---- Output field " << i << "\n";

// ---- Basis action
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points,
use_3d_slices));

// ---- Restriction
CeedCallBackend(
CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
is_at_points, use_3d_slices));
}

// Close loop and function
Expand Down
60 changes: 51 additions & 9 deletions backends/cuda-gen/ceed-cuda-gen-operator.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@
// Destroy operator
//------------------------------------------------------------------------------
static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
Ceed ceed;
CeedOperator_Cuda_gen *impl;

CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
CeedCallBackend(CeedOperatorGetData(op, &impl));
if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
CeedCallBackend(CeedFree(&impl));
CeedCallBackend(CeedDestroy(&ceed));
return CEED_ERROR_SUCCESS;
}

Expand Down Expand Up @@ -92,6 +96,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
// Apply and add to output
//------------------------------------------------------------------------------
static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
bool is_at_points;
Ceed ceed;
Ceed_Cuda *cuda_data;
CeedInt num_elem, num_input_fields, num_output_fields;
Expand Down Expand Up @@ -181,25 +186,52 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
}
}

// Point coordinates, if needed
CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
if (is_at_points) {
// Coords
CeedVector vec;

CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
CeedCallBackend(CeedVectorDestroy(&vec));

// Points per elem
if (num_elem != data->points.num_elem) {
CeedInt *points_per_elem;
const CeedInt num_bytes = num_elem * sizeof(CeedInt);
CeedElemRestriction rstr_points = NULL;

data->points.num_elem = num_elem;
CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
for (CeedInt e = 0; e < num_elem; e++) {
CeedInt num_points_elem;

CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
points_per_elem[e] = num_points_elem;
}
if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
CeedCallBackend(CeedFree(&points_per_elem));
}
}

// Get context data
CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));

// Apply operator
void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
void *opargs[] = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
const CeedInt dim = data->dim;
const CeedInt Q_1d = data->Q_1d;
const CeedInt P_1d = data->max_P_1d;
const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
int max_threads_per_block, min_grid_size;
int max_threads_per_block, min_grid_size, grid;

CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
int block[3] =
{
thread_1d,
dim < 2 ? 1 : thread_1d,
-1,
},
grid;
int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};

CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
Expand Down Expand Up @@ -236,6 +268,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
if (is_active) vec = output_vec;
// Check for multiple output modes
CeedInt index = -1;

for (CeedInt j = 0; j < i; j++) {
if (vec == output_vecs[j]) {
index = j;
Expand All @@ -249,6 +282,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
}
}

// Restore point coordinates, if needed
if (is_at_points) {
CeedVector vec;

CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
CeedCallBackend(CeedVectorDestroy(&vec));
}

// Restore context data
CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
CeedCallBackend(CeedDestroy(&ceed));
Expand Down
1 change: 1 addition & 0 deletions backends/cuda-gen/ceed-cuda-gen.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ typedef struct {
Fields_Cuda B;
Fields_Cuda G;
CeedScalar *W;
Points_Cuda points;
} CeedOperator_Cuda_gen;

typedef struct {
Expand Down
6 changes: 6 additions & 0 deletions include/ceed/jit-source/cuda/cuda-types.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ typedef struct {
CeedInt *outputs[CEED_CUDA_NUMBER_FIELDS];
} FieldsInt_Cuda;

typedef struct {
CeedInt num_elem;
const CeedInt *num_per_elem;
const CeedScalar *coords;
} Points_Cuda;

typedef struct {
CeedInt t_id_x;
CeedInt t_id_y;
Expand Down

0 comments on commit bc086f0

Please sign in to comment.