cuda - remove duplicate mats in gen #1739

Open · wants to merge 1 commit into base: main
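The commit teaches the cuda-gen kernel builder to notice when two operator fields use the same CeedBasis: instead of emitting a second __shared__ array and LoadMatrix call for the later field, its s_B/s_G pointer is aliased to the matrices already loaded for the earlier field. Below is a rough standalone sketch of the duplicate-detection scan for input fields; the Field/Reuse types and the basis_id comparison are hypothetical stand-ins for the libCEED objects, not the actual API.

// Illustrative sketch only: a standalone approximation of the duplicate-matrix
// scan added in CeedOperatorBuildKernel_Cuda_gen. The Field/Reuse types and the
// basis_id comparison are hypothetical stand-ins for the libCEED objects.
#include <cstdio>
#include <vector>

enum EvalMode { EVAL_INTERP, EVAL_GRAD, EVAL_WEIGHT };

struct Field {
  int      basis_id;  // stand-in for the CeedBasis pointer compared in the PR
  EvalMode eval_mode;
};

struct Reuse {
  int      field;      // earlier field whose shared matrices can be aliased (-1: none)
  bool     is_input;   // whether that earlier field is an input field
  EvalMode eval_mode;  // eval mode of the field being reused
};

// Field i may alias the matrices of an earlier field j when both use the same
// basis; for non-tensor bases the eval modes must also match.
static std::vector<Reuse> BuildInputReuse(const std::vector<Field> &inputs, bool is_tensor) {
  std::vector<Reuse> reuse(inputs.size(), Reuse{-1, false, EVAL_INTERP});

  for (size_t i = 0; i < inputs.size(); i++) {
    if (inputs[i].eval_mode == EVAL_WEIGHT) continue;
    for (size_t j = 0; reuse[i].field == -1 && j < i; j++) {
      if (inputs[j].eval_mode == EVAL_WEIGHT) continue;
      if (inputs[i].basis_id != inputs[j].basis_id) continue;
      if (is_tensor || inputs[i].eval_mode == inputs[j].eval_mode) {
        reuse[i] = {static_cast<int>(j), true, inputs[j].eval_mode};
      }
    }
  }
  return reuse;
}

int main() {
  // Two fields sharing basis 0 plus a quadrature-weight field: field 1 can
  // alias the matrices loaded for field 0, the weight field loads no B/G.
  std::vector<Field> inputs = {{0, EVAL_INTERP}, {0, EVAL_GRAD}, {-1, EVAL_WEIGHT}};
  std::vector<Reuse> reuse  = BuildInputReuse(inputs, /*is_tensor=*/true);

  for (size_t i = 0; i < reuse.size(); i++) {
    if (reuse[i].field == -1) printf("input %zu: no reuse (loads its own matrices, if any)\n", i);
    else printf("input %zu: aliases the matrices of input %d\n", i, reuse[i].field);
  }
  return 0;
}

In the actual diff the same scan is repeated for output fields, which may additionally reuse matrices from input fields; the is_input flag in the reuse record distinguishes the two cases.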
170 changes: 153 additions & 17 deletions backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -127,8 +127,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
// Setup fields
//------------------------------------------------------------------------------
static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points,
bool use_3d_slices) {
CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor,
bool is_at_points, bool use_3d_slices) {
std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q";
std::string option_name = (is_input ? "inputs" : "outputs");
@@ -138,6 +138,12 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
CeedBasis_Cuda_shared *basis_data;
CeedBasis basis;

// Field reuse info
bool use_previous_field = field_reuse[0] != -1;
bool reuse_input = field_reuse[1];
CeedInt reuse_field = field_reuse[0];
CeedEvalMode reuse_mode = (CeedEvalMode)field_reuse[2];

code << " // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n";

// Get field data
@@ -188,8 +194,14 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
else data->B.outputs[i] = basis_data->d_interp_1d;
}
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
if (use_previous_field) {
std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));

code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
} else {
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
}
break;
case CEED_EVAL_GRAD:
if (is_at_points) {
@@ -214,27 +226,51 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
else data->B.outputs[i] = basis_data->d_interp_1d;
}
if (is_tensor) {
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
if (use_previous_field) {
std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));

code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n";
} else {
code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
}
}
if (is_at_points) break; // No G mat for AtPoints
if (use_3d_slices) {
if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
else data->G.outputs[i] = basis_data->d_collo_grad_1d;
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));

code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else {
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
}
} else {
bool has_collo_grad = basis_data->d_collo_grad_1d;

if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d;
if (has_collo_grad) {
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));

code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else {
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n";
code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n";
}
} else {
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
<< var_suffix << ");\n";
if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) {
std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field));

code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n";
} else {
code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n";
code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G"
<< var_suffix << ");\n";
}
}
}
break;
@@ -1079,16 +1115,116 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n";
code << " data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n";

// -- Determine input mat reuse
CeedInt input_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode

for (CeedInt i = 0; i < num_input_fields; i++) {
input_matrix_reuse[i][0] = -1;
}
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedEvalMode eval_mode_i;
CeedBasis basis_i;

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i));
if (eval_mode_i == CEED_EVAL_WEIGHT) continue;
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i));
for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) {
CeedEvalMode eval_mode_j;
CeedBasis basis_j;

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
if (basis_i == basis_j) {
if (is_tensor) {
input_matrix_reuse[i][0] = j;
input_matrix_reuse[i][1] = true;
input_matrix_reuse[i][2] = eval_mode_j;
} else {
// For non-tensor bases, can only re-use with the same eval mode
if (eval_mode_i == eval_mode_j) {
input_matrix_reuse[i][0] = j;
input_matrix_reuse[i][1] = true;
input_matrix_reuse[i][2] = eval_mode_j;
}
}
}
CeedCallBackend(CeedBasisDestroy(&basis_j));
}
CeedCallBackend(CeedBasisDestroy(&basis_i));
}

// -- Determine output mat reuse
CeedInt output_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode

for (CeedInt i = 0; i < num_output_fields; i++) {
output_matrix_reuse[i][0] = -1;
}
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedEvalMode eval_mode_i;
CeedBasis basis_i;

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i));
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i));
for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) {
CeedEvalMode eval_mode_j;
CeedBasis basis_j;

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j));
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j));
if (basis_i == basis_j) {
if (is_tensor) {
output_matrix_reuse[i][0] = j;
output_matrix_reuse[i][1] = true;
output_matrix_reuse[i][2] = eval_mode_j;
} else {
// For non-tensor bases, can only re-use with the same eval mode
if (eval_mode_i == eval_mode_j) {
output_matrix_reuse[i][0] = j;
output_matrix_reuse[i][1] = true;
output_matrix_reuse[i][2] = eval_mode_j;
}
}
}
CeedCallBackend(CeedBasisDestroy(&basis_j));
}
for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) {
CeedEvalMode eval_mode_j;
CeedBasis basis_j;

CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j));
if (eval_mode_j == CEED_EVAL_WEIGHT) continue;
CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j));
if (basis_i == basis_j) {
if (is_tensor) {
output_matrix_reuse[i][0] = j;
output_matrix_reuse[i][1] = false;
output_matrix_reuse[i][2] = eval_mode_j;
} else {
// For non-tensor bases, can only re-use with the same eval mode
if (eval_mode_i == eval_mode_j) {
output_matrix_reuse[i][0] = j;
output_matrix_reuse[i][1] = false;
output_matrix_reuse[i][2] = eval_mode_j;
}
}
}
CeedCallBackend(CeedBasisDestroy(&basis_j));
}
CeedCallBackend(CeedBasisDestroy(&basis_i));
}

// Initialize constants, and matrices B and G
code << "\n // Input field constants and basis data\n";
for (CeedInt i = 0; i < num_input_fields; i++) {
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor,
is_at_points, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d,
true, is_tensor, is_at_points, use_3d_slices));
}
code << "\n // Output field constants and basis data\n";
for (CeedInt i = 0; i < num_output_fields; i++) {
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor,
is_at_points, use_3d_slices));
CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d,
false, is_tensor, is_at_points, use_3d_slices));
}

// Loop over all elements
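
For reference, here is a minimal self-contained CUDA sketch (hypothetical kernel and array names, not the emitted libCEED source) of the shared-memory pattern the generator now produces: the first field loads its interpolation matrix into a __shared__ array once, and a later field with the same basis simply aliases that array through a plain pointer, matching the "CeedScalar *s_B... = ...;" lines emitted above.

// Hypothetical, simplified kernel; P_1D/Q_1D and the two-field layout are made
// up for illustration and are not the generated libCEED code.
#include <cstdio>
#include <cuda_runtime.h>

#define P_1D 3
#define Q_1D 4

__global__ void ApplyTwoFields(const double *d_B, const double *d_u0, const double *d_u1, double *d_v0, double *d_v1) {
  // Field 0 owns the shared copy of B (P_1D x Q_1D).
  __shared__ double s_B_in_0[P_1D * Q_1D];
  for (int k = threadIdx.x; k < P_1D * Q_1D; k += blockDim.x) s_B_in_0[k] = d_B[k];
  __syncthreads();

  // Field 1 uses the same basis, so it aliases field 0's shared array instead
  // of declaring and loading a second copy (the pattern this PR emits).
  double *s_B_in_1 = s_B_in_0;

  // Toy interpolation: the first Q_1D threads each compute one quadrature value per field.
  if (threadIdx.x < Q_1D) {
    double v0 = 0.0, v1 = 0.0;
    for (int p = 0; p < P_1D; p++) {
      v0 += s_B_in_0[p * Q_1D + threadIdx.x] * d_u0[p];
      v1 += s_B_in_1[p * Q_1D + threadIdx.x] * d_u1[p];
    }
    d_v0[threadIdx.x] = v0;
    d_v1[threadIdx.x] = v1;
  }
}

int main() {
  double h_B[P_1D * Q_1D], h_u0[P_1D], h_u1[P_1D], h_v0[Q_1D], h_v1[Q_1D];
  for (int k = 0; k < P_1D * Q_1D; k++) h_B[k] = 1.0;
  for (int p = 0; p < P_1D; p++) { h_u0[p] = 1.0; h_u1[p] = 2.0; }

  double *d_B, *d_u0, *d_u1, *d_v0, *d_v1;
  cudaMalloc(&d_B, sizeof(h_B));
  cudaMalloc(&d_u0, sizeof(h_u0));
  cudaMalloc(&d_u1, sizeof(h_u1));
  cudaMalloc(&d_v0, sizeof(h_v0));
  cudaMalloc(&d_v1, sizeof(h_v1));
  cudaMemcpy(d_B, h_B, sizeof(h_B), cudaMemcpyHostToDevice);
  cudaMemcpy(d_u0, h_u0, sizeof(h_u0), cudaMemcpyHostToDevice);
  cudaMemcpy(d_u1, h_u1, sizeof(h_u1), cudaMemcpyHostToDevice);

  ApplyTwoFields<<<1, 32>>>(d_B, d_u0, d_u1, d_v0, d_v1);
  cudaMemcpy(h_v0, d_v0, sizeof(h_v0), cudaMemcpyDeviceToHost);
  cudaMemcpy(h_v1, d_v1, sizeof(h_v1), cudaMemcpyDeviceToHost);
  printf("v0[0] = %g, v1[0] = %g\n", h_v0[0], h_v1[0]);  // expect 3 and 6
  cudaFree(d_B); cudaFree(d_u0); cudaFree(d_u1); cudaFree(d_v0); cudaFree(d_v1);
  return 0;
}

When several fields share one basis, this pattern removes both the extra __shared__ arrays and the redundant LoadMatrix passes from the generated kernel.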