diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp index d19eedd491..9fadf6c38e 100644 --- a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp +++ b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp @@ -127,8 +127,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie // Setup fields //------------------------------------------------------------------------------ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field, - CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_tensor, bool is_at_points, - bool use_3d_slices) { + CeedQFunctionField qf_field, CeedInt field_reuse[3], CeedInt Q_1d, bool is_input, bool is_tensor, + bool is_at_points, bool use_3d_slices) { std::string var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i); std::string P_name = (is_tensor ? "P_1d" : "P") + var_suffix, Q_name = is_tensor ? "Q_1d" : "Q"; std::string option_name = (is_input ? "inputs" : "outputs"); @@ -138,6 +138,12 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C CeedBasis_Cuda_shared *basis_data; CeedBasis basis; + // Field reuse info + bool use_previous_field = field_reuse[0] != -1; + bool reuse_input = field_reuse[1]; + CeedInt reuse_field = field_reuse[0]; + CeedEvalMode reuse_mode = (CeedEvalMode)field_reuse[2]; + code << " // -- " << (is_input ? "Input" : "Output") << " field " << i << "\n"; // Get field data @@ -188,8 +194,14 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C if (is_input) data->B.inputs[i] = basis_data->d_interp_1d; else data->B.outputs[i] = basis_data->d_interp_1d; } - code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; - code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; + if (use_previous_field) { + std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field)); + + code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; + } else { + code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; + code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; + } break; case CEED_EVAL_GRAD: if (is_at_points) { @@ -214,27 +226,51 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C else data->B.outputs[i] = basis_data->d_interp_1d; } if (is_tensor) { - code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; - code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; + if (use_previous_field) { + std::string reuse_var = "s_B" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field)); + + code << " CeedScalar *s_B" << var_suffix << " = " << reuse_var << ";\n"; + } else { + code << " __shared__ CeedScalar s_B" << var_suffix << "[" << P_name << "*" << Q_name << "];\n"; + code << " LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n"; + } } if (is_at_points) break; // No G mat for AtPoints if (use_3d_slices) { if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d; else data->G.outputs[i] = basis_data->d_collo_grad_1d; - code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; - code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) { + std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field)); + + code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else { + code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; + code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } } else { bool has_collo_grad = basis_data->d_collo_grad_1d; if (is_input) data->G.inputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; else data->G.outputs[i] = has_collo_grad ? basis_data->d_collo_grad_1d : basis_data->d_grad_1d; if (has_collo_grad) { - code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; - code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) { + std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field)); + + code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else { + code << " __shared__ CeedScalar s_G" << var_suffix << "[" << Q_name << "*" << Q_name << "];\n"; + code << " LoadMatrix<" << Q_name << ", " << Q_name << ">(data, G." << option_name << "[" << i << "], s_G" << var_suffix << ");\n"; + } } else { - code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n"; - code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G" - << var_suffix << ");\n"; + if (use_previous_field && reuse_mode == CEED_EVAL_GRAD) { + std::string reuse_var = "s_G" + ((reuse_input ? "_in_" : "_out_") + std::to_string(reuse_field)); + + code << " CeedScalar *s_G" << var_suffix << " = " << reuse_var << ";\n"; + } else { + code << " __shared__ CeedScalar s_G" << var_suffix << "[" << P_name << "*" << Q_name << (is_tensor ? "" : "*dim") << "];\n"; + code << " LoadMatrix<" << P_name << ", " << Q_name << (is_tensor ? "" : "*dim") << ">(data, G." << option_name << "[" << i << "], s_G" + << var_suffix << ");\n"; + } } } break; @@ -1079,16 +1115,116 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) { code << " data.t_id = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;\n"; code << " data.slice = slice + data.t_id_z*T_1D" << ((!is_tensor || dim == 1) ? "" : "*T_1D") << ";\n"; + // -- Determine input mat reuse + CeedInt input_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode + + for (CeedInt i = 0; i < num_input_fields; i++) { + input_matrix_reuse[i][0] = -1; + } + for (CeedInt i = 0; i < num_input_fields; i++) { + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode_i)); + if (eval_mode_i == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[i], &basis_i)); + for (CeedInt j = 0; (input_matrix_reuse[i][0] == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + input_matrix_reuse[i][0] = j; + input_matrix_reuse[i][1] = true; + input_matrix_reuse[i][2] = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + input_matrix_reuse[i][0] = j; + input_matrix_reuse[i][1] = true; + input_matrix_reuse[i][2] = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + + // -- Determine output mat reuse + CeedInt output_matrix_reuse[CEED_FIELD_MAX][3]; // field, is_input, eval_mode + + for (CeedInt i = 0; i < num_output_fields; i++) { + output_matrix_reuse[i][0] = -1; + } + for (CeedInt i = 0; i < num_output_fields; i++) { + CeedEvalMode eval_mode_i; + CeedBasis basis_i; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode_i)); + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[i], &basis_i)); + for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < num_input_fields); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_input_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_input_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i][0] = j; + output_matrix_reuse[i][1] = true; + output_matrix_reuse[i][2] = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i][0] = j; + output_matrix_reuse[i][1] = true; + output_matrix_reuse[i][2] = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + for (CeedInt j = 0; (output_matrix_reuse[i][0] == -1) && (j < i); j++) { + CeedEvalMode eval_mode_j; + CeedBasis basis_j; + + CeedCallBackend(CeedQFunctionFieldGetEvalMode(qf_output_fields[j], &eval_mode_j)); + if (eval_mode_j == CEED_EVAL_WEIGHT) continue; + CeedCallBackend(CeedOperatorFieldGetBasis(op_output_fields[j], &basis_j)); + if (basis_i == basis_j) { + if (is_tensor) { + output_matrix_reuse[i][0] = j; + output_matrix_reuse[i][1] = false; + output_matrix_reuse[i][2] = eval_mode_j; + } else { + // For non-tensor can only re-use with the same eval mode + if (eval_mode_i == eval_mode_j) { + output_matrix_reuse[i][0] = j; + output_matrix_reuse[i][1] = false; + output_matrix_reuse[i][2] = eval_mode_j; + } + } + } + CeedCallBackend(CeedBasisDestroy(&basis_j)); + } + CeedCallBackend(CeedBasisDestroy(&basis_i)); + } + // Initialize constants, and matrices B and G code << "\n // Input field constants and basis data\n"; for (CeedInt i = 0; i < num_input_fields; i++) { - CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_tensor, - is_at_points, use_3d_slices)); + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], input_matrix_reuse[i], Q_1d, + true, is_tensor, is_at_points, use_3d_slices)); } code << "\n // Output field constants and basis data\n"; for (CeedInt i = 0; i < num_output_fields; i++) { - CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_tensor, - is_at_points, use_3d_slices)); + CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], output_matrix_reuse[i], Q_1d, + false, is_tensor, is_at_points, use_3d_slices)); } // Loop over all elements