Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuda - remove duplicate mats in gen #1739

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

jeremylt
Copy link
Member

@jeremylt jeremylt commented Feb 6, 2025

Fixes #1737

This should let us run bigger elements for gen, and its just silly to do work we don't need to do.

Sample output:

// -----------------------------------------------------------------------------
// Operator Kernel
// 
// d_[in,out]_i:   CeedVector device array
// r_[in,out]_e_i: Element vector register
// r_[in,out]_q_i: Quadrature space vector register
// r_[in,out]_c_i: AtPoints Chebyshev coefficients register
// r_[in,out]_s_i: Quadrature space slice vector register
// 
// s_B_[in,out]_i: Interpolation matrix, shared memory
// s_G_[in,out]_i: Gradient matrix, shared memory
// -----------------------------------------------------------------------------
extern "C" __global__ void CeedKernelCudaGenOperator_Poisson3DApply(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda points) {
  const CeedScalar *d_in_0 = fields.inputs[0];
  const CeedScalar *d_in_1 = fields.inputs[1];
  CeedScalar *d_out_0 = fields.outputs[0];
  const CeedInt dim = 3;
  const CeedInt Q_1d = 6;
  extern __shared__ CeedScalar slice[];
  SharedData_Cuda data;
  data.t_id_x = threadIdx.x;
  data.t_id_y = threadIdx.y;
  data.t_id_z = threadIdx.z;
  data.t_id  = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*blockDim.y*blockDim.x;
  data.slice = slice + data.t_id_z*T_1D*T_1D;

  // Input field constants and basis data
  // -- Input field 0
  const CeedInt P_1d_in_0 = 5;
  const CeedInt num_comp_in_0 = 1;
  // EvalMode: gradient
  __shared__ CeedScalar s_B_in_0[P_1d_in_0*Q_1d];
  LoadMatrix<P_1d_in_0, Q_1d>(data, B.inputs[0], s_B_in_0);
  __shared__ CeedScalar s_G_in_0[Q_1d*Q_1d];
  LoadMatrix<Q_1d, Q_1d>(data, G.inputs[0], s_G_in_0);
  // -- Input field 1
  const CeedInt P_1d_in_1 = 6;
  const CeedInt num_comp_in_1 = 6;
  // EvalMode: none

  // Output field constants and basis data
  // -- Output field 0
  const CeedInt P_1d_out_0 = 5;
  const CeedInt num_comp_out_0 = 1;
  // EvalMode: gradient
  CeedScalar *s_B_out_0 = s_B_in_0;
  CeedScalar *s_G_out_0 = s_G_in_0;

  // Element loop
  __syncthreads();
  for (CeedInt elem = blockIdx.x*blockDim.z + threadIdx.z; elem < num_elem; elem += gridDim.x*blockDim.z) {
    // Scratch restriction buffer space
    CeedScalar r_e_scratch[1296];

    // -- Input field restrictions and basis actions
    // ---- Input field 0
    CeedScalar *r_e_in_0 = r_e_scratch;
    const CeedInt l_size_in_0 = 274625;
    // CompStride: 274625
    ReadLVecStandard3d<num_comp_in_0, 274625, P_1d_in_0>(data, l_size_in_0, elem, indices.inputs[0], d_in_0, r_e_in_0);
    // EvalMode: gradient
    CeedScalar r_q_in_0[num_comp_in_0*Q_1d];
    InterpTensor3d<num_comp_in_0, P_1d_in_0, Q_1d>(data, r_e_in_0, s_B_in_0, r_q_in_0);
    // ---- Input field 1
    CeedScalar r_e_in_1[num_comp_in_1*P_1d_in_1];
    // Strides: {1, 884736, 216}
    ReadLVecStrided3d<num_comp_in_1, P_1d_in_1, 1, 884736, 216>(data, elem, d_in_1, r_e_in_1);
    // EvalMode: none

    // -- Output field setup
    // ---- Output field 0
    CeedScalar r_q_out_0[num_comp_out_0*Q_1d];
    for (CeedInt i = 0; i < num_comp_out_0*Q_1d; i++) {
      r_q_out_0[i] = 0.0;
    }

    // Note: Using planes of 3D elements
    #pragma unroll
    for (CeedInt q = 0; q < Q_1d; q++) {
      // -- Input fields
      // ---- Input field 0
      // EvalMode: gradient
      CeedScalar r_s_in_0[num_comp_in_0*dim];
      GradColloSlice3d<num_comp_in_0, Q_1d>(data, q, r_q_in_0, s_G_in_0, r_s_in_0);
      // ---- Input field 1
      // EvalMode: none
      CeedScalar r_s_in_1[num_comp_in_1];
      // Strides: {1, 884736, 216}
      ReadEVecSliceStrided3d<num_comp_in_1, Q_1d, 1, 884736, 216>(data, elem, q, d_in_1, r_s_in_1);

      // -- Output fields
      // ---- Output field 0
      CeedScalar r_s_out_0[num_comp_out_0*dim];

      // -- QFunction inputs and outputs
      // ---- Inputs
      CeedScalar *inputs[2];
      // ------ Input field 0
      inputs[0] = r_s_in_0;
      // ------ Input field 1
      inputs[1] = r_s_in_1;
      // ---- Outputs
      CeedScalar *outputs[1];
      // ------ Output field 0
      outputs[0] = r_s_out_0;

      // -- Apply QFunction
      Poisson3DApply(ctx, 1, inputs, outputs);

      // -- Output fields
      // ---- Output field 0
      // EvalMode: gradient
      GradColloSliceTranspose3d<num_comp_out_0, Q_1d>(data, q, r_s_out_0, s_G_out_0, r_q_out_0);
    }

    // -- Output field basis action and restrictions
    // ---- Output field 0
    // EvalMode: gradient
    CeedScalar *r_e_out_0 = r_e_scratch;
    InterpTransposeTensor3d<num_comp_out_0, P_1d_out_0, Q_1d>(data, r_q_out_0, s_B_out_0, r_e_out_0);
    const CeedInt l_size_out_0 = 274625;
    // CompStride: 274625
    WriteLVecStandard3d<num_comp_out_0, 274625, P_1d_out_0>(data, l_size_out_0, elem, indices.outputs[0], r_e_out_0, d_out_0);
  }
}
// -----------------------------------------------------------------------------

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

De-duplicate Gen Basis Matrices
1 participant