wip

jeremylt · jeremylt · commit 16898a3ce133 · 2024-12-11T13:44:42.000-07:00
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp b/backends/cuda-gen/ceed-cuda-gen-operator-build.cpp
@@ -124,7 +124,8 @@ static int CeedOperatorBuildKernelData_Cuda_gen(Ceed ceed, CeedInt num_input_fie
 // Setup fields
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedOperatorField op_field,
-                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+                                                     CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input, bool is_at_points,
+                                                     bool use_3d_slices) {
   std::string            var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string            P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
   std::string            option_name = (is_input ? "inputs" : "outputs");
@@ -163,16 +164,55 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
     case CEED_EVAL_NONE:
       break;
     case CEED_EVAL_INTERP:
-      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
-      else data->B.outputs[i] = basis_data->d_interp_1d;
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallCuda(CeedBasisReturnCeed(basis),
+                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
       break;
     case CEED_EVAL_GRAD:
-      if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
-      else data->B.outputs[i] = basis_data->d_interp_1d;
+      if (is_at_points) {
+        // AtPoints
+        if (!basis_data->d_chebyshev_interp_1d) {
+          CeedSize    interp_bytes;
+          CeedScalar *chebyshev_interp_1d;
+
+          interp_bytes = P_1d * Q_1d * sizeof(CeedScalar);
+          CeedCallBackend(CeedCalloc(P_1d * Q_1d, &chebyshev_interp_1d));
+          CeedCallBackend(CeedBasisGetChebyshevInterp1D(basis, chebyshev_interp_1d));
+          CeedCallCuda(CeedBasisReturnCeed(basis), cudaMalloc((void **)&basis_data->d_chebyshev_interp_1d, interp_bytes));
+          CeedCallCuda(CeedBasisReturnCeed(basis),
+                       cudaMemcpy(basis_data->d_chebyshev_interp_1d, chebyshev_interp_1d, interp_bytes, cudaMemcpyHostToDevice));
+          CeedCallBackend(CeedFree(&chebyshev_interp_1d));
+        }
+        if (is_input) data->B.inputs[i] = basis_data->d_chebyshev_interp_1d;
+        else data->B.outputs[i] = basis_data->d_chebyshev_interp_1d;
+      } else {
+        // Standard quadrature
+        if (is_input) data->B.inputs[i] = basis_data->d_interp_1d;
+        else data->B.outputs[i] = basis_data->d_interp_1d;
+      }
       code << "  __shared__ CeedScalar s_B" << var_suffix << "[" << P_1d * Q_1d << "];\n";
       code << "  LoadMatrix<" << P_name << ", " << Q_name << ">(data, B." << option_name << "[" << i << "], s_B" << var_suffix << ");\n";
+      if (is_at_points) break;  // No G mat for AtPoints
       if (use_3d_slices) {
         if (is_input) data->G.inputs[i] = basis_data->d_collo_grad_1d;
         else data->G.outputs[i] = basis_data->d_collo_grad_1d;
@@ -208,7 +248,7 @@ static int CeedOperatorBuildKernelFieldData_Cuda_gen(std::ostringstream &code, C
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
                                                        CeedInt field_input_buffer[], CeedOperatorField op_field, CeedQFunctionField qf_field,
-                                                       CeedInt Q_1d, bool is_input, bool use_3d_slices) {
+                                                       CeedInt Q_1d, bool is_input, bool is_at_points, bool use_3d_slices) {
   std::string               var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string               P_name     = "P_1d" + var_suffix;
   CeedEvalMode              eval_mode  = CEED_EVAL_NONE;
@@ -335,7 +375,7 @@ static int CeedOperatorBuildKernelRestriction_Cuda_gen(std::ostringstream &code,
 //------------------------------------------------------------------------------
 static int CeedOperatorBuildKernelBasis_Cuda_gen(std::ostringstream &code, CeedOperator_Cuda_gen *data, CeedInt i, CeedInt dim,
                                                  CeedOperatorField op_field, CeedQFunctionField qf_field, CeedInt Q_1d, bool is_input,
-                                                 bool use_3d_slices) {
+                                                 bool is_at_points, bool use_3d_slices) {
   std::string         var_suffix = (is_input ? "_in_" : "_out_") + std::to_string(i);
   std::string         P_name = "P_1d" + var_suffix, Q_name = "Q_1d";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
@@ -438,7 +478,7 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
                                                      CeedOperatorField *op_input_fields, CeedQFunctionField *qf_input_fields,
                                                      CeedInt num_output_fields, CeedOperatorField *op_output_fields,
                                                      CeedQFunctionField *qf_output_fields, std::string qfunction_name, CeedInt Q_1d,
-                                                     bool use_3d_slices) {
+                                                     bool is_at_points, bool use_3d_slices) {
   std::string         Q_name    = "Q_1d";
   CeedEvalMode        eval_mode = CEED_EVAL_NONE;
   CeedElemRestriction elem_rstr;
@@ -653,9 +693,9 @@ static int CeedOperatorBuildKernelQFunction_Cuda_gen(std::ostringstream &code, C
 // Build single operator kernel
 //------------------------------------------------------------------------------
 extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
-  bool                    is_tensor = true, use_3d_slices = false;
+  bool                    is_tensor = true, is_at_points = false, use_3d_slices = false;
   Ceed                    ceed;
-  CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1;
+  CeedInt                 Q_1d, num_input_fields, num_output_fields, dim = 1, max_num_points = 0;
   CeedQFunctionField     *qf_input_fields, *qf_output_fields;
   CeedQFunction_Cuda_gen *qf_data;
   CeedQFunction           qf;
@@ -678,17 +718,23 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   CeedCallBackend(CeedQFunctionGetFields(qf, NULL, &qf_input_fields, NULL, &qf_output_fields));
 
   // Get operator data
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
   CeedCallBackend(CeedOperatorBuildKernelData_Cuda_gen(ceed, num_input_fields, op_input_fields, qf_input_fields, num_output_fields, op_output_fields,
                                                        qf_output_fields, &data->max_P_1d, &Q_1d, &dim, &is_tensor, &use_3d_slices));
   if (dim == 0) dim = 1;
   data->dim = dim;
   if (Q_1d == 0) {
-    CeedInt Q;
-
-    CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q));
-    Q_1d = Q;
+    CeedCallBackend(CeedOperatorGetNumQuadraturePoints(op, &Q_1d));
   }
   data->Q_1d = Q_1d;
+  if (is_at_points) {
+    CeedElemRestriction rstr_points = NULL;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+    CeedCallBackend(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &max_num_points));
+    CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+  }
+  if (is_at_points) use_3d_slices = false;
 
   // Check for restriction only identity operator
   {
@@ -722,6 +768,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   // TODO: Add non-tensor, AtPoints
   code << "// Tensor basis source\n";
   code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-templates.h>\n\n";
+  code << "// AtPoints basis source\n";
+  code << "#include <ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h>\n\n";
   code << "// CodeGen operator source\n";
   code << "#include <ceed/jit-source/cuda/cuda-gen-templates.h>\n\n";
 
@@ -763,7 +811,8 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   code << "// s_G_[in,out]_i: Gradient matrix, shared memory\n";
   code << "// -----------------------------------------------------------------------------\n";
   code << "extern \"C\" __global__ void " << operator_name
-       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W) {\n";
+       << "(CeedInt num_elem, void* ctx, FieldsInt_Cuda indices, Fields_Cuda fields, Fields_Cuda B, Fields_Cuda G, CeedScalar *W, Points_Cuda "
+          "points) {\n";
 
   // Scratch buffers
   for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -793,11 +842,13 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
   // Initialize constants, and matrices B and G
   code << "\n  // Input field constants and basis data\n";
   for (CeedInt i = 0; i < num_input_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_input_fields[i], qf_input_fields[i], Q_1d, true, is_at_points, use_3d_slices));
   }
   code << "\n  // Output field constants and basis data\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
-    CeedCallBackend(CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelFieldData_Cuda_gen(code, data, i, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points, use_3d_slices));
   }
 
   // Loop over all elements
@@ -884,27 +935,29 @@ extern "C" int CeedOperatorBuildKernel_Cuda_gen(CeedOperator op) {
 
     // ---- Restriction
     CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, f, dim, field_rstr_in_buffer, op_input_fields[f], qf_input_fields[f],
-                                                                Q_1d, true, use_3d_slices));
+                                                                Q_1d, true, is_at_points, use_3d_slices));
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, use_3d_slices));
+    CeedCallBackend(
+        CeedOperatorBuildKernelBasis_Cuda_gen(code, data, f, dim, op_input_fields[f], qf_input_fields[f], Q_1d, true, is_at_points, use_3d_slices));
   }
 
   // -- Q function
   CeedCallBackend(CeedOperatorBuildKernelQFunction_Cuda_gen(code, data, dim, num_input_fields, op_input_fields, qf_input_fields, num_output_fields,
-                                                            op_output_fields, qf_output_fields, qfunction_name, Q_1d, use_3d_slices));
+                                                            op_output_fields, qf_output_fields, qfunction_name, Q_1d, is_at_points, use_3d_slices));
 
   // -- Output basis and restriction
   code << "\n    // -- Output field basis action and restrictions\n";
   for (CeedInt i = 0; i < num_output_fields; i++) {
     code << "    // ---- Output field " << i << "\n";
 
     // ---- Basis action
-    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelBasis_Cuda_gen(code, data, i, dim, op_output_fields[i], qf_output_fields[i], Q_1d, false, is_at_points,
+                                                          use_3d_slices));
 
     // ---- Restriction
-    CeedCallBackend(
-        CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false, use_3d_slices));
+    CeedCallBackend(CeedOperatorBuildKernelRestriction_Cuda_gen(code, data, i, dim, NULL, op_output_fields[i], qf_output_fields[i], Q_1d, false,
+                                                                is_at_points, use_3d_slices));
   }
 
   // Close loop and function
diff --git a/backends/cuda-gen/ceed-cuda-gen-operator.c b/backends/cuda-gen/ceed-cuda-gen-operator.c
@@ -8,6 +8,8 @@
 #include <ceed.h>
 #include <ceed/backend.h>
 #include <ceed/jit-source/cuda/cuda-types.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
 #include <stddef.h>
 
 #include "../cuda/ceed-cuda-common.h"
@@ -19,10 +21,14 @@
 // Destroy operator
 //------------------------------------------------------------------------------
 static int CeedOperatorDestroy_Cuda_gen(CeedOperator op) {
+  Ceed                   ceed;
   CeedOperator_Cuda_gen *impl;
 
+  CeedCallBackend(CeedOperatorGetCeed(op, &ceed));
   CeedCallBackend(CeedOperatorGetData(op, &impl));
+  if (impl->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)impl->points.num_per_elem));
   CeedCallBackend(CeedFree(&impl));
+  CeedCallBackend(CeedDestroy(&ceed));
   return CEED_ERROR_SUCCESS;
 }
 
@@ -92,6 +98,7 @@ static size_t dynamicSMemSize(int threads) { return threads * sizeof(CeedScalar)
 // Apply and add to output
 //------------------------------------------------------------------------------
 static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec, CeedVector output_vec, CeedRequest *request) {
+  bool                    is_at_points;
   Ceed                    ceed;
   Ceed_Cuda              *cuda_data;
   CeedInt                 num_elem, num_input_fields, num_output_fields;
@@ -181,25 +188,52 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
     }
   }
 
+  // Point coordinates, if needed
+  CeedCallBackend(CeedOperatorIsAtPoints(op, &is_at_points));
+  if (is_at_points) {
+    // Coords
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+
+    // Points per elem
+    if (num_elem != data->points.num_elem) {
+      CeedInt            *points_per_elem;
+      const CeedInt       num_bytes   = num_elem * sizeof(CeedInt);
+      CeedElemRestriction rstr_points = NULL;
+
+      data->points.num_elem = num_elem;
+      CeedCallBackend(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCallBackend(CeedCalloc(num_elem, &points_per_elem));
+      for (CeedInt e = 0; e < num_elem; e++) {
+        CeedInt num_points_elem;
+
+        CeedCallBackend(CeedElemRestrictionGetNumPointsInElement(rstr_points, e, &num_points_elem));
+        points_per_elem[e] = num_points_elem;
+      }
+      if (data->points.num_per_elem) CeedCallCuda(ceed, cudaFree((void **)data->points.num_per_elem));
+      CeedCallCuda(ceed, cudaMalloc((void **)&data->points.num_per_elem, num_bytes));
+      CeedCallCuda(ceed, cudaMemcpy((void *)data->points.num_per_elem, points_per_elem, num_bytes, cudaMemcpyHostToDevice));
+      CeedCallBackend(CeedElemRestrictionDestroy(&rstr_points));
+      CeedCallBackend(CeedFree(&points_per_elem));
+    }
+  }
+
   // Get context data
   CeedCallBackend(CeedQFunctionGetInnerContextData(qf, CEED_MEM_DEVICE, &qf_data->d_c));
 
   // Apply operator
-  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W};
+  void         *opargs[]  = {(void *)&num_elem, &qf_data->d_c, &data->indices, &data->fields, &data->B, &data->G, &data->W, &data->points};
   const CeedInt dim       = data->dim;
   const CeedInt Q_1d      = data->Q_1d;
   const CeedInt P_1d      = data->max_P_1d;
   const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);
-  int           max_threads_per_block, min_grid_size;
+  int           max_threads_per_block, min_grid_size, grid;
 
   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_threads_per_block, data->op, dynamicSMemSize, 0, 0x10000));
-  int block[3] =
-      {
-          thread_1d,
-          dim < 2 ? 1 : thread_1d,
-          -1,
-      },
-      grid;
+  int block[3] = {thread_1d, dim < 2 ? 1 : thread_1d, -1};
 
   CeedCallBackend(BlockGridCalculate(num_elem, min_grid_size / cuda_data->device_prop.multiProcessorCount, max_threads_per_block,
                                      cuda_data->device_prop.maxThreadsDim[2], cuda_data->device_prop.warpSize, block, &grid));
@@ -236,6 +270,7 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
       if (is_active) vec = output_vec;
       // Check for multiple output modes
       CeedInt index = -1;
+
       for (CeedInt j = 0; j < i; j++) {
         if (vec == output_vecs[j]) {
           index = j;
@@ -249,6 +284,15 @@ static int CeedOperatorApplyAdd_Cuda_gen(CeedOperator op, CeedVector input_vec,
     }
   }
 
+  // Restore point coordinates, if needed
+  if (is_at_points) {
+    CeedVector vec;
+
+    CeedCallBackend(CeedOperatorAtPointsGetPoints(op, NULL, &vec));
+    CeedCallBackend(CeedVectorRestoreArrayRead(vec, &data->points.coords));
+    CeedCallBackend(CeedVectorDestroy(&vec));
+  }
+
   // Restore context data
   CeedCallBackend(CeedQFunctionRestoreInnerContextData(qf, &qf_data->d_c));
   CeedCallBackend(CeedDestroy(&ceed));
diff --git a/backends/cuda-gen/ceed-cuda-gen.h b/backends/cuda-gen/ceed-cuda-gen.h
@@ -22,6 +22,7 @@ typedef struct {
   Fields_Cuda    B;
   Fields_Cuda    G;
   CeedScalar    *W;
+  Points_Cuda    points;
 } CeedOperator_Cuda_gen;
 
 typedef struct {
diff --git a/include/ceed/jit-source/cuda/cuda-types.h b/include/ceed/jit-source/cuda/cuda-types.h
@@ -23,6 +23,12 @@ typedef struct {
   CeedInt *outputs[CEED_CUDA_NUMBER_FIELDS];
 } FieldsInt_Cuda;
 
+typedef struct {
+  CeedInt           num_elem;
+  const CeedInt    *num_per_elem;
+  const CeedScalar *coords;
+} Points_Cuda;
+
 typedef struct {
   CeedInt     t_id_x;
   CeedInt     t_id_y;