From 3f919cbc23e59285304a4fc152ac9ef29c71e01d Mon Sep 17 00:00:00 2001
From: Jeremy L Thompson <jeremy@jeremylt.org>
Date: Tue, 3 Dec 2024 14:27:53 -0700
Subject: [PATCH] op - fix FLOPs estimates AtPoints

---
 include/ceed/backend.h    |   3 +-
 interface/ceed-basis.c    |  90 +++++++++++++++++++--------
 interface/ceed-operator.c |  44 +++++++++++---
 tests/t595-operator.c     | 125 ++++++++++++++++++++++++++++++++++++++
 tests/t595-operator.h     |  17 ++++++
 5 files changed, 246 insertions(+), 33 deletions(-)
 create mode 100644 tests/t595-operator.c
 create mode 100644 tests/t595-operator.h

diff --git a/include/ceed/backend.h b/include/ceed/backend.h
index e27d97cab3..5ec604ee5d 100644
--- a/include/ceed/backend.h
+++ b/include/ceed/backend.h
@@ -324,7 +324,8 @@ CEED_EXTERN int CeedBasisGetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisSetData(CeedBasis basis, void *data);
 CEED_EXTERN int CeedBasisReference(CeedBasis basis);
 CEED_EXTERN int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode, CeedInt *q_comp);
-CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops);
+CEED_EXTERN int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points,
+                                          CeedSize *flops);
 CEED_EXTERN int CeedBasisGetFESpace(CeedBasis basis, CeedFESpace *fe_space);
 CEED_EXTERN int CeedBasisGetTopologyDimension(CeedElemTopology topo, CeedInt *dim);
 CEED_EXTERN int CeedBasisGetTensorContract(CeedBasis basis, CeedTensorContract *contract);
diff --git a/interface/ceed-basis.c b/interface/ceed-basis.c
index 902207f75e..4a4f5fb180 100644
--- a/interface/ceed-basis.c
+++ b/interface/ceed-basis.c
@@ -779,17 +779,21 @@ int CeedBasisGetNumQuadratureComponents(CeedBasis basis, CeedEvalMode eval_mode,
 /**
   @brief Estimate number of FLOPs required to apply `CeedBasis` in `t_mode` and `eval_mode`
 
-  @param[in]  basis     `CeedBasis` to estimate FLOPs for
-  @param[in]  t_mode    Apply basis or transpose
-  @param[in]  eval_mode @ref CeedEvalMode
-  @param[out] flops     Address of variable to hold FLOPs estimate
+  @param[in]  basis        `CeedBasis` to estimate FLOPs for
+  @param[in]  t_mode       Apply basis or transpose
+  @param[in]  eval_mode    @ref CeedEvalMode
+  @param[in]  is_at_points `true` to estimate evaluation at points, `false` to estimate evaluation at quadrature points
+  @param[in]  num_points   Number of points the basis is evaluated at (used for the at-points estimate)
+  @param[out] flops        Address of variable to hold FLOPs estimate
 
   @ref Backend
 **/
-int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, CeedSize *flops) {
+int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEvalMode eval_mode, bool is_at_points, CeedInt num_points,
+                              CeedSize *flops) {
   bool is_tensor;
 
   CeedCall(CeedBasisIsTensor(basis, &is_tensor));
+  CeedCheck(!is_at_points || is_tensor, CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Can only evaluate tensor-product bases at points");
   if (is_tensor) {
     CeedInt dim, num_comp, P_1d, Q_1d;
 
@@ -802,32 +806,68 @@ int CeedBasisGetFlopsEstimate(CeedBasis basis, CeedTransposeMode t_mode, CeedEva
       Q_1d = P_1d;
     }
     CeedInt tensor_flops = 0, pre = num_comp * CeedIntPow(P_1d, dim - 1), post = 1;
+
     for (CeedInt d = 0; d < dim; d++) {
       tensor_flops += 2 * pre * P_1d * post * Q_1d;
       pre /= P_1d;
       post *= Q_1d;
     }
-    switch (eval_mode) {
-      case CEED_EVAL_NONE:
-        *flops = 0;
-        break;
-      case CEED_EVAL_INTERP:
-        *flops = tensor_flops;
-        break;
-      case CEED_EVAL_GRAD:
-        *flops = tensor_flops * 2;
-        break;
-      case CEED_EVAL_DIV:
-      case CEED_EVAL_CURL: {
-        // LCOV_EXCL_START
-        return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
-                         CeedEvalModes[eval_mode]);
-        break;
-        // LCOV_EXCL_STOP
+    if (is_at_points) {
+      CeedInt chebyshev_flops = (Q_1d - 2) * 3 + 1, d_chebyshev_flops = (Q_1d - 2) * 8 + 1;
+      CeedInt point_tensor_flops = 0, pre = CeedIntPow(Q_1d, dim - 1), post = 1;
+
+      for (CeedInt d = 0; d < dim; d++) {
+        point_tensor_flops += 2 * pre * Q_1d * post * 1;
+        pre /= P_1d;
+        post *= Q_1d;
+      }
+
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          *flops = 0;
+          break;
+        case CEED_EVAL_INTERP:
+          *flops = tensor_flops + num_points * (dim * chebyshev_flops + point_tensor_flops + (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0));
+          break;
+        case CEED_EVAL_GRAD:
+          *flops = tensor_flops + num_points * (dim * (d_chebyshev_flops + (dim - 1) * chebyshev_flops + point_tensor_flops +
+                                                       (t_mode == CEED_TRANSPOSE ? CeedIntPow(Q_1d, dim) : 0)));
+          break;
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL: {
+          // LCOV_EXCL_START
+          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
+                           CeedEvalModes[eval_mode]);
+          break;
+          // LCOV_EXCL_STOP
+        }
+        case CEED_EVAL_WEIGHT:
+          *flops = num_points;
+          break;
+      }
+    } else {
+      switch (eval_mode) {
+        case CEED_EVAL_NONE:
+          *flops = 0;
+          break;
+        case CEED_EVAL_INTERP:
+          *flops = tensor_flops;
+          break;
+        case CEED_EVAL_GRAD:
+          *flops = tensor_flops * 2;
+          break;
+        case CEED_EVAL_DIV:
+        case CEED_EVAL_CURL: {
+          // LCOV_EXCL_START
+          return CeedError(CeedBasisReturnCeed(basis), CEED_ERROR_INCOMPATIBLE, "Tensor basis evaluation for %s not supported",
+                           CeedEvalModes[eval_mode]);
+          break;
+          // LCOV_EXCL_STOP
+        }
+        case CEED_EVAL_WEIGHT:
+          *flops = dim * CeedIntPow(Q_1d, dim);
+          break;
       }
-      case CEED_EVAL_WEIGHT:
-        *flops = dim * CeedIntPow(Q_1d, dim);
-        break;
     }
   } else {
     CeedInt dim, num_comp, q_comp, num_nodes, num_qpts;
diff --git a/interface/ceed-operator.c b/interface/ceed-operator.c
index 96fe26a834..cc3493db87 100644
--- a/interface/ceed-operator.c
+++ b/interface/ceed-operator.c
@@ -1097,8 +1097,14 @@ int CeedOperatorAtPointsGetPoints(CeedOperator op, CeedElemRestriction *rstr_poi
   CeedCheck(is_at_points, CeedOperatorReturnCeed(op), CEED_ERROR_MINOR, "Only defined for operator at points");
   CeedCall(CeedOperatorCheckReady(op));
 
-  if (rstr_points) CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points));
-  if (point_coords) CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords));
+  if (rstr_points) {
+    *rstr_points = NULL;
+    CeedCall(CeedElemRestrictionReferenceCopy(op->rstr_points, rstr_points));
+  }
+  if (point_coords) {
+    *point_coords = NULL;
+    CeedCall(CeedVectorReferenceCopy(op->point_coords, point_coords));
+  }
   return CEED_ERROR_SUCCESS;
 }
 
@@ -1693,16 +1699,39 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
       *flops += suboperator_flops;
     }
   } else {
-    CeedInt             num_input_fields, num_output_fields, num_elem = 0;
+    bool                is_at_points;
+    CeedInt             num_input_fields, num_output_fields, num_elem = 0, num_points = 0;
     CeedQFunction       qf;
     CeedQFunctionField *qf_input_fields, *qf_output_fields;
     CeedOperatorField  *op_input_fields, *op_output_fields;
 
+    CeedCall(CeedOperatorIsAtPoints(op, &is_at_points));
+    CeedCall(CeedOperatorGetNumElements(op, &num_elem));
+    if (is_at_points) {
+      CeedMemType         mem_type;
+      CeedElemRestriction rstr_points = NULL;
+
+      CeedCall(CeedOperatorAtPointsGetPoints(op, &rstr_points, NULL));
+      CeedCall(CeedGetPreferredMemType(CeedOperatorReturnCeed(op), &mem_type));
+      if (mem_type == CEED_MEM_DEVICE) {
+        // Device backends pad out to the same number of points per element
+        CeedCall(CeedElemRestrictionGetMaxPointsInElement(rstr_points, &num_points));
+      } else {
+        num_points = 0;
+        for (CeedInt i = 0; i < num_elem; i++) {
+          CeedInt points_in_elem = 0;
+
+          CeedCall(CeedElemRestrictionGetNumPointsInElement(rstr_points, i, &points_in_elem));
+          num_points += points_in_elem;
+        }
+        num_points = num_elem > 0 ? num_points / num_elem + (num_points % num_elem > 0) : 0;  // Rounded-up mean, 0 if no elements
+      }
+      CeedCall(CeedElemRestrictionDestroy(&rstr_points));
+    }
     CeedCall(CeedOperatorGetQFunction(op, &qf));
     CeedCall(CeedQFunctionGetFields(qf, &num_input_fields, &qf_input_fields, &num_output_fields, &qf_output_fields));
     CeedCall(CeedQFunctionDestroy(&qf));
     CeedCall(CeedOperatorGetFields(op, NULL, &op_input_fields, NULL, &op_output_fields));
-    CeedCall(CeedOperatorGetNumElements(op, &num_elem));
 
     // Input FLOPs
     for (CeedInt i = 0; i < num_input_fields; i++) {
@@ -1721,7 +1750,7 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_input_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_input_fields[i], &eval_mode));
-        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_NOTRANSPOSE, eval_mode, is_at_points, num_points, &basis_flops));
         CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
@@ -1733,7 +1762,8 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
       CeedSize      qf_flops;
       CeedQFunction qf;
 
-      CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
+      if (is_at_points) num_qpts = num_points;
+      else CeedCall(CeedOperatorGetNumQuadraturePoints(op, &num_qpts));
       CeedCall(CeedOperatorGetQFunction(op, &qf));
       CeedCall(CeedQFunctionGetFlopsEstimate(qf, &qf_flops));
       CeedCall(CeedQFunctionDestroy(&qf));
@@ -1759,7 +1789,7 @@ int CeedOperatorGetFlopsEstimate(CeedOperator op, CeedSize *flops) {
         *flops += rstr_flops;
         CeedCall(CeedOperatorFieldGetBasis(op_output_fields[i], &basis));
         CeedCall(CeedQFunctionFieldGetEvalMode(qf_output_fields[i], &eval_mode));
-        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, &basis_flops));
+        CeedCall(CeedBasisGetFlopsEstimate(basis, CEED_TRANSPOSE, eval_mode, is_at_points, num_points, &basis_flops));
         CeedCall(CeedBasisDestroy(&basis));
         *flops += basis_flops * num_elem;
       }
diff --git a/tests/t595-operator.c b/tests/t595-operator.c
new file mode 100644
index 0000000000..e874ccb2ba
--- /dev/null
+++ b/tests/t595-operator.c
@@ -0,0 +1,125 @@
+/// @file
+/// Test FLOP estimation for mass matrix operator at points
+/// \test Test FLOP estimation for mass matrix operator at points
+#include "t595-operator.h"
+
+#include <ceed.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int main(int argc, char **argv) {
+  Ceed    ceed;
+  CeedInt num_elem_1d = 3, num_elem = num_elem_1d * num_elem_1d, dim = 2, p = 3, q = 5;
+  CeedInt num_nodes = (num_elem_1d * (p - 1) + 1) * (num_elem_1d * (p - 1) + 1), num_points_per_elem = 4, num_points = num_elem * num_points_per_elem;
+  CeedSize            flop_estimate = 0;
+  CeedVector          x_points, q_data;
+  CeedElemRestriction elem_restriction_x_points, elem_restriction_q_data, elem_restriction_u;
+  CeedBasis           basis_x, basis_u;
+  CeedQFunction       qf_mass;
+  CeedOperator        op_mass;
+  bool                is_at_points;
+
+  CeedInit(argv[1], &ceed);
+
+  // Point reference coordinates, stored per element as num_points_per_elem values for each dimension in turn
+  CeedVectorCreate(ceed, dim * num_points, &x_points);
+  {
+    CeedScalar x_array[dim * num_points];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      for (CeedInt d = 0; d < dim; d++) {
+        x_array[num_points_per_elem * (e * dim + d) + 0] = 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 1] = d == 0 ? -0.25 : 0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 2] = d == 0 ? 0.25 : -0.25;
+        x_array[num_points_per_elem * (e * dim + d) + 3] = 0.25;
+      }
+    }
+    CeedVectorSetArray(x_points, CEED_MEM_HOST, CEED_COPY_VALUES, x_array);
+  }
+  {
+    CeedInt ind_x[num_elem + 1 + num_points];
+
+    for (CeedInt i = 0; i <= num_elem; i++) ind_x[i] = num_elem + 1 + i * num_points_per_elem;  // Per-element offsets into ind_x
+    for (CeedInt i = 0; i < num_points; i++) ind_x[num_elem + 1 + i] = i;                        // Point indices
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, dim, num_points * dim, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x,
+                                      &elem_restriction_x_points);
+    CeedElemRestrictionCreateAtPoints(ceed, num_elem, num_points, 1, num_points, CEED_MEM_HOST, CEED_COPY_VALUES, ind_x, &elem_restriction_q_data);
+  }
+
+  // Q data, one entry per point; values are never set because the operator is not applied in this test
+  CeedVectorCreate(ceed, num_points, &q_data);
+
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, dim, 2, q, CEED_GAUSS, &basis_x);  // NOTE(review): created and destroyed, never attached to a field
+
+  // Cell solution: element-to-node map on a num_elem_1d x num_elem_1d mesh; neighboring elements share their boundary nodes
+  {
+    CeedInt ind_u[num_elem * p * p];
+
+    for (CeedInt e = 0; e < num_elem; e++) {
+      CeedInt elem_xy[2] = {1, 1}, n_d[2] = {0, 0};
+
+      for (CeedInt d = 0; d < dim; d++) n_d[d] = num_elem_1d * (p - 1) + 1;
+      {
+        CeedInt r_e = e;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          elem_xy[d] = r_e % num_elem_1d;
+          r_e /= num_elem_1d;
+        }
+      }
+      CeedInt num_nodes_in_elem = p * p, *elem_nodes = ind_u + e * num_nodes_in_elem;
+
+      for (CeedInt n = 0; n < num_nodes_in_elem; n++) {
+        CeedInt g_node = 0, g_node_stride = 1, r_node = n;
+
+        for (CeedInt d = 0; d < dim; d++) {
+          g_node += (elem_xy[d] * (p - 1) + r_node % p) * g_node_stride;
+          g_node_stride *= n_d[d];
+          r_node /= p;
+        }
+        elem_nodes[n] = g_node;
+      }
+    }
+    CeedElemRestrictionCreate(ceed, num_elem, p * p, 1, 1, num_nodes, CEED_MEM_HOST, CEED_COPY_VALUES, ind_u, &elem_restriction_u);
+  }
+  CeedBasisCreateTensorH1Lagrange(ceed, dim, 1, p, q, CEED_GAUSS, &basis_u);
+
+  // Mass operator at points: v = rho * u
+  CeedQFunctionCreateInterior(ceed, 1, mass, mass_loc, &qf_mass);
+  CeedQFunctionAddInput(qf_mass, "u", 1, CEED_EVAL_INTERP);
+  CeedQFunctionAddInput(qf_mass, "rho", 1, CEED_EVAL_NONE);
+  CeedQFunctionAddOutput(qf_mass, "v", 1, CEED_EVAL_INTERP);
+
+  CeedOperatorCreateAtPoints(ceed, qf_mass, CEED_QFUNCTION_NONE, CEED_QFUNCTION_NONE, &op_mass);
+  CeedOperatorSetField(op_mass, "u", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorSetField(op_mass, "rho", elem_restriction_q_data, CEED_BASIS_NONE, q_data);
+  CeedOperatorSetField(op_mass, "v", elem_restriction_u, basis_u, CEED_VECTOR_ACTIVE);
+  CeedOperatorAtPointsSetPoints(op_mass, elem_restriction_x_points, x_points);
+
+  CeedOperatorIsAtPoints(op_mass, &is_at_points);
+  if (!is_at_points) printf("Error: Operator should be at points\n");
+
+  // Estimate FLOPs, using a user-supplied QFunction estimate of 1 FLOP
+  CeedQFunctionSetUserFlopsEstimate(qf_mass, 1);
+  CeedOperatorGetFlopsEstimate(op_mass, &flop_estimate);
+
+  // Check output; the test reports a failure by printing, the exit code stays 0
+  if (flop_estimate != 16317) {
+    // LCOV_EXCL_START
+    printf("Incorrect FLOP estimate computed, %ld != 16317\n", (long)flop_estimate);
+    // LCOV_EXCL_STOP
+  }
+
+  CeedVectorDestroy(&x_points);
+  CeedVectorDestroy(&q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_x_points);
+  CeedElemRestrictionDestroy(&elem_restriction_q_data);
+  CeedElemRestrictionDestroy(&elem_restriction_u);
+  CeedBasisDestroy(&basis_x);
+  CeedBasisDestroy(&basis_u);
+  CeedQFunctionDestroy(&qf_mass);
+  CeedOperatorDestroy(&op_mass);
+  CeedDestroy(&ceed);
+  return 0;
+}
diff --git a/tests/t595-operator.h b/tests/t595-operator.h
new file mode 100644
index 0000000000..e2dcddf09d
--- /dev/null
+++ b/tests/t595-operator.h
@@ -0,0 +1,17 @@
+// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
+// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
+//
+// SPDX-License-Identifier: BSD-2-Clause
+//
+// This file is part of CEED:  http://github.com/ceed
+
+#include <ceed/types.h>
+
+CEED_QFUNCTION(mass)(void *ctx, const CeedInt Q, const CeedScalar *const *in, CeedScalar *const *out) {
+  // in[0] is the field u and in[1] the coefficient rho; their point-wise product is written to out[0]
+  const CeedScalar *field = in[0], *coeff = in[1];
+  CeedScalar       *result = out[0];
+
+  CeedPragmaSIMD for (CeedInt q = 0; q < Q; q++) result[q] = coeff[q] * field[q];
+  return 0;
+}