
Commit f725f63

pssrawat authored and facebook-github-bot committed
Implement _fft_c2r core ATen op (#10208)
Summary: Add _fft_c2r.
Pull Request resolved: #10208
Differential Revision: D73006888
1 parent 605bfa6 · commit f725f63
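For orientation (not part of the commit message): judging from the kernel signature added below, the out-variant this kernel implements takes the complex input, the transformed dims, a normalization mode, and the real length of the last transformed dim, roughly:

    _fft_c2r.out(Tensor self, int[] dim, int normalization, int last_dim_size, *, Tensor(a!) out) -> Tensor(a!)

This is the complex-to-real (inverse real FFT) counterpart of the existing _fft_r2c op; ATen-level calls such as torch.fft.irfft lower to _fft_c2r.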

File tree

9 files changed: +405 −91 lines

kernels/aten/functions.yaml (+2)

@@ -6,6 +6,8 @@
 
 - op: _fake_quantize_per_tensor_affine_cachemask_tensor_qparams.out
 
+- op: _fft_c2r.out
+
 - op: _fft_r2c.out
 
 - op: _linalg_det.result

kernels/optimized/cpu/fft_utils.h — new file (+100)

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/kernel/kernel_includes.h>
#include <pocketfft_hdronly.h>
#include <optional>

namespace torch::executor::native {

// TODO: contents of this anonymous namespace are copy/pasted from
// PyTorch core (aten/src/ATen/native/mkl/SpectralOps.cpp). Small
// portions (the parts that don't depend on Tensor) could be reused;
// refactor to enable that once we can share headers from PyTorch
// core.
namespace {
pocketfft::stride_t stride_from_tensor(const Tensor& t) {
  pocketfft::stride_t stride(t.strides().begin(), t.strides().end());
  for (auto& s : stride) {
    s *= t.element_size();
  }
  return stride;
}

pocketfft::shape_t shape_from_tensor(const Tensor& t) {
  return pocketfft::shape_t(t.sizes().begin(), t.sizes().end());
}

// NOTE: The reinterpret_cast in tensor_cdata is UB, but it's what
// PyTorch core does and I'm not aware of a portable way to do this
// that doesn't rely on UB.
template <typename T>
inline std::complex<T>* tensor_cdata(Tensor& t) {
  return reinterpret_cast<std::complex<T>*>(
      t.data_ptr<executorch::runtime::etensor::complex<T>>());
}

template <typename T>
inline const std::complex<T>* tensor_cdata(const Tensor& t) {
  return reinterpret_cast<const std::complex<T>*>(
      t.const_data_ptr<executorch::runtime::etensor::complex<T>>());
}

// NOTE: in particular this is in ATen/native/SpectralOpsUtils.h and
// could be shared immediately.
enum class fft_norm_mode {
  none, // No normalization
  by_root_n, // Divide by sqrt(signal_size)
  by_n, // Divide by signal_size
};

// NOTE: slight fork from upstream PyTorch to use ET_KERNEL_CHECK;
// upstream with TORCH_CHECK will be fine to use once we have code
// sharing.
template <typename T>
std::optional<T>
compute_fct(KernelRuntimeContext& ctx, int64_t size, int64_t normalization) {
  constexpr auto one = static_cast<T>(1);
  switch (static_cast<fft_norm_mode>(normalization)) {
    case fft_norm_mode::none:
      return one;
    case fft_norm_mode::by_n:
      return one / static_cast<T>(size);
    case fft_norm_mode::by_root_n:
      return one / std::sqrt(static_cast<T>(size));
  }
  ET_KERNEL_CHECK_MSG(
      ctx,
      false,
      InvalidArgument,
      std::nullopt,
      "Unsupported normalization type: %" PRId64,
      normalization);
}

template <typename T>
std::optional<T> compute_fct(
    KernelRuntimeContext& ctx,
    const Tensor& t,
    IntArrayRef dim,
    int64_t normalization) {
  if (static_cast<fft_norm_mode>(normalization) == fft_norm_mode::none) {
    return static_cast<T>(1);
  }
  const auto& sizes = t.sizes();
  int64_t n = 1;
  for (auto idx : dim) {
    n *= sizes[idx];
  }
  return compute_fct<T>(ctx, n, normalization);
}
} // namespace

} // namespace torch::executor::native
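As a side note (not part of the commit), the normalization argument is the integer encoding of fft_norm_mode above, so 0 = none, 1 = by_root_n, 2 = by_n. A minimal standalone sketch of the factor compute_fct produces, with the ExecuTorch runtime types stripped out; the function name example_fct is illustrative only:

#include <cmath>
#include <cstdint>
#include <optional>

// Mirrors the fft_norm_mode mapping in fft_utils.h: returns the scale
// factor applied to the transform output, or nullopt for an unknown mode
// (which the real kernel rejects via ET_KERNEL_CHECK_MSG).
template <typename T>
std::optional<T> example_fct(int64_t size, int64_t normalization) {
  switch (normalization) {
    case 0: return static_cast<T>(1);                        // none
    case 1: return T(1) / std::sqrt(static_cast<T>(size));   // by_root_n
    case 2: return T(1) / static_cast<T>(size);              // by_n
    default: return std::nullopt;
  }
}

// e.g. example_fct<float>(8, 2) == 0.125f: the 1/n factor a "by_n"
// normalized transform over 8 output samples would apply.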

kernels/optimized/cpu/op_fft_c2r.cpp — new file (+91)

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/kernels/optimized/cpu/fft_utils.h>
#include <executorch/runtime/core/span.h>

namespace torch::executor::native {
Tensor& opt_fft_c2r_out(
    KernelRuntimeContext& ctx,
    const Tensor& in,
    IntArrayRef dim,
    int64_t normalization,
    int64_t last_dim_size,
    Tensor& out) {
  auto in_sizes = in.sizes();
  ET_KERNEL_CHECK(ctx, in.dim() <= kTensorDimensionLimit, InvalidArgument, out);

  ET_KERNEL_CHECK(ctx, !dim.empty(), InvalidArgument, out);
  ET_KERNEL_CHECK(ctx, last_dim_size >= 1, InvalidArgument, out);

  // Determine the output size
  std::array<Tensor::SizesType, kTensorDimensionLimit> out_sizes_storage{};
  executorch::runtime::Span<Tensor::SizesType> out_sizes(
      out_sizes_storage.data(), in_sizes.size());
  std::copy(in_sizes.begin(), in_sizes.end(), out_sizes.begin());
  out_sizes[dim.back()] = last_dim_size;

  ET_KERNEL_CHECK(
      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);

  ET_KERNEL_CHECK_MSG(
      ctx,
      in.scalar_type() == executorch::runtime::toComplexType(out.scalar_type()),
      InvalidArgument,
      out,
      "the input type for _fft_c2r must be the Complex type corresponding to the output type");

  for (auto d : dim) {
    ET_KERNEL_CHECK_MSG(
        ctx,
        d >= 0 && d < in.dim(),
        InvalidArgument,
        out,
        "dims must be in bounds (got %" PRId64 ")",
        d);
  }

  ET_KERNEL_CHECK_MSG(
      ctx,
      resize_tensor(
          out,
          executorch::runtime::ArrayRef<Tensor::SizesType>(
              out_sizes.data(), out_sizes.size())) == Error::Ok,
      InvalidArgument,
      out,
      "Failed to resize output tensor (last dim %d).",
      out_sizes[dim.back()]);

  pocketfft::shape_t axes(dim.begin(), dim.end());
  auto out_shape = shape_from_tensor(out);
  // TODO: if arbitrary strides are a possibility, we need to validate
  // these, because pocketfft README says "Strides that lead to
  // multiple accesses of the same memory address are not allowed."
  auto in_stride = stride_from_tensor(in);
  auto out_stride = stride_from_tensor(out);
  // NOTE: as of this writing, upstream PyTorch only supports
  // float/double, so we follow suit.
  ET_SWITCH_FLOAT_TYPES(out.scalar_type(), ctx, "_fft_c2r.out", CTYPE_OUT, [&] {
    auto fct = compute_fct<CTYPE_OUT>(ctx, out, dim, normalization);
    if (!fct) {
      // Check failed, just bail out of the lambda.
      return;
    }
    pocketfft::c2r<CTYPE_OUT>(
        out_shape,
        in_stride,
        out_stride,
        axes,
        false /* forward */,
        tensor_cdata<CTYPE_OUT>(in),
        out.mutable_data_ptr<CTYPE_OUT>(),
        *fct);
  });
  return out;
}
} // namespace torch::executor::native
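A usage note, not part of the commit: since this is the complex-to-real (inverse real FFT) primitive, a one-dimensional transform typically receives a complex input with last_dim_size / 2 + 1 bins along dim.back() and produces a real output of length last_dim_size. A minimal standalone sketch of the "Determine the output size" logic above, using plain vectors in place of ExecuTorch tensors (the helper name c2r_out_sizes is illustrative only):

#include <cstdint>
#include <vector>

// Copy the input sizes and replace the size along the last transformed
// dim with last_dim_size, as opt_fft_c2r_out does before resize_tensor.
std::vector<int64_t> c2r_out_sizes(
    const std::vector<int64_t>& in_sizes,
    const std::vector<int64_t>& dim,
    int64_t last_dim_size) {
  std::vector<int64_t> out_sizes = in_sizes;
  out_sizes[dim.back()] = last_dim_size;
  return out_sizes;
}

// Example: a complex input of shape {4, 9} with dim = {1} and
// last_dim_size = 16 (since 16 / 2 + 1 == 9 input bins) resizes the
// real output to {4, 16}.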

kernels/optimized/cpu/op_fft_r2c.cpp (+1 −90)

@@ -6,99 +6,10 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/kernels/optimized/cpu/fft_utils.h>
 #include <executorch/runtime/core/span.h>
-#include <executorch/runtime/kernel/kernel_includes.h>
-
-#include <pocketfft_hdronly.h>
-
-#include <optional>
 
 namespace torch::executor::native {
-
[... remaining deleted lines: the anonymous-namespace pocketfft helpers (stride_from_tensor, shape_from_tensor, tensor_cdata, fft_norm_mode, compute_fct), moved verbatim into the new fft_utils.h shown above ...]
-
 Tensor& opt_fft_r2c_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,

kernels/optimized/cpu/targets.bzl (+17 −1)

@@ -35,13 +35,21 @@ _OPTIMIZED_ATEN_OPS = (
         ],
     ),
     op_target(name = "op_exp"),
+    op_target(
+        name = "op_fft_c2r",
+        compiler_flags = [] if runtime.is_oss else [
+            "-Wno-global-constructors",
+            "-Wno-shadow",
+        ],
+        deps = [":fft_utils"],
+    ),
     op_target(
         name = "op_fft_r2c",
        compiler_flags = [] if runtime.is_oss else [
            "-Wno-global-constructors",
            "-Wno-shadow",
        ],
-        deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"],
+        deps = [":fft_utils"],
     ),
     op_target(name = "op_sigmoid"),
     op_target(

@@ -143,6 +151,14 @@ def define_common_targets():
         exported_deps = ["//executorch/runtime/core:core"],
     )
 
+    runtime.cxx_library(
+        name = "fft_utils",
+        srcs = [],
+        exported_headers = ["fft_utils.h"],
+        visibility = ["//executorch/kernels/optimized/cpu/..."],
+        exported_deps = [] if runtime.is_oss else ["fbsource//third-party/pocket_fft:pocketfft"],
+    )
+
     runtime.cxx_library(
         name = "binary_ops",
         exported_headers = ["binary_ops.h"],

kernels/optimized/optimized.yaml (+5)

@@ -2,6 +2,11 @@
 #
 # This yaml file contains operators that have optimized kernels available.
 
+- op: _fft_c2r.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_fft_c2r_out
+
 - op: _fft_r2c.out
   kernels:
     - arg_meta: null

kernels/test/CMakeLists.txt (+1)

@@ -276,6 +276,7 @@ set(_optimized_kernels_test_sources
     "op_div_test.cpp"
     "op_elu_test.cpp"
     "op_exp_test.cpp"
+    "op_fft_c2r_test.cpp"
     "op_fft_r2c_test.cpp"
     "op_gelu_test.cpp"
     "op_le_test.cpp"
