From bfd122cc263c728d479c93c95b0702457ab272d2 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 25 Mar 2025 10:05:36 -0700
Subject: [PATCH] Update [ghstack-poisoned]

NOTE(review): this patch was recovered from a whitespace-mangled copy in
which all newlines were collapsed and all text inside angle brackets
(the author email, the #include targets, and the template parameter
lists) was stripped. The stripped spans below were reconstructed from
context — the diffstat counts, the bzl dep on
cpu/util:elementwise_util, and the removed pattern-library call — and
the hunk line counts were re-derived to match the stated "53
insertions(+), 2 deletions(-)". Verify against the upstream PR before
applying.
---
 kernels/portable/cpu/op_expm1.cpp             | 54 ++++++++++++++++++-
 .../kernels/portable/op_registration_util.bzl |  1 +
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/kernels/portable/cpu/op_expm1.cpp b/kernels/portable/cpu/op_expm1.cpp
index f2d49f615b1..c73321b28e8 100644
--- a/kernels/portable/cpu/op_expm1.cpp
+++ b/kernels/portable/cpu/op_expm1.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <cmath>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
@@ -14,9 +15,58 @@
 namespace torch {
 namespace executor {
 namespace native {
 
+// REVIEW: I'm not entirely sure what the best way to implement this
+// namespace is. Some options:
+// 1) All in one file, with or without an `IMPLEMENT_VECTORIZED_MATH_OP` macro.
+// 2) Include in each `unary_ufunc_*` op_foo.cpp, with or without an
+// `IMPLEMENT_VECTORIZED_MATH_OP` macro.
+//
+// I think my preferred option would be (2) with a macro, but I've
+// left the macro out for ease of reading this PoC PR.
+namespace math {
+using std::expm1;
+#ifdef ET_USE_PYTORCH_HEADERS
+template <typename T>
+auto expm1(at::vec::Vectorized<T> x) {
+  // ATen knows to do this conversion because the TensorIterator for this op
+  // (and lots of similar ones in aten/src/ATen/native/UnaryOps.cpp) is created
+  // with build_borrowing_unary_float_op.
+  if constexpr (!executorch::runtime::is_floating_point<T>::value) {
+    return at::vec::convert<float>(x).expm1();
+  } else {
+    return x.expm1();
+  }
+}
+#endif
+} // namespace math
 Tensor& expm1_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
-  return internal::unary_ufunc_realhbbf16_to_floathbf16(
-      std::expm1, ctx, in, out);
+  ET_KERNEL_CHECK(ctx, tensor_is_floating_type(out), InvalidArgument, out);
+
+  // Resize for dynamic shape
+  ET_KERNEL_CHECK_MSG(
+      ctx,
+      resize_tensor(out, in.sizes()) == Error::Ok,
+      InvalidArgument,
+      out,
+      "Failed to resize output tensor.");
+
+  ET_KERNEL_CHECK(
+      ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
+
+  static constexpr const char op_name[] = "expm1.out";
+  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
+    utils::apply_unitensor_elementwise_fn<
+        CTYPE_IN,
+        op_name,
+        utils::SupportedTensorDtypes::FLOATHBF16>(
+        [](auto x) { return math::expm1(x); },
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::REALHBBF16,
+        out);
+  });
+
+  return out;
 }
 } // namespace native
diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
index b56413b92f4..e5f5e211730 100644
--- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
+++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl
@@ -527,6 +527,7 @@ ATEN_OPS = (
         name = "op_expm1",
         deps = [
             "//executorch/kernels/portable/cpu/pattern:pattern",
+            "//executorch/kernels/portable/cpu/util:elementwise_util",
        ],
    ),
    op_target(