From 9fcd8857fb0e00bee0b401f5e25f1fd081fe3c9c Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 18 Mar 2025 17:32:12 -0700
Subject: [PATCH 01/10] Update

[ghstack-poisoned]
---
 kernels/portable/cpu/util/dtype_util.h       | 11 ----------
 kernels/portable/cpu/util/elementwise_util.h | 23 ++++++++++++++++----
 2 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h
index 2bbd5de4577..59b82cdc51b 100644
--- a/kernels/portable/cpu/util/dtype_util.h
+++ b/kernels/portable/cpu/util/dtype_util.h
@@ -86,12 +86,6 @@ load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_bool_or_byte(
 template <typename CTYPE_COMMON, const char* op_name>
 load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_same_as_compute(
     const Tensor& t) {
-  constexpr auto common_scalar_type = CppTypeToScalarType<CTYPE_COMMON>::value;
-  ET_CHECK_MSG(
-      t.scalar_type() == common_scalar_type,
-      "Unhandled dtype %s for %s",
-      ::executorch::runtime::toString(common_scalar_type),
-      op_name);
   return internal::load_and_convert<CTYPE_COMMON, CTYPE_COMMON>;
 }
 
@@ -180,11 +174,6 @@ template <typename CTYPE_COMMON, const char* op_name>
 store_common_to_tensor_fn<CTYPE_COMMON>
 get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) {
   constexpr auto common_scalar_type = CppTypeToScalarType<CTYPE_COMMON>::value;
-  ET_CHECK_MSG(
-      t.scalar_type() == common_scalar_type,
-      "Unhandled dtype %s for %s",
-      ::executorch::runtime::toString(common_scalar_type),
-      op_name);
   return internal::convert_and_store<CTYPE_COMMON, CTYPE_COMMON>;
 }
 
diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h
index f5932069005..021ec42bf27 100644
--- a/kernels/portable/cpu/util/elementwise_util.h
+++ b/kernels/portable/cpu/util/elementwise_util.h
@@ -51,6 +51,13 @@ inline int64_t scalar_to<int64_t>(const Scalar& s) {
 }
 
 namespace internal {
+template <typename Ignore, typename T>
+using ignore_first_yield_second = T;
+
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+using op_call_result =
+    std::invoke_result_t<Op, ignore_first_yield_second<Args, CTYPE_COMMON>...>;
+
 template <
     typename CTYPE_COMMON,
     const char* op_name,
@@ -89,9 +96,16 @@ inline void apply_elementwise_fn(
       inputs.first->element_size(),
   })...};
 
-  const auto store_common_to_out =
-      internal::get_store_common_to_tensor_fn<CTYPE_COMMON, op_name>(
-          out, out_dtypes);
+  // NOTE: the result of compute_fun is not necessarily CTYPE_COMMON!
+  // For example, consider the possibility that compute_fun is a
+  // trigonometric function like acos, the common input type is bool,
+  // and the output type is float -- we would truncate acos(0) ~= 1.57
+  // to just 1. Conveniently, it costs us nothing at runtime to handle
+  // this correctly.
+  const auto store_compute_result_to_out =
+      internal::get_store_common_to_tensor_fn<
+          op_call_result<CTYPE_COMMON, Op, Args...>,
+          op_name>(out, out_dtypes);
   char* const data_out = reinterpret_cast<char*>(out.mutable_data_ptr());
   const auto out_element_size = out.element_size();
 
@@ -114,7 +128,8 @@ inline void apply_elementwise_fn(
                      .data_ptr[indexes[idx + 1] * input_info.element_size]);
           }
           auto result = std::apply(compute_fun, loaded_inputs);
-          store_common_to_out(result, &data_out[indexes[0] * out_element_size]);
+          store_compute_result_to_out(
+              result, &data_out[indexes[0] * out_element_size]);
         }
       });
 }

From 29d6de9d2e63b567e242aea0b7949d7250f12b34 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 18 Mar 2025 17:32:16 -0700
Subject: [PATCH 02/10] Update

[ghstack-poisoned]
---
 .../cpu/pattern/unary_ufunc_realh.cpp         | 19 ++++---
 .../pattern/unary_ufunc_realhb_to_bool.cpp    | 26 +++++-----
 .../unary_ufunc_realhbbf16_to_floathbf16.cpp  | 27 +++++-----
 kernels/portable/cpu/util/dtype_util.cpp      |  4 ++
 kernels/portable/cpu/util/dtype_util.h        | 50 +++++++++++++++++++
 5 files changed, 94 insertions(+), 32 deletions(-)

diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
index 16d847ace31..f7050e8410b 100644
--- a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
+++ b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
@@ -7,7 +7,7 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -36,12 +36,19 @@ Tensor& unary_ufunc_realh(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
-  ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] {
-    apply_unary_map_fn(
+  // TODO: this is broken for dtype_selective_build: op_name (formerly
+  // __func__) is this helper's name, not the operator name.
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "unary_ufunc_realh";
+
+  ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+    utils::apply_unitensor_elementwise_fn<CTYPE, op_name>(
         [fn](const CTYPE val_in) { return static_cast<CTYPE>(fn(val_in)); },
-        in.const_data_ptr<CTYPE>(),
-        out.mutable_data_ptr<CTYPE>(),
-        in.numel());
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::REALH,
+        out,
+        utils::SupportedTensorDtypes::SAME_AS_COMMON);
   });
 
   return out;
diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
index 367137ad02c..5a7332efc07 100644
--- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
+++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
@@ -7,7 +7,7 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -30,25 +30,23 @@ Tensor& unary_ufunc_realhb_to_bool(
       out,
       "Failed to resize output tensor.");
 
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      out.scalar_type() == executorch::aten::ScalarType::Bool,
-      InvalidArgument,
-      out,
-      "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.",
-      static_cast<int8_t>(out.scalar_type()));
-
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
   const auto in_type = in.scalar_type();
 
-  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] {
-    apply_unary_map_fn(
+  // TODO: this is broken for dtype_selective_build: op_name (formerly
+  // __func__) is this helper's name, not the operator name.
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "unary_ufunc_realhb_to_bool";
+  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] {
+    utils::apply_unitensor_elementwise_fn<CTYPE_IN, op_name>(
         [fn](const CTYPE_IN val_in) { return fn(val_in); },
-        in.const_data_ptr<CTYPE_IN>(),
-        out.mutable_data_ptr<bool>(),
-        in.numel());
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::REALHBBF16,
+        out,
+        utils::SupportedTensorDtypes::BOOL);
   });
 
   return out;
diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp
index 602b5b1bfd2..3dcdbd4050c 100644
--- a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp
+++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp
@@ -7,7 +7,7 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -38,17 +38,20 @@ Tensor& unary_ufunc_realhbbf16_to_floathbf16(
   const auto in_type = in.scalar_type();
   const auto out_type = out.scalar_type();
 
-  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] {
-    ET_SWITCH_FLOATHBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] {
-      apply_unary_map_fn(
-          [fn](const CTYPE_IN val_in) {
-            CTYPE_OUT xi = static_cast<CTYPE_OUT>(val_in);
-            return static_cast<CTYPE_OUT>(fn(xi));
-          },
-          in.const_data_ptr<CTYPE_IN>(),
-          out.mutable_data_ptr<CTYPE_OUT>(),
-          in.numel());
-    });
+  // TODO: this is broken for dtype_selective_build: op_name (formerly
+  // __func__) is this helper's name, not the operator name.
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] =
+      "unary_ufunc_realhbbf16_to_floathbf16";
+
+  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] {
+    utils::apply_unitensor_elementwise_fn<CTYPE_IN, op_name>(
+        [fn](const CTYPE_IN val_in) { return fn(val_in); },
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::REALHBBF16,
+        out,
+        utils::SupportedTensorDtypes::FLOATHBF16);
   });
 
   return out;
diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp
index d240b9f83bc..81b1b203a54 100644
--- a/kernels/portable/cpu/util/dtype_util.cpp
+++ b/kernels/portable/cpu/util/dtype_util.cpp
@@ -23,10 +23,14 @@ bool check_tensor_dtype(
       return executorch::runtime::tensor_is_realhbbf16_type(t);
     case SupportedTensorDtypes::REALHBF16:
       return executorch::runtime::tensor_is_realhbf16_type(t);
+    case SupportedTensorDtypes::REALH:
+      return executorch::runtime::tensor_is_realh_type(t);
     case SupportedTensorDtypes::FLOATHBF16:
       return executorch::runtime::tensor_is_floating_type(t);
     case SupportedTensorDtypes::INTB:
       return executorch::runtime::tensor_is_integral_type(t, true);
+    case SupportedTensorDtypes::BOOL:
+      return executorch::runtime::tensor_is_type(t, ScalarType::Bool);
     case SupportedTensorDtypes::BOOL_OR_BYTE:
       return (executorch::runtime::tensor_is_type(
           t, ScalarType::Bool, ScalarType::Byte));
diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h
index 59b82cdc51b..19bee220005 100644
--- a/kernels/portable/cpu/util/dtype_util.h
+++ b/kernels/portable/cpu/util/dtype_util.h
@@ -51,6 +51,15 @@ load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_realhbf16(
   return result;
 }
 
+template <typename CTYPE_COMMON, const char* op_name>
+load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_realh(const Tensor& t) {
+  CTYPE_COMMON (*result)(const void*) = nullptr;
+  ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() {
+    result = internal::load_and_convert<CTYPE_COMMON, TENSOR_CTYPE>;
+  });
+  return result;
+}
+
 template <typename CTYPE_COMMON, const char* op_name>
 load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_floathbf16(
     const Tensor& t) {
@@ -72,6 +81,16 @@ load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_intb(const Tensor& t) {
   return result;
 }
 
+template <typename CTYPE_COMMON, const char* op_name>
+load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_bool(const Tensor& t) {
+  ET_CHECK_MSG(
+      t.scalar_type() == ScalarType::Bool,
+      "Unhandled dtype %s for %s",
+      ::executorch::runtime::toString(t.scalar_type()),
+      op_name);
+  return internal::load_and_convert<CTYPE_COMMON, bool>;
+}
+
 template <typename CTYPE_COMMON, const char* op_name>
 load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn_bool_or_byte(
     const Tensor& t) {
@@ -137,6 +156,16 @@ store_common_to_tensor_fn<CTYPE_COMMON> get_store_common_to_tensor_fn_realhbf16(
   return result;
 }
 
+template <typename CTYPE_COMMON, const char* op_name>
+store_common_to_tensor_fn<CTYPE_COMMON> get_store_common_to_tensor_fn_realh(
+    const Tensor& t) {
+  void (*result)(CTYPE_COMMON, void*) = nullptr;
+  ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() {
+    result = internal::convert_and_store<TENSOR_CTYPE, CTYPE_COMMON>;
+  });
+  return result;
+}
+
 template <typename CTYPE_COMMON, const char* op_name>
 store_common_to_tensor_fn<CTYPE_COMMON>
 get_store_common_to_tensor_fn_floathbf16(const Tensor& t) {
@@ -159,6 +188,17 @@ store_common_to_tensor_fn<CTYPE_COMMON> get_store_common_to_tensor_fn_intb(
   return result;
 }
 
+template <typename CTYPE_COMMON, const char* op_name>
+store_common_to_tensor_fn<CTYPE_COMMON> get_store_common_to_tensor_fn_bool(
+    const Tensor& t) {
+  ET_CHECK_MSG(
+      t.scalar_type() == ScalarType::Bool,
+      "Unhandled dtype %s for %s",
+      ::executorch::runtime::toString(t.scalar_type()),
+      op_name);
+  return internal::convert_and_store<bool, CTYPE_COMMON>;
+}
+
 template <typename CTYPE_COMMON, const char* op_name>
 store_common_to_tensor_fn<CTYPE_COMMON>
 get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) {
@@ -206,8 +246,10 @@ get_store_common_to_tensor_fn_same_as_common(const Tensor& t) {
 enum class SupportedTensorDtypes {
   REALHBBF16,
   REALHBF16,
+  REALH,
   FLOATHBF16,
   INTB,
+  BOOL,
   BOOL_OR_BYTE,
   SAME_AS_COMPUTE,
   SAME_AS_COMMON,
@@ -224,10 +266,14 @@ load_to_common_fn<CTYPE_COMMON> get_load_to_common_fn(
       return get_load_to_common_fn_realhbbf16<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::REALHBF16:
       return get_load_to_common_fn_realhbf16<CTYPE_COMMON, op_name>(t);
+    case SupportedTensorDtypes::REALH:
+      return get_load_to_common_fn_realh<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::FLOATHBF16:
       return get_load_to_common_fn_realhbf16<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::INTB:
       return get_load_to_common_fn_intb<CTYPE_COMMON, op_name>(t);
+    case SupportedTensorDtypes::BOOL:
+      return get_load_to_common_fn_bool<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::BOOL_OR_BYTE:
       return get_load_to_common_fn_bool_or_byte<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::SAME_AS_COMPUTE:
@@ -248,10 +294,14 @@ store_common_to_tensor_fn<CTYPE_COMMON> get_store_common_to_tensor_fn(
       return get_store_common_to_tensor_fn_realhbbf16<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::REALHBF16:
       return get_store_common_to_tensor_fn_realhbf16<CTYPE_COMMON, op_name>(t);
+    case SupportedTensorDtypes::REALH:
+      return get_store_common_to_tensor_fn_realh<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::FLOATHBF16:
       return get_store_common_to_tensor_fn_floathbf16<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::INTB:
       return get_store_common_to_tensor_fn_intb<CTYPE_COMMON, op_name>(t);
+    case SupportedTensorDtypes::BOOL:
+      return get_store_common_to_tensor_fn_bool<CTYPE_COMMON, op_name>(t);
     case SupportedTensorDtypes::BOOL_OR_BYTE:
       return get_store_common_to_tensor_fn_bool_or_byte<CTYPE_COMMON, op_name>(
           t);

From 79b908c798961ff453b71594793586b309641702 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 18 Mar 2025 17:32:21 -0700
Subject: [PATCH 03/10] Update

[ghstack-poisoned]
---
 kernels/portable/cpu/util/elementwise_util.h | 101 +++++++++++++++++--
 1 file changed, 91 insertions(+), 10 deletions(-)

diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h
index 021ec42bf27..aa9883a0b26 100644
--- a/kernels/portable/cpu/util/elementwise_util.h
+++ b/kernels/portable/cpu/util/elementwise_util.h
@@ -60,10 +60,9 @@ using op_call_result =
 
 template <
     typename CTYPE_COMMON,
-    const char* op_name,
     typename Op,
-    typename... Args>
-inline void apply_elementwise_fn(
+  typename... Args>
+inline bool validate_elementwise_fn_inputs(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
     const Tensor& out,
@@ -72,7 +71,6 @@ inline void apply_elementwise_fn(
   static_assert(
       (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
        ...));
-  constexpr auto kNumInputs = sizeof...(inputs);
   constexpr auto compute_type = CppTypeToScalarType<CTYPE_COMMON>::value;
   const auto check_input_dtype = [](auto input, auto compute_type) {
     return internal::check_tensor_dtype(
@@ -82,7 +80,33 @@ inline void apply_elementwise_fn(
       ctx,
       (check_input_dtype(inputs, compute_type) && ...) &&
           internal::check_tensor_dtype(out, out_dtypes, compute_type),
-      InvalidArgument, );
+      InvalidArgument, false);
+
+  return true;
+}
+
+template <
+    typename CTYPE_COMMON,
+    const char* op_name,
+    typename Op,
+    typename... Args>
+inline void apply_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes,
+    Args... inputs) {
+  const bool inputs_valid = validate_elementwise_fn_inputs<CTYPE_COMMON>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      inputs...);
+  if (!inputs_valid) {
+    return;
+  }
+
+  constexpr auto kNumInputs = sizeof...(inputs);
 
   struct InputInfo {
     load_to_common_fn<CTYPE_COMMON> load_to_common;
@@ -135,6 +159,7 @@ inline void apply_elementwise_fn(
 }
 } // namespace internal
 
+/// DEPRECATED: prefer the variant with out_dtypes as a template argument.
 template <typename CTYPE_COMMON, const char* op_name, typename Op>
 inline void apply_unitensor_elementwise_fn(
     const Op& compute_fun,
@@ -147,12 +172,45 @@ inline void apply_unitensor_elementwise_fn(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
+template <typename CTYPE_COMMON, const char* op_name, SupportedTensorDtypes out_dtypes, typename Op>
+inline void apply_unitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMMON, op_name>(
+      compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMMON, const char* op_name, typename Op>
+inline void apply_bitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes) {
+  internal::apply_elementwise_fn<CTYPE_COMMON, op_name>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
 /**
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
  */
-template <typename CTYPE_COMMON, const char* op_name, typename Op>
+template <typename CTYPE_COMMON, const char* op_name, SupportedTensorDtypes out_dtypes, typename Op>
 inline void apply_bitensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
@@ -160,6 +218,29 @@ inline void apply_bitensor_elementwise_fn(
     SupportedTensorDtypes a_dtypes,
     const Tensor& b,
     SupportedTensorDtypes b_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMMON, op_name>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMMON, const char* op_name, typename Op>
+inline void apply_tritensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& c,
+    SupportedTensorDtypes c_dtypes,
     const Tensor& out,
     SupportedTensorDtypes out_dtypes) {
   internal::apply_elementwise_fn<CTYPE_COMMON, op_name>(
@@ -168,7 +249,8 @@ inline void apply_bitensor_elementwise_fn(
       out,
       out_dtypes,
       std::make_pair(&a, a_dtypes),
-      std::make_pair(&b, b_dtypes));
+      std::make_pair(&b, b_dtypes),
+      std::make_pair(&c, c_dtypes));
 }
 
 /**
@@ -191,7 +273,7 @@ inline void apply_bitensor_elementwise_fn(
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMMON, op_name>.
  */
-template <typename CTYPE_COMMON, const char* op_name, typename Op>
+template <typename CTYPE_COMMON, const char* op_name, SupportedTensorDtypes out_dtypes, typename Op>
 inline void apply_tritensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
@@ -201,8 +283,7 @@ inline void apply_tritensor_elementwise_fn(
     SupportedTensorDtypes b_dtypes,
     const Tensor& c,
     SupportedTensorDtypes c_dtypes,
-    const Tensor& out,
-    SupportedTensorDtypes out_dtypes) {
+    const Tensor& out) {
   internal::apply_elementwise_fn<CTYPE_COMMON, op_name>(
       compute_fun,
       ctx,

From 40c1b1be46d2ad91f6ca39fe3008d9b685d3f45b Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 19 Mar 2025 09:58:10 -0700
Subject: [PATCH 04/10] Update

[ghstack-poisoned]
---
 kernels/portable/cpu/util/dtype_util.h | 24 +++++-------------------
 1 file changed, 5 insertions(+), 19 deletions(-)

diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h
index 76579301850..1f0e3403e82 100644
--- a/kernels/portable/cpu/util/dtype_util.h
+++ b/kernels/portable/cpu/util/dtype_util.h
@@ -173,27 +173,13 @@ get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) {
 template <typename CTYPE_COMMON, const char* op_name>
 store_common_to_tensor_fn<CTYPE_COMMON>
 get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) {
-  return internal::convert_and_store<CTYPE_COMMON, CTYPE_COMMON>;
+  // We already validate tensor types earlier in the process, so at this
+  // phase dispatch over our widest SupportedTensorDtypes set (REALHBBF16);
+  // the narrower REALHBF16 dispatch has no case for Bool tensors.
+  return get_store_common_to_tensor_fn_realhbbf16<CTYPE_COMMON, op_name>(t);
 }
 
-template <
-    typename CTYPE_COMMON,
-    const char* op_name,
-    std::enable_if_t<std::is_same_v<CTYPE_COMMON, float>, bool> = true>
-store_common_to_tensor_fn<CTYPE_COMMON>
-get_store_common_to_tensor_fn_same_as_common(const Tensor& t) {
-  void (*result)(CTYPE_COMMON, void*) = nullptr;
-  ET_SWITCH_THREE_TYPES(
-      Float, Half, BFloat16, t.scalar_type(), unused, op_name, CTYPE, [&]() {
-        result = internal::convert_and_store<CTYPE, CTYPE_COMMON>;
-      });
-  return result;
-}
-
-template <
-    typename CTYPE_COMMON,
-    const char* op_name,
-    std::enable_if_t<!std::is_same_v<CTYPE_COMMON, float>, bool> = true>
+template <typename CTYPE_COMMON, const char* op_name>
 store_common_to_tensor_fn<CTYPE_COMMON>
 get_store_common_to_tensor_fn_same_as_common(const Tensor& t) {
   return get_store_common_to_tensor_fn_same_as_compute<CTYPE_COMMON, op_name>(

From 4553283773f0a5fb325a1a3eac54e26835327cbd Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Wed, 26 Mar 2025 16:20:19 -0700
Subject: [PATCH 05/10] Update

[ghstack-poisoned]
---
 kernels/portable/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index 53ad88880d6..edea045d65f 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -66,9 +66,7 @@ gen_operators_lib(
 # Portable kernels support optional parallelization (and, in the
 # future, perhaps other performance features). If support is present,
 # produce an optimized version.
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
-
-if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   add_library(optimized_portable_kernels ${_portable_kernels__srcs})
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)

From 943ab82be758baadbafc4287756fa6dbe904f6df Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 28 Mar 2025 09:51:22 -0700
Subject: [PATCH 06/10] Update

[ghstack-poisoned]
---
 tools/cmake/executorch-config.cmake | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 49aa6cf08af..56c7fa2d7d4 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -149,7 +149,7 @@ endif()
 if(TARGET coremldelegate)
   set_target_properties(
     coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
-                             "coreml_inmemoryfs;coreml_util"
+                              "coreml_inmemoryfs;coreml_util"
   )
 endif()
 
@@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
 endif()
 if(TARGET extension_threadpool)
   target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+  set_target_properties(
+    extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES
+                                    "cpuinfo;pthreadpool"
+  )
 endif()

From f22d039d23db4103dc0697b638813ce2e4bc4a5f Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Fri, 28 Mar 2025 09:51:28 -0700
Subject: [PATCH 07/10] Update

[ghstack-poisoned]
---
 test/CMakeLists.txt               | 13 +++++++
 test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 test/build_optimized_size_test.sh

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3932f1097e1..812e8e4a67a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release")
   target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections")
 endif()
 
+#
+# size_test_all_optimized_ops: binary with optimized ops and no delegate backend
+#
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+add_executable(size_test_all_optimized_ops ${_size_test__srcs})
+target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+target_link_libraries(
+  size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib)
+if(CMAKE_BUILD_TYPE EQUAL "Release")
+  target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections")
+endif()
+endif()
+
 # Print all summary
 executorch_print_configuration_summary()
diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh
new file mode 100644
index 00000000000..181c2ce617d
--- /dev/null
+++ b/test/build_optimized_size_test.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Unlike build_size_test.sh, this script:
+# - does not attempt to disable exceptions and RTTI
+# - as a consequence, is able to build optimized kernels
+# - uses MinSizeRel builds
+# - is not currently intended to run in CI
+# - sets -g to make it easier to use tools like bloaty to investigate size
+
+set -e
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"
+
+cmake_install_executorch_lib() {
+  echo "Installing libexecutorch.a"
+  clean_executorch_install_folders
+  update_tokenizers_git_submodule
+  CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \
+          -DCMAKE_CXX_STANDARD_REQUIRED=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=MinSizeRel \
+          -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DOPTIMIZE_SIZE=ON \
+          -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+          -Bcmake-out .
+  cmake --build cmake-out -j9 --target install --config MinSizeRel
+}
+
+test_cmake_size_test() {
+    CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
+
+    echo "Build size test"
+    cmake --build cmake-out/test -j9 --config MinSizeRel
+
+    echo 'ExecuTorch with no ops binary size, unstripped:'
+    ls -al cmake-out/test/size_test
+
+    echo 'ExecuTorch with portable ops binary size, unstripped:'
+    ls -al cmake-out/test/size_test_all_ops
+
+    echo 'ExecuTorch with optimized ops binary size, unstripped:'
+    ls -al cmake-out/test/size_test_all_optimized_ops
+}
+
+if [[ -z $PYTHON_EXECUTABLE ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+cmake_install_executorch_lib
+test_cmake_size_test

From 7f2bbdb098596d232cd1193ea76422308ab74dc3 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 1 Apr 2025 19:32:41 -0700
Subject: [PATCH 08/10] Update

[ghstack-poisoned]
---
 kernels/portable/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index 53ad88880d6..edea045d65f 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -66,9 +66,7 @@ gen_operators_lib(
 # Portable kernels support optional parallelization (and, in the
 # future, perhaps other performance features). If support is present,
 # produce an optimized version.
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
-
-if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   add_library(optimized_portable_kernels ${_portable_kernels__srcs})
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)

From 9e42e93a96531d2daa189ac66b796aa515e56cd4 Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 1 Apr 2025 19:32:46 -0700
Subject: [PATCH 09/10] Update

[ghstack-poisoned]
---
 tools/cmake/executorch-config.cmake | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 49aa6cf08af..56c7fa2d7d4 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -149,7 +149,7 @@ endif()
 if(TARGET coremldelegate)
   set_target_properties(
     coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
-                             "coreml_inmemoryfs;coreml_util"
+                              "coreml_inmemoryfs;coreml_util"
   )
 endif()
 
@@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
 endif()
 if(TARGET extension_threadpool)
   target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+  set_target_properties(
+    extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES
+                                    "cpuinfo;pthreadpool"
+  )
 endif()

From 96d258eb5cb1601283f636f6dd8a046ae5c9e4ae Mon Sep 17 00:00:00 2001
From: Scott Wolchok <swolchok@meta.com>
Date: Tue, 1 Apr 2025 19:33:00 -0700
Subject: [PATCH 10/10] Update

[ghstack-poisoned]
---
 test/CMakeLists.txt               | 13 +++++++
 test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)
 create mode 100644 test/build_optimized_size_test.sh

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3932f1097e1..812e8e4a67a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release")
   target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections")
 endif()
 
+#
+# size_test_all_optimized_ops: binary with optimized ops and no delegate backend
+#
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+add_executable(size_test_all_optimized_ops ${_size_test__srcs})
+target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+target_link_libraries(
+  size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib)
+if(CMAKE_BUILD_TYPE EQUAL "Release")
+  target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections")
+endif()
+endif()
+
 # Print all summary
 executorch_print_configuration_summary()
diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh
new file mode 100644
index 00000000000..181c2ce617d
--- /dev/null
+++ b/test/build_optimized_size_test.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Unlike build_size_test.sh, this script:
+# - does not attempt to disable exceptions and RTTI
+# - as a consequence, is able to build optimized kernels
+# - uses MinSizeRel builds
+# - is not currently intended to run in CI
+# - sets -g to make it easier to use tools like bloaty to investigate size
+
+set -e
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"
+
+cmake_install_executorch_lib() {
+  echo "Installing libexecutorch.a"
+  clean_executorch_install_folders
+  update_tokenizers_git_submodule
+  CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \
+          -DCMAKE_CXX_STANDARD_REQUIRED=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=MinSizeRel \
+          -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DOPTIMIZE_SIZE=ON \
+          -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+          -Bcmake-out .
+  cmake --build cmake-out -j9 --target install --config MinSizeRel
+}
+
+test_cmake_size_test() {
+    CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
+
+    echo "Build size test"
+    cmake --build cmake-out/test -j9 --config MinSizeRel
+
+    echo 'ExecuTorch with no ops binary size, unstripped:'
+    ls -al cmake-out/test/size_test
+
+    echo 'ExecuTorch with portable ops binary size, unstripped:'
+    ls -al cmake-out/test/size_test_all_ops
+
+    echo 'ExecuTorch with optimized ops binary size, unstripped:'
+    ls -al cmake-out/test/size_test_all_optimized_ops
+}
+
+if [[ -z $PYTHON_EXECUTABLE ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+cmake_install_executorch_lib
+test_cmake_size_test