From 31a49e0212d88b6b16979f926f437beed9cde1dc Mon Sep 17 00:00:00 2001
From: Scott Wolchok 
Date: Tue, 18 Mar 2025 17:32:07 -0700
Subject: [PATCH 01/27] Update [ghstack-poisoned]

---
 CMakeLists.txt                                 | 9 +++++++--
 kernels/optimized/CMakeLists.txt               | 1 +
 kernels/portable/CMakeLists.txt                | 2 ++
 runtime/core/portable_type/c10/c10/targets.bzl | 3 ++-
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a53b8a6e2a..8fe08a2c25e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # Find the PyTorch lib here to make it available to all
+  # sub-directories. Find it before including portable so that
+  # optimized_portable_kernels can use it.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
 
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 7cba9e91fe5..693be68c35e 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index e27ba12ac0d..53ad88880d6 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -73,6 +73,8 @@ if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
+  target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
+  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
   install(
     TARGETS optimized_portable_kernels
     DESTINATION lib
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index a727cb19ac1..b8883c75bfe 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -73,6 +73,7 @@ def define_common_targets():
                 # -Wmacro-redefined, and we only care about getting
                 # reasonable vectorization and Sleef support.
                 "-DCPU_CAPABILITY_AVX2",
+                "-DET_USE_PYTORCH_HEADERS",
                 "-DHAVE_AVX2_CPU_DEFINITION",
                 "-DSTANDALONE_TORCH_HEADER",
             ] + get_sleef_preprocessor_flags(),
@@ -87,5 +88,5 @@ def define_common_targets():
             # linker failure.
"ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DSTANDALONE_TORCH_HEADER"], + }) + ["-DET_USE_PYTORCH_HEADERS", "-DSTANDALONE_TORCH_HEADER"], ) From 9fcd8857fb0e00bee0b401f5e25f1fd081fe3c9c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:12 -0700 Subject: [PATCH 02/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 11 ---------- kernels/portable/cpu/util/elementwise_util.h | 23 ++++++++++++++++---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 2bbd5de4577..59b82cdc51b 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -86,12 +86,6 @@ load_to_common_fn get_load_to_common_fn_bool_or_byte( template load_to_common_fn get_load_to_common_fn_same_as_compute( const Tensor& t) { - constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::load_and_convert; } @@ -180,11 +174,6 @@ template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::convert_and_store; } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index f5932069005..021ec42bf27 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -51,6 +51,13 @@ inline int64_t scalar_to(const Scalar& s) { } namespace internal { +template +using ignore_first_yield_second = T; + +template +using op_call_result = + std::invoke_result_t...>; + template < typename CTYPE_COMMON, const char* op_name, @@ -89,9 +96,16 @@ inline void apply_elementwise_fn( inputs.first->element_size(), })...}; - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); + // NOTE: the result of compute_fun is not necessarily CTYPE_COMMON! + // For example, consider the possibility that compute_fun is a + // trigonometric function like acos, the common input type is bool, + // and the output type is float -- we would truncate acos(0) ~= 1.67 + // to just 1. Conveniently, it costs us nothing at runtime to handle + // this correctly. 
+  const auto store_compute_result_to_out =
+      internal::get_store_common_to_tensor_fn<
+          op_call_result<CTYPE_COMMON, Op, Args...>,
+          op_name>(out, out_dtypes);
   char* const data_out = reinterpret_cast<char*>(out.mutable_data_ptr());
   const auto out_element_size = out.element_size();
 
@@ -114,7 +128,8 @@ inline void apply_elementwise_fn(
               .data_ptr[indexes[idx + 1] * input_info.element_size]);
         }
         auto result = std::apply(compute_fun, loaded_inputs);
-        store_common_to_out(result, &data_out[indexes[0] * out_element_size]);
+        store_compute_result_to_out(
+            result, &data_out[indexes[0] * out_element_size]);
       }
     });
 }

From 29d6de9d2e63b567e242aea0b7949d7250f12b34 Mon Sep 17 00:00:00 2001
From: Scott Wolchok 
Date: Tue, 18 Mar 2025 17:32:16 -0700
Subject: [PATCH 03/27] Update [ghstack-poisoned]

---
 .../cpu/pattern/unary_ufunc_realh.cpp         | 19 ++++---
 .../pattern/unary_ufunc_realhb_to_bool.cpp    | 26 +++++-----
 .../unary_ufunc_realhbbf16_to_floathbf16.cpp  | 27 +++++-----
 kernels/portable/cpu/util/dtype_util.cpp      |  4 ++
 kernels/portable/cpu/util/dtype_util.h        | 50 +++++++++++++++++++
 5 files changed, 94 insertions(+), 32 deletions(-)

diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
index 16d847ace31..f7050e8410b 100644
--- a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
+++ b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
@@ -7,7 +7,7 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -36,12 +36,19 @@ Tensor& unary_ufunc_realh(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
-  ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] {
-    apply_unary_map_fn(
+  // TODO: this is broken for dtype_selective_build: this was
+  // __func__, which isn't the operator name.
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "unary_ufunc_realh";
+
+  ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+    utils::apply_unitensor_elementwise_fn<CTYPE, op_name>(
         [fn](const CTYPE val_in) { return static_cast<CTYPE>(fn(val_in)); },
-        in.const_data_ptr<CTYPE>(),
-        out.mutable_data_ptr<CTYPE>(),
-        in.numel());
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::REALH,
+        out,
+        utils::SupportedTensorDtypes::SAME_AS_COMMON);
   });
 
   return out;
diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
index 367137ad02c..5a7332efc07 100644
--- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
+++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
@@ -7,7 +7,7 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -30,25 +30,23 @@ Tensor& unary_ufunc_realhb_to_bool(
       out,
       "Failed to resize output tensor.");
 
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      out.scalar_type() == executorch::aten::ScalarType::Bool,
-      InvalidArgument,
-      out,
-      "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.",
-      static_cast<int8_t>(out.scalar_type()));
-
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
   const auto in_type = in.scalar_type();
 
-  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] {
-    apply_unary_map_fn(
+  // TODO: this is broken for dtype_selective_build: this was
+  // __func__, which isn't the operator name.
+ // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "unary_ufunc_realhb_to_bool"; + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] { + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); + ctx, + in, + utils::SupportedTensorDtypes::REALHBBF16, + out, + utils::SupportedTensorDtypes::BOOL); }); return out; diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp index 602b5b1bfd2..3dcdbd4050c 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp @@ -7,7 +7,7 @@ */ #include -#include +#include #include namespace torch { @@ -38,17 +38,20 @@ Tensor& unary_ufunc_realhbbf16_to_floathbf16( const auto in_type = in.scalar_type(); const auto out_type = out.scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { - ET_SWITCH_FLOATHBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] { - apply_unary_map_fn( - [fn](const CTYPE_IN val_in) { - CTYPE_OUT xi = static_cast(val_in); - return static_cast(fn(xi)); - }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); - }); + // TODO: this is broken for dtype_selective_build: this was + // __func__, which isn't the operator name. + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = + "unary_ufunc_realhbbf16_to_floathbf16"; + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] { + utils::apply_unitensor_elementwise_fn( + [fn](const CTYPE_IN val_in) { return fn(val_in); }, + ctx, + in, + utils::SupportedTensorDtypes::REALHBBF16, + out, + utils::SupportedTensorDtypes::FLOATHBF16); }); return out; diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index d240b9f83bc..81b1b203a54 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -23,10 +23,14 @@ bool check_tensor_dtype( return executorch::runtime::tensor_is_realhbbf16_type(t); case SupportedTensorDtypes::REALHBF16: return executorch::runtime::tensor_is_realhbf16_type(t); + case SupportedTensorDtypes::REALH: + return executorch::runtime::tensor_is_realh_type(t); case SupportedTensorDtypes::FLOATHBF16: return executorch::runtime::tensor_is_floating_type(t); case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); + case SupportedTensorDtypes::BOOL: + return executorch::runtime::tensor_is_type(t, ScalarType::Bool); case SupportedTensorDtypes::BOOL_OR_BYTE: return (executorch::runtime::tensor_is_type( t, ScalarType::Bool, ScalarType::Byte)); diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 59b82cdc51b..19bee220005 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -51,6 +51,15 @@ load_to_common_fn get_load_to_common_fn_realhbf16( return result; } +template +load_to_common_fn get_load_to_common_fn_realh(const Tensor& t) { + CTYPE_COMMON (*result)(const void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::load_and_convert; + }); + return result; +} + template load_to_common_fn get_load_to_common_fn_floathbf16( const Tensor& t) { @@ -72,6 +81,16 @@ load_to_common_fn 
get_load_to_common_fn_intb(const Tensor& t) { return result; } +template +load_to_common_fn get_load_to_common_fn_bool(const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::load_and_convert; +} + template load_to_common_fn get_load_to_common_fn_bool_or_byte( const Tensor& t) { @@ -137,6 +156,16 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_realhbf16( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_realh( + const Tensor& t) { + void (*result)(CTYPE_COMMON, void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::convert_and_store; + }); + return result; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_floathbf16(const Tensor& t) { @@ -159,6 +188,17 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_intb( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_bool( + const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::convert_and_store; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { @@ -206,8 +246,10 @@ get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { enum class SupportedTensorDtypes { REALHBBF16, REALHBF16, + REALH, FLOATHBF16, INTB, + BOOL, BOOL_OR_BYTE, SAME_AS_COMPUTE, SAME_AS_COMMON, @@ -224,10 +266,14 @@ load_to_common_fn get_load_to_common_fn( return get_load_to_common_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_load_to_common_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_load_to_common_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_load_to_common_fn_realhbf16(t); case SupportedTensorDtypes::INTB: return get_load_to_common_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_load_to_common_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_load_to_common_fn_bool_or_byte(t); case SupportedTensorDtypes::SAME_AS_COMPUTE: @@ -248,10 +294,14 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn( return get_store_common_to_tensor_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_store_common_to_tensor_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_store_common_to_tensor_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_store_common_to_tensor_fn_floathbf16(t); case SupportedTensorDtypes::INTB: return get_store_common_to_tensor_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_store_common_to_tensor_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_store_common_to_tensor_fn_bool_or_byte( t); From 79b908c798961ff453b71594793586b309641702 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:21 -0700 Subject: [PATCH 04/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 101 +++++++++++++++++-- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 021ec42bf27..aa9883a0b26 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -60,10 +60,9 @@ using op_call_result = template < typename CTYPE_COMMON, - const char* op_name, 
typename Op, - typename... Args> -inline void apply_elementwise_fn( + typename... Args> +inline bool validate_elementwise_fn_inputs( const Op& compute_fun, KernelRuntimeContext& ctx, const Tensor& out, @@ -72,7 +71,6 @@ inline void apply_elementwise_fn( static_assert( (std::is_same_v> && ...)); - constexpr auto kNumInputs = sizeof...(inputs); constexpr auto compute_type = CppTypeToScalarType::value; const auto check_input_dtype = [](auto input, auto compute_type) { return internal::check_tensor_dtype( @@ -82,7 +80,33 @@ inline void apply_elementwise_fn( ctx, (check_input_dtype(inputs, compute_type) && ...) && internal::check_tensor_dtype(out, out_dtypes, compute_type), - InvalidArgument, ); + InvalidArgument, false); + + return true; +} + +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + SupportedTensorDtypes out_dtypes, + Args... inputs) { + const bool inputs_valid = validate_elementwise_fn_inputs( + compute_fun, + ctx, + out, + out_dtypes, + inputs...); + if (!inputs_valid) { + return; + } + + constexpr auto kNumInputs = sizeof...(inputs); struct InputInfo { load_to_common_fn load_to_common; @@ -135,6 +159,7 @@ inline void apply_elementwise_fn( } } // namespace internal +/// DEPRECATED: prefer the variant with out_dtypes in the template argument. template inline void apply_unitensor_elementwise_fn( const Op& compute_fun, @@ -147,12 +172,45 @@ inline void apply_unitensor_elementwise_fn( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } +template +inline void apply_unitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& out) { + internal::apply_elementwise_fn( + compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); +} + +/** + * DEPRECATED: prefer the variant with out_dtypes in the template argument list. + */ +template +inline void apply_bitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& b, + SupportedTensorDtypes b_dtypes, + const Tensor& out, + SupportedTensorDtypes out_dtypes) { + internal::apply_elementwise_fn( + compute_fun, + ctx, + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); +} + /** * Useful for bi-tensor elementwise operators. For each element of the inputs, * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. */ -template +template inline void apply_bitensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -160,6 +218,29 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes a_dtypes, const Tensor& b, SupportedTensorDtypes b_dtypes, + const Tensor& out) { + internal::apply_elementwise_fn( + compute_fun, + ctx, + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); +} + +/** + * DEPRECATED: prefer the variant with out_dtypes in the template argument list. 
+ */ +template +inline void apply_tritensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& b, + SupportedTensorDtypes b_dtypes, + const Tensor& c, + SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { internal::apply_elementwise_fn( @@ -168,7 +249,8 @@ inline void apply_bitensor_elementwise_fn( out, out_dtypes, std::make_pair(&a, a_dtypes), - std::make_pair(&b, b_dtypes)); + std::make_pair(&b, b_dtypes), + std::make_pair(&c, c_dtypes)); } /** @@ -191,7 +273,7 @@ inline void apply_bitensor_elementwise_fn( * static constexpr const char op_name[] = "my_op"; * apply_ternary_elementwise_fn. */ -template +template inline void apply_tritensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -201,8 +283,7 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& c, SupportedTensorDtypes c_dtypes, - const Tensor& out, - SupportedTensorDtypes out_dtypes) { + const Tensor& out) { internal::apply_elementwise_fn( compute_fun, ctx, From fd62a079438de94ac23de004c2a562e982d8689c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:25 -0700 Subject: [PATCH 05/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_mul.cpp | 8 +- kernels/portable/cpu/util/dtype_util.h | 22 +++ kernels/portable/cpu/util/elementwise_util.h | 143 +++++++++++++++---- 3 files changed, 146 insertions(+), 27 deletions(-) diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 1ee73d342ca..114e60ff171 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -52,7 +52,10 @@ Tensor& mul_out( out); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return val_a * val_b; }, @@ -61,8 +64,7 @@ Tensor& mul_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 19bee220005..9a75432e184 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -324,6 +324,28 @@ bool check_tensor_dtype( SupportedTensorDtypes dtypes, const ScalarType compute_type); +/// Return the one output type we are willing to emit specialized code +/// to handle, given a compute type of CTYPE_COMMON and supported +/// output types of out_dtypes. 
+template +inline constexpr ScalarType specialized_output_scalar_type( + SupportedTensorDtypes out_dtypes) { + switch (out_dtypes) { + case SupportedTensorDtypes::BOOL: + return ScalarType::Bool; + case SupportedTensorDtypes::BOOL_OR_BYTE: + return ScalarType::Bool; + case SupportedTensorDtypes::REALHBBF16: + case SupportedTensorDtypes::REALHBF16: + case SupportedTensorDtypes::REALH: + case SupportedTensorDtypes::FLOATHBF16: + case SupportedTensorDtypes::INTB: + case SupportedTensorDtypes::SAME_AS_COMPUTE: + case SupportedTensorDtypes::SAME_AS_COMMON: + return CppTypeToScalarType::value; + } +} + } // namespace internal } // namespace utils } // namespace native diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index aa9883a0b26..14bf9293957 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -60,8 +60,43 @@ using op_call_result = template < typename CTYPE_COMMON, + typename CTYPE_OUT, typename Op, - typename... Args> + typename... Args> +inline void dtype_specialized_elementwise_fn_impl( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + Args... inputs) { + constexpr auto kNumInputs = sizeof...(inputs); + ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMMON)) && ...)); + + std::array inputs_data_ptrs = { + inputs.first->template const_data_ptr()...}; + + CTYPE_OUT* const data_out = out.mutable_data_ptr(); + + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + const auto range = + BroadcastIndexesRange(out, (*inputs.first)...); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto& indexes = *begin_it; + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + loaded_inputs[idx] = inputs_data_ptrs[idx][indexes[idx + 1]]; + } + data_out[indexes[0]] = std::apply(compute_fun, loaded_inputs); + } + }); +} + +template inline bool validate_elementwise_fn_inputs( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -80,7 +115,8 @@ inline bool validate_elementwise_fn_inputs( ctx, (check_input_dtype(inputs, compute_type) && ...) && internal::check_tensor_dtype(out, out_dtypes, compute_type), - InvalidArgument, false); + InvalidArgument, + false); return true; } @@ -90,22 +126,12 @@ template < const char* op_name, typename Op, typename... Args> -inline void apply_elementwise_fn( +inline void apply_elementwise_fn_generic_impl( const Op& compute_fun, KernelRuntimeContext& ctx, const Tensor& out, SupportedTensorDtypes out_dtypes, Args... inputs) { - const bool inputs_valid = validate_elementwise_fn_inputs( - compute_fun, - ctx, - out, - out_dtypes, - inputs...); - if (!inputs_valid) { - return; - } - constexpr auto kNumInputs = sizeof...(inputs); struct InputInfo { @@ -157,6 +183,65 @@ inline void apply_elementwise_fn( } }); } + +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn_runtime_out_dtypes( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + SupportedTensorDtypes out_dtypes, + Args... 
inputs) { + const bool inputs_valid = validate_elementwise_fn_inputs( + compute_fun, ctx, out, out_dtypes, inputs...); + if (!inputs_valid) { + return; + } + + apply_elementwise_fn_generic_impl( + compute_fun, ctx, out, out_dtypes, inputs...); +} + +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op, + typename... Args> +inline void apply_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + Args... inputs) { + const bool inputs_valid = validate_elementwise_fn_inputs( + compute_fun, ctx, out, out_dtypes, inputs...); + if (!inputs_valid) { + return; + } + + constexpr auto kNumInputs = sizeof...(inputs); + + constexpr auto compute_type = CppTypeToScalarType::value; + const bool all_inputs_compute_dtype = + ((inputs.first->scalar_type() == compute_type) && ...); + + constexpr ScalarType out_specialized_scalar_type = + specialized_output_scalar_type(out_dtypes); + if (all_inputs_compute_dtype && + out.scalar_type() == out_specialized_scalar_type) { + using CTYPE_OUT = + typename ScalarTypeToCppType::type; + dtype_specialized_elementwise_fn_impl( + compute_fun, ctx, out, inputs...); + return; + } + + apply_elementwise_fn_generic_impl( + compute_fun, ctx, out, out_dtypes, inputs...); +} } // namespace internal /// DEPRECATED: prefer the variant with out_dtypes in the template argument. @@ -168,18 +253,22 @@ inline void apply_unitensor_elementwise_fn( SupportedTensorDtypes a_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn_runtime_out_dtypes( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } -template +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op> inline void apply_unitensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, const Tensor& a, SupportedTensorDtypes a_dtypes, const Tensor& out) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } @@ -196,7 +285,7 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn_runtime_out_dtypes( compute_fun, ctx, out, @@ -210,7 +299,11 @@ inline void apply_bitensor_elementwise_fn( * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. */ -template +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op> inline void apply_bitensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -219,11 +312,10 @@ inline void apply_bitensor_elementwise_fn( const Tensor& b, SupportedTensorDtypes b_dtypes, const Tensor& out) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn( compute_fun, ctx, out, - out_dtypes, std::make_pair(&a, a_dtypes), std::make_pair(&b, b_dtypes)); } @@ -243,7 +335,7 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn_runtime_out_dtypes( compute_fun, ctx, out, @@ -273,7 +365,11 @@ inline void apply_tritensor_elementwise_fn( * static constexpr const char op_name[] = "my_op"; * apply_ternary_elementwise_fn. 
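 *
 * A minimal usage sketch (ours; the op name and lambda are illustrative
 * assumptions, not code from this patch):
 *
 *   static constexpr const char op_name[] = "my_op";
 *   utils::apply_tritensor_elementwise_fn<
 *       CTYPE_COMPUTE,
 *       op_name,
 *       utils::SupportedTensorDtypes::REALHBBF16>(
 *       [](const auto a, const auto b, const auto c) { return a * b + c; },
 *       ctx,
 *       a, utils::SupportedTensorDtypes::REALHBBF16,
 *       b, utils::SupportedTensorDtypes::REALHBBF16,
 *       c, utils::SupportedTensorDtypes::REALHBBF16,
 *       out);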
*/ -template +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op> inline void apply_tritensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -284,11 +380,10 @@ inline void apply_tritensor_elementwise_fn( const Tensor& c, SupportedTensorDtypes c_dtypes, const Tensor& out) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn( compute_fun, ctx, out, - out_dtypes, std::make_pair(&a, a_dtypes), std::make_pair(&b, b_dtypes), std::make_pair(&c, c_dtypes)); From 40c1b1be46d2ad91f6ca39fe3008d9b685d3f45b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 09:58:10 -0700 Subject: [PATCH 06/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 76579301850..1f0e3403e82 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -173,27 +173,13 @@ get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { - return internal::convert_and_store; + // We already validate tensor types earlier in the process, so at + // this phase, treat same_as_compute the same as our widest + // SupportedTensorDtypes set. + return get_store_common_to_tensor_fn_realhbf16(t); } -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> -store_common_to_tensor_fn -get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { - void (*result)(CTYPE_COMMON, void*) = nullptr; - ET_SWITCH_THREE_TYPES( - Float, Half, BFloat16, t.scalar_type(), unused, op_name, CTYPE, [&]() { - result = internal::convert_and_store; - }); - return result; -} - -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> +template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { return get_store_common_to_tensor_fn_same_as_compute( From 8782a900668b190307a16a97e485b0c350b96e8f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 17:36:12 -0700 Subject: [PATCH 07/27] Update [ghstack-poisoned] --- kernels/optimized/cpu/binary_ops.h | 25 +---------------- kernels/optimized/cpu/targets.bzl | 5 +++- .../cpu/util/broadcast_indexes_range.h | 27 ++++++++++++++++++- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/kernels/optimized/cpu/binary_ops.h b/kernels/optimized/cpu/binary_ops.h index f59c9fd5d76..dd4425e4ce6 100644 --- a/kernels/optimized/cpu/binary_ops.h +++ b/kernels/optimized/cpu/binary_ops.h @@ -10,34 +10,11 @@ #include #include +#include #include namespace torch { namespace executor { -namespace internal { -// NOTE: we bake ArrayRef iterators being pointers into the return -// type here because we assume that iterators are portable across -// ArrayRef copies. 
inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> arr) {
-  return std::find_if(
-      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
-}
-
-inline bool sizes_match_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> lhs,
-    ArrayRef<Tensor::SizesType> rhs) {
-  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
-  auto lhs_end = lhs.end();
-
-  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
-  auto rhs_end = rhs.end();
-
-  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
-      std::equal(lhs_begin, lhs_end, rhs_begin);
-}
-} // namespace internal
-
 enum class ElementwiseOptimizedPath {
   kNone,
   kTreatAs1d,
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index b868a5901fd..edf01ca110c 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -130,7 +130,10 @@ def define_common_targets():
         srcs = [],
         exported_headers = ["op_add_sub_impl.h"],
         visibility = ["//executorch/kernels/optimized/cpu/..."],
-        exported_deps = ["//executorch/runtime/core:core"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+            "//executorch/kernels/portable/cpu/util:broadcast_indexes_range",
+        ],
     )
 
     runtime.cxx_library(
diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
index aaf7207d0c9..4d3ba46b51b 100644
--- a/kernels/portable/cpu/util/broadcast_indexes_range.h
+++ b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -21,6 +21,28 @@
 namespace torch::executor {
 
 namespace internal {
+// NOTE: we bake ArrayRef iterators being pointers into the return
+// type here because we assume that iterators are portable across
+// ArrayRef copies.
+inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> arr) {
+  return std::find_if(
+      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
+}
+
+inline bool sizes_match_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> lhs,
+    ArrayRef<Tensor::SizesType> rhs) {
+  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
+  auto lhs_end = lhs.end();
+
+  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
+  auto rhs_end = rhs.end();
+
+  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
+      std::equal(lhs_begin, lhs_end, rhs_begin);
+}
+
 template <std::size_t kNumInputs>
 class BroadcastIndexesIterator {
  public:
@@ -35,7 +57,10 @@ class BroadcastIndexesIterator {
   template <typename... Args>
   explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
       : output_dim_or_zero_if_no_broadcasting_(
-            ((args.sizes() == output.sizes()) && ...) ? 0 : output.dim()),
+            (sizes_match_ignoring_leading_1s(args.sizes(), output.sizes()) &&
+             ...)
+                ?
0 + : output.dim()), output_shape_(output.sizes()) { static_assert( sizeof...(args) == kNumInputs && (std::is_same_v && ...), From 75f8970ef85c122b82467b2c6f89d090d8e5c0a8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 17:36:18 -0700 Subject: [PATCH 08/27] Update [ghstack-poisoned] --- .lintrunner.toml | 2 + kernels/portable/cpu/op_mul.cpp | 4 +- kernels/portable/cpu/pattern/pattern.h | 15 ++- kernels/portable/cpu/util/elementwise_util.h | 96 ++++++++++++++++++- kernels/portable/cpu/util/targets.bzl | 1 + .../core/portable_type/c10/c10/targets.bzl | 5 +- 6 files changed, 109 insertions(+), 14 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 842b4b1c6cb..1f3d128dd60 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -264,6 +264,8 @@ exclude_patterns = [ 'examples/**', 'exir/verification/bindings.cpp', 'extension/**', + # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include. + 'kernels/portable/cpu/util/elementwise_util.h', 'kernels/optimized/**', 'runtime/core/exec_aten/**', # Want to be able to keep c10 in sync with PyTorch core. diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..e97263ef1bf 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -56,9 +56,7 @@ Tensor& mul_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a * val_b; - }, + [](const auto val_a, const auto val_b) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/pattern/pattern.h b/kernels/portable/cpu/pattern/pattern.h index eae8a746d0e..1596a518b33 100644 --- a/kernels/portable/cpu/pattern/pattern.h +++ b/kernels/portable/cpu/pattern/pattern.h @@ -80,13 +80,12 @@ Tensor& unary_ufunc_realh( ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn( fn, ctx, in, utils::SupportedTensorDtypes::REALH, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } @@ -107,13 +106,12 @@ Tensor& unary_ufunc_realhb_to_bool( return out; } ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::BOOL); + out); }); return out; @@ -138,13 +136,12 @@ Tensor& unary_ufunc_realhbbf16_to_floathbf16( } ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 83289664cbb..28aab92fceb 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -15,6 +15,10 @@ #include #include +#ifdef ET_USE_PYTORCH_HEADERS +#include +#endif // ET_USE_PYTORCH_HEADERS + #include #include @@ -58,6 +62,38 @@ template using op_call_result = std::invoke_result_t...>; +#ifdef ET_USE_PYTORCH_HEADERS +template +struct is_vectorized : public 
std::false_type {};
+
+template <typename T>
+struct is_vectorized<at::vec::Vectorized<T>> : public std::true_type {};
+
+// TODO: can_use_vectorized and can_use_vectorized_impl are a failed
+// attempt to use SFINAE to detect whether our generic lambda argument
+// with deduced return type would compile if it was passed
+// Vectorized<CTYPE_COMMON> instead of CTYPE_COMMON. SFINAE does not
+// work that way (see
+// e.g. https://stackoverflow.com/questions/53344484/hard-error-when-using-stdinvoke-result-t-with-a-generic-lambda,
+// https://stackoverflow.com/questions/31368601/how-to-detect-if-a-generic-lambda-is-uncompilable-in-c-14);
+// if we really want to do it then we need to at least require that
+// our lambdas actively participate in being SFINAE-friendly, as in
+// https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable.
+template <typename CTYPE_COMMON, typename Op, typename Enable = void, typename... Args>
+struct can_use_vectorized_impl : std::false_type {};
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+struct can_use_vectorized_impl<CTYPE_COMMON, Op, std::void_t<decltype(std::declval<std::invoke_result_t<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>().store(std::declval<CTYPE_COMMON*>()))>, Args...> : public std::true_type {}; // std::bool_constant<std::is_invocable_v<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>::value> {};
+
+// Can I call a function of type Op with sizeof...(Args) arguments of type
+// at::vec::Vectorized<CTYPE_COMMON>?
+// This is not possible in C++17 as the code is currently set up; see TODO above.
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+struct can_use_vectorized : public can_use_vectorized_impl<CTYPE_COMMON, Op, void, Args...> {};
+
+#endif // ET_USE_PYTORCH_HEADERS
+
 template <
     typename CTYPE_COMMON,
     typename CTYPE_OUT,
     typename Op,
     typename... Args>
 inline void dtype_specialized_elementwise_fn_impl(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
     const Tensor& out,
     Args... inputs) {
+  static_assert(
+      (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
+       ...));
   constexpr auto kNumInputs = sizeof...(inputs);
-  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMMON)) && ...));
+  // All inputs must be of type CTYPE_COMMON.
+  ET_DCHECK(
+      ((inputs.first->scalar_type() ==
+        CppTypeToScalarType<CTYPE_COMMON>::value) &&
+       ...));
 
   std::array<const CTYPE_COMMON*, kNumInputs> inputs_data_ptrs = {
       inputs.first->template const_data_ptr<CTYPE_COMMON>()...};
 
   CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
 
+#ifdef ET_USE_PYTORCH_HEADERS
+  if constexpr (can_use_vectorized<CTYPE_COMMON, Op, Args...>::value) {
+    const bool any_is_broadcasted =
+        !(torch::executor::internal::sizes_match_ignoring_leading_1s(
+              inputs.first->sizes(), out.sizes()) &&
+          ...);
+    if (!any_is_broadcasted) {
+      using Vec = at::vec::Vectorized<CTYPE_COMMON>;
+      ::executorch::extension::parallel_for(
+          0,
+          out.numel(),
+          ::executorch::extension::internal::GRAIN_SIZE,
+          [&](const auto begin, const auto end) {
+            const auto vectorized_begin =
+                begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+            const auto vectorized_end = end - (end % Vec::size());
+            // Scalar prologue.
+            for (const auto idx : c10::irange(begin, vectorized_begin)) {
+              std::array<CTYPE_COMMON, kNumInputs> loaded_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+            }
+
+            // Main vectorized loop.
+            for (auto idx = vectorized_begin; idx < vectorized_end;
+                 idx += Vec::size()) {
+              std::array<Vec, kNumInputs> loaded_vec_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_vec_inputs[input_idx] =
+                    Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
+              }
+              auto result_vec = std::apply(compute_fun, loaded_vec_inputs);
+              result_vec.store(&data_out[idx]);
+            }
+
+            // Scalar epilogue.
+              for (const auto idx : c10::irange(vectorized_end, end)) {
+                std::array<CTYPE_COMMON, kNumInputs> loaded_inputs;
+                for (const auto input_idx : c10::irange(kNumInputs)) {
+                  loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+                }
+                data_out[idx] = std::apply(compute_fun, loaded_inputs);
+              }
+          });
+      return;
+    }
+  }
+#endif
+
   ::executorch::extension::parallel_for(
       0,
       out.numel(),
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index a623b9d4d7a..4a53041041e 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -110,6 +110,7 @@ def define_common_targets():
             ":broadcast_indexes_range",
             ":broadcast_util",
             ":dtype_util",
+            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
             "//executorch/runtime/kernel:kernel_runtime_context",
            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index b8883c75bfe..8403f092de4 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -49,7 +49,10 @@ def define_common_targets():
     runtime.cxx_library(
         name = "aten_headers_for_executorch",
         srcs = [],
-        visibility = ["//executorch/kernels/optimized/..."],
+        visibility = [
+            "//executorch/kernels/optimized/...",
+            "//executorch/kernels/portable/cpu/util/...",
+        ],
         exported_deps = select({
             "DEFAULT": [],
             "ovr_config//cpu:arm64": [

From 2d19e75d70e62056bab1426db4f6b7ffd10d5fc4 Mon Sep 17 00:00:00 2001
From: Scott Wolchok 
Date: Wed, 19 Mar 2025 19:57:07 -0700
Subject: [PATCH 09/27] Update [ghstack-poisoned]

---
 kernels/portable/cpu/util/elementwise_util.h | 49 +++++++++----------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h
index 28aab92fceb..3f1f712bec0 100644
--- a/kernels/portable/cpu/util/elementwise_util.h
+++ b/kernels/portable/cpu/util/elementwise_util.h
@@ -63,35 +63,16 @@ using op_call_result =
     std::invoke_result_t<Op, ignore_first_yield_second<Args, CTYPE_COMMON>...>;
 
 #ifdef ET_USE_PYTORCH_HEADERS
-template <typename T>
-struct is_vectorized : public std::false_type {};
-
-template <typename T>
-struct is_vectorized<at::vec::Vectorized<T>> : public std::true_type {};
-
-// TODO: can_use_vectorized and can_use_vectorized_impl are a failed
-// attempt to use SFINAE to detect whether our generic lambda argument
-// with deduced return type would compile if it was passed
-// Vectorized<CTYPE_COMMON> instead of CTYPE_COMMON. SFINAE does not
-// work that way (see
-// e.g. https://stackoverflow.com/questions/53344484/hard-error-when-using-stdinvoke-result-t-with-a-generic-lambda,
-// https://stackoverflow.com/questions/31368601/how-to-detect-if-a-generic-lambda-is-uncompilable-in-c-14);
-// if we really want to do it then we need to at least require that
-// our lambdas actively participate in being SFINAE-friendly, as in
-// https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable.
-template <typename CTYPE_COMMON, typename Op, typename Enable = void, typename... Args>
-struct can_use_vectorized_impl : std::false_type {};
-template <typename CTYPE_COMMON, typename Op, typename... Args>
-struct can_use_vectorized_impl<CTYPE_COMMON, Op, std::void_t<decltype(std::declval<std::invoke_result_t<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>().store(std::declval<CTYPE_COMMON*>()))>, Args...> : public std::true_type {}; // std::bool_constant<std::is_invocable_v<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>::value> {};
-
 // Can I call a function of type Op with sizeof...(Args) arguments of type
 // at::vec::Vectorized<CTYPE_COMMON>?
-// This is not possible in C++17 as the code is currently set up; see TODO above.
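+//
+// For instance (our example, not from the original patch): a generic lambda
+// such as [](const auto a, const auto b) { return a * b; } is invocable with
+// at::vec::Vectorized<float> arguments and passes this check, while a lambda
+// taking (float, float) is not invocable that way and falls back to the
+// scalar path.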
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+constexpr bool can_use_vectorized() {
+  return std::is_invocable_v<
+      Op,
+      ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>;
+}
 #endif // ET_USE_PYTORCH_HEADERS
 
 template <
     typename CTYPE_COMMON,
@@ -349,6 +330,17 @@ inline void apply_unitensor_elementwise_fn(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
+/**
+ * Useful for unary elementwise operators. For each element of the
+ * input, call Op and write to the corresponding element of the
+ * output. Tensor broadcasting is applied wherever it is required.
+ *
+ * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto`
+ * parameters; normal lambdas are fine), it must fulfill one of the
+ * following conditions. Either:
+ * 1) It must in fact compile when passed at::vec::Vectorized<CTYPE_COMMON>, or
+ * 2) It must be actively SFINAE-friendly, as per the C++17 examples in https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable .
+ */
 template <
     typename CTYPE_COMMON,
     const char* op_name,
@@ -390,6 +382,7 @@ inline void apply_bitensor_elementwise_fn(
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for compute_fun.
  */
 template <
     typename CTYPE_COMMON,
@@ -456,6 +449,8 @@ inline void apply_tritensor_elementwise_fn(
  *
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMMON, op_name>.
+ *
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for compute_fun.
*/ template < typename CTYPE_COMMON, From 44ee51ad2f209d9981c78c069d0138d64de6481c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 25 Mar 2025 10:05:24 -0700 Subject: [PATCH 10/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.cpp | 4 ++ kernels/portable/cpu/util/dtype_util.h | 50 ++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index d240b9f83bc..81b1b203a54 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -23,10 +23,14 @@ bool check_tensor_dtype( return executorch::runtime::tensor_is_realhbbf16_type(t); case SupportedTensorDtypes::REALHBF16: return executorch::runtime::tensor_is_realhbf16_type(t); + case SupportedTensorDtypes::REALH: + return executorch::runtime::tensor_is_realh_type(t); case SupportedTensorDtypes::FLOATHBF16: return executorch::runtime::tensor_is_floating_type(t); case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); + case SupportedTensorDtypes::BOOL: + return executorch::runtime::tensor_is_type(t, ScalarType::Bool); case SupportedTensorDtypes::BOOL_OR_BYTE: return (executorch::runtime::tensor_is_type( t, ScalarType::Bool, ScalarType::Byte)); diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 71d3625052d..df92428004e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -51,6 +51,15 @@ load_to_common_fn get_load_to_common_fn_realhbf16( return result; } +template +load_to_common_fn get_load_to_common_fn_realh(const Tensor& t) { + CTYPE_COMMON (*result)(const void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::load_and_convert; + }); + return result; +} + template load_to_common_fn get_load_to_common_fn_floathbf16( const Tensor& t) { @@ -72,6 +81,16 @@ load_to_common_fn get_load_to_common_fn_intb(const Tensor& t) { return result; } +template +load_to_common_fn get_load_to_common_fn_bool(const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::load_and_convert; +} + template load_to_common_fn get_load_to_common_fn_bool_or_byte( const Tensor& t) { @@ -137,6 +156,16 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_realhbf16( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_realh( + const Tensor& t) { + void (*result)(CTYPE_COMMON, void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::convert_and_store; + }); + return result; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_floathbf16(const Tensor& t) { @@ -159,6 +188,17 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_intb( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_bool( + const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::convert_and_store; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { @@ -191,8 +231,10 @@ get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { enum class SupportedTensorDtypes { REALHBBF16, REALHBF16, + REALH, 
FLOATHBF16, INTB, + BOOL, BOOL_OR_BYTE, SAME_AS_COMPUTE, SAME_AS_COMMON, @@ -209,10 +251,14 @@ load_to_common_fn get_load_to_common_fn( return get_load_to_common_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_load_to_common_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_load_to_common_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_load_to_common_fn_realhbf16(t); case SupportedTensorDtypes::INTB: return get_load_to_common_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_load_to_common_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_load_to_common_fn_bool_or_byte(t); case SupportedTensorDtypes::SAME_AS_COMPUTE: @@ -233,10 +279,14 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn( return get_store_common_to_tensor_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_store_common_to_tensor_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_store_common_to_tensor_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_store_common_to_tensor_fn_floathbf16(t); case SupportedTensorDtypes::INTB: return get_store_common_to_tensor_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_store_common_to_tensor_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_store_common_to_tensor_fn_bool_or_byte( t); From 7f57a19d5135cd8e1b0689e737b1189d80f47065 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 26 Mar 2025 15:54:09 -0700 Subject: [PATCH 11/27] Update [ghstack-poisoned] --- runtime/core/portable_type/c10/c10/targets.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index a49f30c72a0..d9d72b5be3f 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -87,5 +87,5 @@ def define_common_targets(): # linker failure. "ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DET_USE_PYTORCH_HEADERS", "-DSTANDALONE_TORCH_HEADER"], + }) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]), ) From 4553283773f0a5fb325a1a3eac54e26835327cbd Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 26 Mar 2025 16:20:19 -0700 Subject: [PATCH 12/27] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
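 # (As an illustrative configure step, ours: passing
 #  -DEXECUTORCH_BUILD_PTHREADPOOL=ON -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON
 #  at CMake time selects this optimized variant after the change below.)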
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From ff2c3580800e0380c38e0fd0e9ec76a05164f557 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 26 Mar 2025 16:55:40 -0700 Subject: [PATCH 13/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 040ebea0b4c..08d54e11090 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -64,16 +64,16 @@ inline void dtype_specialized_elementwise_fn_impl( constexpr auto kNumInputs = sizeof...(inputs); ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMMON)) && ...)); - std::array inputs_data_ptrs = { - inputs.first->template const_data_ptr()...}; - - CTYPE_OUT* const data_out = out.mutable_data_ptr(); - ::executorch::extension::parallel_for( 0, out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { + std::array inputs_data_ptrs = { + inputs.first->template const_data_ptr()...}; + + CTYPE_OUT* const data_out = out.mutable_data_ptr(); + const auto range = BroadcastIndexesRange(out, (*inputs.first)...); auto begin_it = range.begin(); From 943ab82be758baadbafc4287756fa6dbe904f6df Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:22 -0700 Subject: [PATCH 14/27] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From f22d039d23db4103dc0697b638813ce2e4bc4a5f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:28 -0700 Subject: [PATCH 15/27] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) 
+target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test From d5dfe2f230e42139ec78e054e8376765acfe736a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:37 -0700 Subject: [PATCH 16/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_add.cpp | 20 ++++++++----- kernels/portable/cpu/op_addmm.cpp | 10 ++++--- kernels/portable/cpu/op_atan2.cpp | 10 ++++--- kernels/portable/cpu/op_clamp.cpp | 18 ++++++++---- kernels/portable/cpu/op_copy.cpp | 20 ++++++++----- kernels/portable/cpu/op_div.cpp | 31 ++++++++++++-------- kernels/portable/cpu/op_elu.cpp | 11 ++++--- kernels/portable/cpu/op_floor_divide.cpp | 9 ++++-- kernels/portable/cpu/op_fmod.cpp | 18 ++++++++---- kernels/portable/cpu/op_maximum.cpp | 8 +++-- kernels/portable/cpu/op_minimum.cpp | 9 ++++-- kernels/portable/cpu/op_mul.cpp | 10 ++++--- kernels/portable/cpu/op_pow.cpp | 27 +++++++++++------ kernels/portable/cpu/op_remainder.cpp | 18 ++++++++---- kernels/portable/cpu/op_rsub.cpp | 10 ++++--- kernels/portable/cpu/op_sigmoid.cpp | 11 ++++--- kernels/portable/cpu/op_sub.cpp | 20 ++++++++----- kernels/portable/cpu/op_where.cpp | 14 +++++---- kernels/portable/cpu/pattern/bitwise_op.h | 18 ++++++++---- 
kernels/portable/cpu/pattern/comparison_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/logical_op.h | 9 ++++-- 21 files changed, 201 insertions(+), 118 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index adb9d4ea723..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -52,8 +52,11 @@ Tensor& add_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a + val_alpha * val_b; }, ctx, @@ -61,8 +64,7 @@ Tensor& add_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -100,8 +102,11 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [b, alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [b, alpha](const auto val_a) { CTYPE_COMPUTE val_b = utils::scalar_to(b); CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); return val_a + val_alpha * val_b; @@ -109,8 +114,7 @@ Tensor& add_scalar_out( ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index d1df5818cd8..440a8b2c0fa 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -88,8 +88,11 @@ Tensor& addmm_out( n, p); - utils::apply_bitensor_elementwise_fn( - [alpha_val, beta_val](const CTYPE val_a, const CTYPE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [alpha_val, beta_val](const auto val_a, const auto val_b) { return val_a * alpha_val + val_b * beta_val; }, ctx, @@ -97,8 +100,7 @@ Tensor& addmm_out( utils::SupportedTensorDtypes::REALHBF16, in, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); } }); diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 19267ef49dd..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -55,8 +55,11 @@ Tensor& atan2_out( static constexpr const char op_name[] = "atan2.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return std::atan2(val_a, val_b); }, ctx, @@ -64,8 +67,7 @@ Tensor& atan2_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c1c40a38f34..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ 
b/kernels/portable/cpu/op_clamp.cpp @@ -134,8 +134,12 @@ Tensor& clamp_out( static constexpr const char op_name[] = "clamp.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( @@ -150,8 +154,7 @@ Tensor& clamp_out( ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; @@ -210,11 +213,15 @@ Tensor& clamp_tensor_out( static constexpr const char op_name[] = "clamp.Tensor_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [has_min, has_max]( const CTYPE_COMPUTE val_in, const CTYPE_COMPUTE val_min, const CTYPE_COMPUTE val_max) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override(val_out, val_min); @@ -231,8 +238,7 @@ Tensor& clamp_tensor_out( utils::SupportedTensorDtypes::REALHBBF16, max, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 19b0c3a2f6a..30fff4d2c10 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -47,15 +47,17 @@ Tensor& copy_out( static constexpr const char op_name[] = "copy.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy.out", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -80,15 +82,17 @@ Tensor& copy_( static constexpr const char op_name[] = "copy_"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy_", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - in, - utils::SupportedTensorDtypes::REALHBBF16); + in); }); return in; diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 94cd9ea5011..70f9479c464 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -58,17 +58,17 @@ Tensor& div_out( static constexpr const char op_name[] = "div.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a / val_b; - }, + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + 
utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; @@ -122,9 +122,13 @@ Tensor& div_out_mode( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [mode_is_trunc, &div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -146,8 +150,7 @@ Tensor& div_out_mode( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -188,13 +191,15 @@ Tensor& div_scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d4846fb1bfb..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -44,8 +44,12 @@ Tensor& elu_out( ET_EXTRACT_SCALAR(scale, math_scale); ET_EXTRACT_SCALAR(input_scale, math_input_scale); const auto negcoef = math_alpha * math_scale; - utils::apply_unitensor_elementwise_fn( - [negcoef, math_scale, math_input_scale](auto x) { + utils::apply_unitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; @@ -53,8 +57,7 @@ Tensor& elu_out( ctx, in, utils::SupportedTensorDtypes::FLOATHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 85eb612ea1e..50723c3fa0a 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -53,9 +53,13 @@ Tensor& floor_divide_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
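+          // (The per-element branch that sets div_by_zero_error is what
+          // currently keeps this lambda scalar-only.)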
if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -69,8 +73,7 @@ Tensor& floor_divide_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 1e8cba0f1ae..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -55,9 +55,13 @@ Tensor& fmod_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -73,8 +77,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -131,16 +134,19 @@ Tensor& fmod_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = std::fmod(val_a, val_b); return value; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 5cf3b5a19f8..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -45,7 +45,10 @@ Tensor& maximum_out( static constexpr const char op_name[] = "maximum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, @@ -54,8 +57,7 @@ Tensor& maximum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index e2c641bdb22..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -45,8 +45,12 @@ Tensor& minimum_out( static constexpr const char op_name[] = "minimum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
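+          // (min_override is assumed to follow ATen's minimum semantics,
+          // e.g. NaN propagation, rather than plain std::min.)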
return utils::min_override(val_a, val_b); }, ctx, @@ -54,8 +58,7 @@ Tensor& minimum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..6156227732d 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -97,13 +97,15 @@ Tensor& mul_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a * val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index 81319b03d9f..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -53,8 +53,12 @@ Tensor& pow_Tensor_Tensor_out( static constexpr const char op_name[] = "pow.Tensor_Tensor_out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. return std::pow(val_a, val_b); }, ctx, @@ -62,8 +66,7 @@ Tensor& pow_Tensor_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -104,13 +107,16 @@ Tensor& pow_Tensor_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -151,13 +157,16 @@ Tensor& pow_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_a = utils::scalar_to(a); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. 
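+          // (std::pow here resolves only for scalar arguments; switching the
+          // lambda to auto parameters would require a vectorized pow.)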
[val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index d34c34a0380..01a5d72de01 100644 --- a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -53,9 +53,13 @@ Tensor& remainder_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -71,8 +75,7 @@ Tensor& remainder_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -126,15 +129,18 @@ Tensor& remainder_Scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return utils::remainder_override(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 46af021efda..6a0a77b6596 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -52,15 +52,17 @@ Tensor& rsub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_b - val_alpha * val_a; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 09cfed524f9..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -45,8 +45,12 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { static constexpr const char op_name[] = "sigmoid.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_in) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable CTYPE_COMPUTE out_val = static_cast(1.0) / (static_cast(1.0) + exp(-val_in)); return out_val; @@ -54,8 +58,7 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - 
utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index 6217f82c3b1..aa90df8dee4 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -56,8 +56,11 @@ Tensor& sub_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a - val_alpha * val_b; }, ctx, @@ -65,8 +68,7 @@ Tensor& sub_out( utils::SupportedTensorDtypes::REALHBF16, b, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -110,15 +112,17 @@ Tensor& sub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_a - val_alpha * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b455c45c2d1..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -43,10 +43,13 @@ Tensor& where_out( static constexpr const char op_name[] = "where.self_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -54,8 +57,7 @@ Tensor& where_out( utils::SupportedTensorDtypes::REALHBBF16, cond, utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h index 6e4c111b8f2..f78ce796e6c 100644 --- a/kernels/portable/cpu/pattern/bitwise_op.h +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -80,15 +80,18 @@ Tensor& bitwise_tensor_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
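+              // (BitwiseFnForOp resolves to a scalar functor along the lines
+              // of std::bit_and, so it cannot take vectorized operands yet.)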
BitwiseFnForOp::value, ctx, a, utils::SupportedTensorDtypes::INTB, b, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -121,16 +124,19 @@ Tensor& bitwise_scalar_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return BitwiseFnForOp::value( val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/comparison_op.h b/kernels/portable/cpu/pattern/comparison_op.h index e0d9bf4dcab..643d7623922 100644 --- a/kernels/portable/cpu/pattern/comparison_op.h +++ b/kernels/portable/cpu/pattern/comparison_op.h @@ -91,15 +91,18 @@ Tensor& comparison_tensor_out( ScalarType compute_type = utils::get_compute_type(common_type); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. ComparisonFnForOp::value, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -127,15 +130,18 @@ Tensor& comparison_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return ComparisonFnForOp::value(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/logical_op.h b/kernels/portable/cpu/pattern/logical_op.h index 017822a85a6..4547d3df51b 100644 --- a/kernels/portable/cpu/pattern/logical_op.h +++ b/kernels/portable/cpu/pattern/logical_op.h @@ -34,15 +34,18 @@ Tensor& logical_tensor_out( InvalidArgument, out); - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + bool, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
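+      // (fn is an element-wise bool(bool, bool) functor, hence the scalar
+      // bool compute type above.)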
fn, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); return out; } From 3f1b775fe481d9d9d88896c913f7033dc3cfd21d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:43 -0700 Subject: [PATCH 17/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 35 ++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index b5cd980b085..eb1ee83111e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -228,7 +228,7 @@ enum class SupportedTensorDtypes { namespace internal { template -load_to_compute_fn get_load_to_compute_fn( +load_to_compute_fn get_load_to_compute_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -252,7 +252,7 @@ load_to_compute_fn get_load_to_compute_fn( } template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn( +store_compute_to_tensor_fn get_store_compute_to_tensor_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -285,6 +285,37 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( return nullptr; } +#ifndef EXECUTORCH_SELECTIVE_BUILD_DTYPE +constexpr const char kGenericElementwiseOpName[] = "generic_elementwise_op"; +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + +template +load_to_compute_fn get_load_to_compute_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_load_to_compute_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} + +template +store_compute_to_tensor_fn get_store_compute_to_tensor_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_store_compute_to_tensor_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} bool check_tensor_dtype( const Tensor t, SupportedTensorDtypes dtypes, From 7f2bbdb098596d232cd1193ea76422308ab74dc3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:32:41 -0700 Subject: [PATCH 18/27] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From 9e42e93a96531d2daa189ac66b796aa515e56cd4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:32:46 -0700 Subject: [PATCH 19/27] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From 96d258eb5cb1601283f636f6dd8a046ae5c9e4ae Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:33:00 -0700 Subject: [PATCH 20/27] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
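+#
+# Usage (assumed): run from the repository root, e.g.
+#   bash test/build_optimized_size_test.sh
+# The cmake invocations below use the current directory as the source root.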
+ +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test From a7562543a4fec8abd42492ae80c86acb9918d6cf Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:33:07 -0700 Subject: [PATCH 21/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_add.cpp | 20 ++++++++----- kernels/portable/cpu/op_addmm.cpp | 10 ++++--- kernels/portable/cpu/op_atan2.cpp | 10 ++++--- kernels/portable/cpu/op_clamp.cpp | 18 ++++++++---- kernels/portable/cpu/op_copy.cpp | 20 ++++++++----- kernels/portable/cpu/op_div.cpp | 31 ++++++++++++-------- kernels/portable/cpu/op_elu.cpp | 11 ++++--- kernels/portable/cpu/op_floor_divide.cpp | 9 ++++-- kernels/portable/cpu/op_fmod.cpp | 18 ++++++++---- kernels/portable/cpu/op_maximum.cpp | 8 +++-- kernels/portable/cpu/op_minimum.cpp | 9 ++++-- kernels/portable/cpu/op_mul.cpp | 10 ++++--- kernels/portable/cpu/op_pow.cpp | 27 +++++++++++------ kernels/portable/cpu/op_remainder.cpp | 18 ++++++++---- kernels/portable/cpu/op_rsub.cpp | 10 ++++--- kernels/portable/cpu/op_sigmoid.cpp | 11 ++++--- kernels/portable/cpu/op_sub.cpp | 20 ++++++++----- kernels/portable/cpu/op_where.cpp | 14 +++++---- kernels/portable/cpu/pattern/bitwise_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/comparison_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/logical_op.h | 9 ++++-- 21 files changed, 201 insertions(+), 118 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index adb9d4ea723..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -52,8 +52,11 @@ Tensor& add_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + 
[val_alpha](const auto val_a, const auto val_b) { return val_a + val_alpha * val_b; }, ctx, @@ -61,8 +64,7 @@ Tensor& add_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -100,8 +102,11 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [b, alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [b, alpha](const auto val_a) { CTYPE_COMPUTE val_b = utils::scalar_to(b); CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); return val_a + val_alpha * val_b; @@ -109,8 +114,7 @@ Tensor& add_scalar_out( ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index d1df5818cd8..440a8b2c0fa 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -88,8 +88,11 @@ Tensor& addmm_out( n, p); - utils::apply_bitensor_elementwise_fn( - [alpha_val, beta_val](const CTYPE val_a, const CTYPE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [alpha_val, beta_val](const auto val_a, const auto val_b) { return val_a * alpha_val + val_b * beta_val; }, ctx, @@ -97,8 +100,7 @@ Tensor& addmm_out( utils::SupportedTensorDtypes::REALHBF16, in, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); } }); diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 19267ef49dd..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -55,8 +55,11 @@ Tensor& atan2_out( static constexpr const char op_name[] = "atan2.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return std::atan2(val_a, val_b); }, ctx, @@ -64,8 +67,7 @@ Tensor& atan2_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c1c40a38f34..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -134,8 +134,12 @@ Tensor& clamp_out( static constexpr const char op_name[] = "clamp.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. 
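+            // (The has_min/has_max branches below keep this lambda scalar; a
+            // vectorization-capable rewrite would need branch-free clamping.)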
CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( @@ -150,8 +154,7 @@ Tensor& clamp_out( ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; @@ -210,11 +213,15 @@ Tensor& clamp_tensor_out( static constexpr const char op_name[] = "clamp.Tensor_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [has_min, has_max]( const CTYPE_COMPUTE val_in, const CTYPE_COMPUTE val_min, const CTYPE_COMPUTE val_max) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override(val_out, val_min); @@ -231,8 +238,7 @@ Tensor& clamp_tensor_out( utils::SupportedTensorDtypes::REALHBBF16, max, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 19b0c3a2f6a..30fff4d2c10 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -47,15 +47,17 @@ Tensor& copy_out( static constexpr const char op_name[] = "copy.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy.out", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -80,15 +82,17 @@ Tensor& copy_( static constexpr const char op_name[] = "copy_"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy_", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - in, - utils::SupportedTensorDtypes::REALHBBF16); + in); }); return in; diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 94cd9ea5011..70f9479c464 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -58,17 +58,17 @@ Tensor& div_out( static constexpr const char op_name[] = "div.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a / val_b; - }, + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; @@ -122,9 +122,13 @@ Tensor& div_out_mode( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + 
utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [mode_is_trunc, &div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -146,8 +150,7 @@ Tensor& div_out_mode( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -188,13 +191,15 @@ Tensor& div_scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d4846fb1bfb..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -44,8 +44,12 @@ Tensor& elu_out( ET_EXTRACT_SCALAR(scale, math_scale); ET_EXTRACT_SCALAR(input_scale, math_input_scale); const auto negcoef = math_alpha * math_scale; - utils::apply_unitensor_elementwise_fn( - [negcoef, math_scale, math_input_scale](auto x) { + utils::apply_unitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; @@ -53,8 +57,7 @@ Tensor& elu_out( ctx, in, utils::SupportedTensorDtypes::FLOATHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 85eb612ea1e..50723c3fa0a 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -53,9 +53,13 @@ Tensor& floor_divide_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -69,8 +73,7 @@ Tensor& floor_divide_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 1e8cba0f1ae..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -55,9 +55,13 @@ Tensor& fmod_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -73,8 +77,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -131,16 +134,19 @@ Tensor& fmod_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = std::fmod(val_a, val_b); return value; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 5cf3b5a19f8..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -45,7 +45,10 @@ Tensor& maximum_out( static constexpr const char op_name[] = "maximum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, @@ -54,8 +57,7 @@ Tensor& maximum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index e2c641bdb22..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -45,8 +45,12 @@ Tensor& minimum_out( static constexpr const char op_name[] = "minimum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
return utils::min_override(val_a, val_b); }, ctx, @@ -54,8 +58,7 @@ Tensor& minimum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..6156227732d 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -97,13 +97,15 @@ Tensor& mul_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a * val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index 81319b03d9f..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -53,8 +53,12 @@ Tensor& pow_Tensor_Tensor_out( static constexpr const char op_name[] = "pow.Tensor_Tensor_out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. return std::pow(val_a, val_b); }, ctx, @@ -62,8 +66,7 @@ Tensor& pow_Tensor_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -104,13 +107,16 @@ Tensor& pow_Tensor_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -151,13 +157,16 @@ Tensor& pow_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_a = utils::scalar_to(a); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. 
[val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index d34c34a0380..01a5d72de01 100644 --- a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -53,9 +53,13 @@ Tensor& remainder_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -71,8 +75,7 @@ Tensor& remainder_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -126,15 +129,18 @@ Tensor& remainder_Scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return utils::remainder_override(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 46af021efda..6a0a77b6596 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -52,15 +52,17 @@ Tensor& rsub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_b - val_alpha * val_a; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 09cfed524f9..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -45,8 +45,12 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { static constexpr const char op_name[] = "sigmoid.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_in) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable CTYPE_COMPUTE out_val = static_cast(1.0) / (static_cast(1.0) + exp(-val_in)); return out_val; @@ -54,8 +58,7 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - 
utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index 6217f82c3b1..aa90df8dee4 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -56,8 +56,11 @@ Tensor& sub_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a - val_alpha * val_b; }, ctx, @@ -65,8 +68,7 @@ Tensor& sub_out( utils::SupportedTensorDtypes::REALHBF16, b, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -110,15 +112,17 @@ Tensor& sub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_a - val_alpha * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b455c45c2d1..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -43,10 +43,13 @@ Tensor& where_out( static constexpr const char op_name[] = "where.self_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -54,8 +57,7 @@ Tensor& where_out( utils::SupportedTensorDtypes::REALHBBF16, cond, utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h index 6e4c111b8f2..f78ce796e6c 100644 --- a/kernels/portable/cpu/pattern/bitwise_op.h +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -80,15 +80,18 @@ Tensor& bitwise_tensor_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
BitwiseFnForOp::value, ctx, a, utils::SupportedTensorDtypes::INTB, b, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -121,16 +124,19 @@ Tensor& bitwise_scalar_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return BitwiseFnForOp::value( val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/comparison_op.h b/kernels/portable/cpu/pattern/comparison_op.h index e0d9bf4dcab..643d7623922 100644 --- a/kernels/portable/cpu/pattern/comparison_op.h +++ b/kernels/portable/cpu/pattern/comparison_op.h @@ -91,15 +91,18 @@ Tensor& comparison_tensor_out( ScalarType compute_type = utils::get_compute_type(common_type); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. ComparisonFnForOp::value, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -127,15 +130,18 @@ Tensor& comparison_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return ComparisonFnForOp::value(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/logical_op.h b/kernels/portable/cpu/pattern/logical_op.h index 017822a85a6..4547d3df51b 100644 --- a/kernels/portable/cpu/pattern/logical_op.h +++ b/kernels/portable/cpu/pattern/logical_op.h @@ -34,15 +34,18 @@ Tensor& logical_tensor_out( InvalidArgument, out); - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + bool, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
fn, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); return out; } From ef74fe1ea9a9afc5ae255c7879da251ad1146ef2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:33:13 -0700 Subject: [PATCH 22/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 35 ++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index b5cd980b085..eb1ee83111e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -228,7 +228,7 @@ enum class SupportedTensorDtypes { namespace internal { template -load_to_compute_fn get_load_to_compute_fn( +load_to_compute_fn get_load_to_compute_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -252,7 +252,7 @@ load_to_compute_fn get_load_to_compute_fn( } template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn( +store_compute_to_tensor_fn get_store_compute_to_tensor_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -285,6 +285,37 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( return nullptr; } +#ifndef EXECUTORCH_SELECTIVE_BUILD_DTYPE +constexpr const char kGenericElementwiseOpName[] = "generic_elementwise_op"; +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + +template +load_to_compute_fn get_load_to_compute_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_load_to_compute_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} + +template +store_compute_to_tensor_fn get_store_compute_to_tensor_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_store_compute_to_tensor_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} bool check_tensor_dtype( const Tensor t, SupportedTensorDtypes dtypes, From 3aa266d6537815d70e5332e45c160c9a43346158 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:37 -0700 Subject: [PATCH 23/27] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
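# Note that the set()/if() pair removed below always evaluated to true:
# set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) stores
# the literal variable name, a non-false string, so the if() never reflected
# whether the pthreadpool option was actually enabled. Testing the two
# options directly gives the intended gating.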
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From 3c88a5662e950077bf471a91168ee69c28752af0 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:41 -0700 Subject: [PATCH 24/27] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From 153735d91465623e35c9394c0b4a0a282eb35327 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:46 -0700 Subject: [PATCH 25/27] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test From 77a4fc6fc5e4280e394f0fb3f45e360099b0c519 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:53 -0700 Subject: [PATCH 26/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_add.cpp | 20 ++++++++----- kernels/portable/cpu/op_addmm.cpp | 10 ++++--- kernels/portable/cpu/op_atan2.cpp | 10 ++++--- kernels/portable/cpu/op_clamp.cpp | 18 ++++++++---- kernels/portable/cpu/op_copy.cpp | 20 ++++++++----- kernels/portable/cpu/op_div.cpp | 31 ++++++++++++-------- kernels/portable/cpu/op_elu.cpp | 11 ++++--- kernels/portable/cpu/op_floor_divide.cpp | 9 ++++-- kernels/portable/cpu/op_fmod.cpp | 18 ++++++++---- kernels/portable/cpu/op_maximum.cpp | 8 +++-- kernels/portable/cpu/op_minimum.cpp | 9 ++++-- kernels/portable/cpu/op_mul.cpp | 10 ++++--- kernels/portable/cpu/op_pow.cpp | 27 +++++++++++------ kernels/portable/cpu/op_remainder.cpp | 18 ++++++++---- kernels/portable/cpu/op_rsub.cpp | 10 ++++--- kernels/portable/cpu/op_sigmoid.cpp | 11 ++++--- kernels/portable/cpu/op_sub.cpp | 20 ++++++++----- kernels/portable/cpu/op_where.cpp | 14 +++++---- kernels/portable/cpu/pattern/bitwise_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/comparison_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/logical_op.h | 9 ++++-- 21 files changed, 201 insertions(+), 118 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index adb9d4ea723..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -52,8 +52,11 @@ Tensor& add_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + 
[val_alpha](const auto val_a, const auto val_b) { return val_a + val_alpha * val_b; }, ctx, @@ -61,8 +64,7 @@ Tensor& add_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -100,8 +102,11 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [b, alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [b, alpha](const auto val_a) { CTYPE_COMPUTE val_b = utils::scalar_to(b); CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); return val_a + val_alpha * val_b; @@ -109,8 +114,7 @@ Tensor& add_scalar_out( ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index d1df5818cd8..440a8b2c0fa 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -88,8 +88,11 @@ Tensor& addmm_out( n, p); - utils::apply_bitensor_elementwise_fn( - [alpha_val, beta_val](const CTYPE val_a, const CTYPE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [alpha_val, beta_val](const auto val_a, const auto val_b) { return val_a * alpha_val + val_b * beta_val; }, ctx, @@ -97,8 +100,7 @@ Tensor& addmm_out( utils::SupportedTensorDtypes::REALHBF16, in, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); } }); diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 19267ef49dd..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -55,8 +55,11 @@ Tensor& atan2_out( static constexpr const char op_name[] = "atan2.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return std::atan2(val_a, val_b); }, ctx, @@ -64,8 +67,7 @@ Tensor& atan2_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c1c40a38f34..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -134,8 +134,12 @@ Tensor& clamp_out( static constexpr const char op_name[] = "clamp.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. 
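+          // The typed CTYPE_COMPUTE parameter keeps this lambda scalar-only;
+          // the lambdas rewritten to take `const auto` elsewhere in this
+          // patch are the ones the elementwise utility can also instantiate
+          // with vector types.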
CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( @@ -150,8 +154,7 @@ Tensor& clamp_out( ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; @@ -210,11 +213,15 @@ Tensor& clamp_tensor_out( static constexpr const char op_name[] = "clamp.Tensor_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [has_min, has_max]( const CTYPE_COMPUTE val_in, const CTYPE_COMPUTE val_min, const CTYPE_COMPUTE val_max) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override(val_out, val_min); @@ -231,8 +238,7 @@ Tensor& clamp_tensor_out( utils::SupportedTensorDtypes::REALHBBF16, max, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 19b0c3a2f6a..30fff4d2c10 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -47,15 +47,17 @@ Tensor& copy_out( static constexpr const char op_name[] = "copy.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy.out", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -80,15 +82,17 @@ Tensor& copy_( static constexpr const char op_name[] = "copy_"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy_", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - in, - utils::SupportedTensorDtypes::REALHBBF16); + in); }); return in; diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 94cd9ea5011..70f9479c464 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -58,17 +58,17 @@ Tensor& div_out( static constexpr const char op_name[] = "div.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a / val_b; - }, + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; @@ -122,9 +122,13 @@ Tensor& div_out_mode( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + 
utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [mode_is_trunc, &div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -146,8 +150,7 @@ Tensor& div_out_mode( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -188,13 +191,15 @@ Tensor& div_scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d4846fb1bfb..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -44,8 +44,12 @@ Tensor& elu_out( ET_EXTRACT_SCALAR(scale, math_scale); ET_EXTRACT_SCALAR(input_scale, math_input_scale); const auto negcoef = math_alpha * math_scale; - utils::apply_unitensor_elementwise_fn( - [negcoef, math_scale, math_input_scale](auto x) { + utils::apply_unitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; @@ -53,8 +57,7 @@ Tensor& elu_out( ctx, in, utils::SupportedTensorDtypes::FLOATHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 85eb612ea1e..50723c3fa0a 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -53,9 +53,13 @@ Tensor& floor_divide_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
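+              // This lambda is hard to vectorize as-is: it branches on
+              // element values and records div_by_zero_error through a
+              // capture, a side effect with no per-lane equivalent.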
if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -69,8 +73,7 @@ Tensor& floor_divide_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 1e8cba0f1ae..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -55,9 +55,13 @@ Tensor& fmod_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -73,8 +77,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -131,16 +134,19 @@ Tensor& fmod_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = std::fmod(val_a, val_b); return value; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 5cf3b5a19f8..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -45,7 +45,10 @@ Tensor& maximum_out( static constexpr const char op_name[] = "maximum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, @@ -54,8 +57,7 @@ Tensor& maximum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index e2c641bdb22..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -45,8 +45,12 @@ Tensor& minimum_out( static constexpr const char op_name[] = "minimum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
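+            // utils::min_override implements minimum's NaN-propagation
+            // semantics for floating-point inputs; a vectorized rewrite
+            // needs a counterpart that accepts vector operands as well.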
return utils::min_override(val_a, val_b); }, ctx, @@ -54,8 +58,7 @@ Tensor& minimum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..6156227732d 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -97,13 +97,15 @@ Tensor& mul_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a * val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index 81319b03d9f..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -53,8 +53,12 @@ Tensor& pow_Tensor_Tensor_out( static constexpr const char op_name[] = "pow.Tensor_Tensor_out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. return std::pow(val_a, val_b); }, ctx, @@ -62,8 +66,7 @@ Tensor& pow_Tensor_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -104,13 +107,16 @@ Tensor& pow_Tensor_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -151,13 +157,16 @@ Tensor& pow_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_a = utils::scalar_to(a); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. 
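+        // As with the Tensor variants above, the typed parameter and the
+        // direct std::pow call keep this lambda scalar-only.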
[val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index d34c34a0380..01a5d72de01 100644 --- a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -53,9 +53,13 @@ Tensor& remainder_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -71,8 +75,7 @@ Tensor& remainder_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -126,15 +129,18 @@ Tensor& remainder_Scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return utils::remainder_override(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 46af021efda..6a0a77b6596 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -52,15 +52,17 @@ Tensor& rsub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_b - val_alpha * val_a; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 09cfed524f9..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -45,8 +45,12 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { static constexpr const char op_name[] = "sigmoid.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_in) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable CTYPE_COMPUTE out_val = static_cast(1.0) / (static_cast(1.0) + exp(-val_in)); return out_val; @@ -54,8 +58,7 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - 
utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index 6217f82c3b1..aa90df8dee4 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -56,8 +56,11 @@ Tensor& sub_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a - val_alpha * val_b; }, ctx, @@ -65,8 +68,7 @@ Tensor& sub_out( utils::SupportedTensorDtypes::REALHBF16, b, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -110,15 +112,17 @@ Tensor& sub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_a - val_alpha * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b455c45c2d1..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -43,10 +43,13 @@ Tensor& where_out( static constexpr const char op_name[] = "where.self_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -54,8 +57,7 @@ Tensor& where_out( utils::SupportedTensorDtypes::REALHBBF16, cond, utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h index 6e4c111b8f2..f78ce796e6c 100644 --- a/kernels/portable/cpu/pattern/bitwise_op.h +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -80,15 +80,18 @@ Tensor& bitwise_tensor_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
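+              // BitwiseFnForOp resolves to a functor (std::bit_and and
+              // friends) with a fixed scalar call signature, so there is
+              // nothing generic here for the utility to instantiate with
+              // vector types.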
BitwiseFnForOp::value, ctx, a, utils::SupportedTensorDtypes::INTB, b, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -121,16 +124,19 @@ Tensor& bitwise_scalar_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return BitwiseFnForOp::value( val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/comparison_op.h b/kernels/portable/cpu/pattern/comparison_op.h index e0d9bf4dcab..643d7623922 100644 --- a/kernels/portable/cpu/pattern/comparison_op.h +++ b/kernels/portable/cpu/pattern/comparison_op.h @@ -91,15 +91,18 @@ Tensor& comparison_tensor_out( ScalarType compute_type = utils::get_compute_type(common_type); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. ComparisonFnForOp::value, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -127,15 +130,18 @@ Tensor& comparison_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return ComparisonFnForOp::value(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/logical_op.h b/kernels/portable/cpu/pattern/logical_op.h index 017822a85a6..4547d3df51b 100644 --- a/kernels/portable/cpu/pattern/logical_op.h +++ b/kernels/portable/cpu/pattern/logical_op.h @@ -34,15 +34,18 @@ Tensor& logical_tensor_out( InvalidArgument, out); - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + bool, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
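+      // `fn` is passed in at runtime rather than as a generic callable,
+      // so the utility can only ever invoke it with scalar bools.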
fn, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); return out; } From 21ae5da3534db7560d72fcec2f327167dd08240d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:57 -0700 Subject: [PATCH 27/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 35 ++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index b5cd980b085..eb1ee83111e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -228,7 +228,7 @@ enum class SupportedTensorDtypes { namespace internal { template -load_to_compute_fn get_load_to_compute_fn( +load_to_compute_fn get_load_to_compute_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -252,7 +252,7 @@ load_to_compute_fn get_load_to_compute_fn( } template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn( +store_compute_to_tensor_fn get_store_compute_to_tensor_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -285,6 +285,37 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( return nullptr; } +#ifndef EXECUTORCH_SELECTIVE_BUILD_DTYPE +constexpr const char kGenericElementwiseOpName[] = "generic_elementwise_op"; +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + +template +load_to_compute_fn get_load_to_compute_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_load_to_compute_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} + +template +store_compute_to_tensor_fn get_store_compute_to_tensor_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_store_compute_to_tensor_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} bool check_tensor_dtype( const Tensor t, SupportedTensorDtypes dtypes,
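Taken together, the recurring change in these patches moves the output dtype list into the template parameter list of the apply_*_elementwise_fn utilities and, where possible, loosens lambdas to take `const auto` parameters. The sketch below is a self-contained toy model of why the generic parameters matter for the vectorization TODOs above; Vec4 and apply_elementwise are illustrative stand-ins, not the ExecuTorch API.

#include <cstddef>
#include <iostream>

// Toy stand-in for a SIMD type like at::vec::Vectorized<float>: same
// element-wise operations, different type, so only a generic callable
// can accept both it and plain float.
struct Vec4 {
  float v[4];
  Vec4 operator*(float s) const {
    Vec4 r;
    for (int i = 0; i < 4; ++i) {
      r.v[i] = v[i] * s;
    }
    return r;
  }
};

// Toy elementwise applier: instantiates `op` twice, once for the
// vectorized main loop and once for the scalar tail. A lambda with a
// typed float parameter would fail to compile for the Vec4 call, which
// is the situation the "rewrite this to be vectorization-capable"
// TODOs flag.
template <typename Op>
void apply_elementwise(Op op, const float* in, float* out, std::size_t n) {
  std::size_t i = 0;
  for (; i + 4 <= n; i += 4) {
    Vec4 x{{in[i], in[i + 1], in[i + 2], in[i + 3]}};
    const Vec4 y = op(x);  // requires op to accept Vec4
    for (int k = 0; k < 4; ++k) {
      out[i + k] = y.v[k];
    }
  }
  for (; i < n; ++i) {
    out[i] = op(in[i]);  // scalar tail: requires op to accept float
  }
}

int main() {
  const float alpha = 2.0f;
  const float in[6] = {1, 2, 3, 4, 5, 6};
  float out[6] = {};
  // Generic lambda, like the rewritten op_add/op_mul/op_sub lambdas:
  // `const auto` lets one body instantiate for float and Vec4 alike.
  apply_elementwise([alpha](const auto x) { return x * alpha; }, in, out, 6);
  for (const float f : out) {
    std::cout << f << ' ';
  }
  std::cout << '\n';
  return 0;
}

The same shape explains why the TODO-marked lambdas were left with typed parameters for now: bodies that branch per element, write captured flags, or call scalar-only helpers cannot simply be re-declared with `auto` until those helpers grow vector-capable overloads.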