From 31a49e0212d88b6b16979f926f437beed9cde1dc Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:07 -0700 Subject: [PATCH 1/3] Update [ghstack-poisoned] --- CMakeLists.txt | 9 +++++++-- kernels/optimized/CMakeLists.txt | 1 + kernels/portable/CMakeLists.txt | 2 ++ runtime/core/portable_type/c10/c10/targets.bzl | 3 ++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a53b8a6e2a..8fe08a2c25e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch) # Real integrations should supply their own YAML file that only lists the # operators necessary for the models that will run. # +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + # find pytorch lib here to make it available to all + # sub-directories. Find it before including portable so that + # optimized_portable_kernels can use it. + find_package_torch_headers() +endif() + if(BUILD_EXECUTORCH_PORTABLE_OPS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) endif() if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - # find pytorch lib here to make it available to all sub-directories - find_package_torch_headers() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized) endif() diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 7cba9e91fe5..693be68c35e 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}") list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(optimized_kernels ${_optimized_kernels__srcs}) target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft") +target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS) target_link_libraries( optimized_kernels PUBLIC executorch_core cpublas extension_threadpool ) diff --git 
a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index e27ba12ac0d..53ad88880d6 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -73,6 +73,8 @@ if(BUILD_OPTIMIZED_PORTABLE_KERNELS) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) + target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS}) + target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS) install( TARGETS optimized_portable_kernels DESTINATION lib diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index a727cb19ac1..b8883c75bfe 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -73,6 +73,7 @@ def define_common_targets(): # -Wmacro-redefined, and we only care about getting # reasonable vectorization and Sleef support. "-DCPU_CAPABILITY_AVX2", + "-DET_USE_PYTORCH_HEADERS", "-DHAVE_AVX2_CPU_DEFINITION", "-DSTANDALONE_TORCH_HEADER", ] + get_sleef_preprocessor_flags(), @@ -87,5 +88,5 @@ def define_common_targets(): # linker failure. 
"ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DSTANDALONE_TORCH_HEADER"], + }) + ["-DET_USE_PYTORCH_HEADERS", "-DSTANDALONE_TORCH_HEADER"], ) From 9fcd8857fb0e00bee0b401f5e25f1fd081fe3c9c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:12 -0700 Subject: [PATCH 2/3] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 11 ---------- kernels/portable/cpu/util/elementwise_util.h | 23 ++++++++++++++++---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 2bbd5de4577..59b82cdc51b 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -86,12 +86,6 @@ load_to_common_fn get_load_to_common_fn_bool_or_byte( template load_to_common_fn get_load_to_common_fn_same_as_compute( const Tensor& t) { - constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::load_and_convert; } @@ -180,11 +174,6 @@ template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::convert_and_store; } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index f5932069005..021ec42bf27 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -51,6 +51,13 @@ inline int64_t scalar_to(const Scalar& s) { } namespace internal { +template +using ignore_first_yield_second = T; + +template +using op_call_result = + std::invoke_result_t...>; + 
template < typename CTYPE_COMMON, const char* op_name, @@ -89,9 +96,16 @@ inline void apply_elementwise_fn( inputs.first->element_size(), })...}; - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); + // NOTE: the result of compute_fun is not necessarily CTYPE_COMMON! + // For example, consider the possibility that compute_fun is a + // trigonometric function like acos, the common input type is bool, + // and the output type is float -- we would truncate acos(0) ~= 1.57 + // to just 1. Conveniently, it costs us nothing at runtime to handle + // this correctly. + const auto store_compute_result_to_out = + internal::get_store_common_to_tensor_fn< + op_call_result, + op_name>(out, out_dtypes); char* const data_out = reinterpret_cast(out.mutable_data_ptr()); const auto out_element_size = out.element_size(); @@ -114,7 +128,8 @@ inline void apply_elementwise_fn( .data_ptr[indexes[idx + 1] * input_info.element_size]); } auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[indexes[0] * out_element_size]); + store_compute_result_to_out( + result, &data_out[indexes[0] * out_element_size]); } }); } From 40c1b1be46d2ad91f6ca39fe3008d9b685d3f45b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 09:58:10 -0700 Subject: [PATCH 3/3] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 76579301850..1f0e3403e82 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -173,27 +173,13 @@ get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { - return internal::convert_and_store; + // We already validate tensor types earlier in the process, so at 
+ // this phase, treat same_as_compute the same as our widest + // SupportedTensorDtypes set. + return get_store_common_to_tensor_fn_realhbf16(t); } -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> -store_common_to_tensor_fn -get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { - void (*result)(CTYPE_COMMON, void*) = nullptr; - ET_SWITCH_THREE_TYPES( - Float, Half, BFloat16, t.scalar_type(), unused, op_name, CTYPE, [&]() { - result = internal::convert_and_store; - }); - return result; -} - -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> +template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { return get_store_common_to_tensor_fn_same_as_compute(