diff --git a/CMakeLists.txt b/CMakeLists.txt
index b34ed07a10e..6dbb66afdaa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # Find the PyTorch headers here to make them available to all
+  # sub-directories. Find them before including portable so that
+  # optimized_portable_kernels can use them.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 7cba9e91fe5..693be68c35e 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index e27ba12ac0d..edea045d65f 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -66,13 +66,13 @@ gen_operators_lib(
 # Portable kernels support optional parallelization (and, in the
 # future, perhaps other performance features). If support is present,
 # produce an optimized version.
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
-
-if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   add_library(optimized_portable_kernels ${_portable_kernels__srcs})
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
+  target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
+  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
   install(
     TARGETS optimized_portable_kernels
     DESTINATION lib
diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h
index 206be87f98e..4e0718bc522 100644
--- a/kernels/portable/cpu/util/elementwise_util.h
+++ b/kernels/portable/cpu/util/elementwise_util.h
@@ -51,12 +51,8 @@ inline int64_t scalar_to(const Scalar& s) {
 }
 
 namespace internal {
-template <
-    typename CTYPE_COMPUTE,
-    const char* op_name,
-    typename Op,
-    typename... Args>
-inline void apply_elementwise_fn(
+template <typename CTYPE_COMPUTE, typename Op, typename... Args>
+inline bool validate_elementwise_fn_inputs(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
     const Tensor& out,
@@ -65,7 +61,6 @@
   static_assert(
       (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
        ...));
-  constexpr auto kNumInputs = sizeof...(inputs);
   constexpr auto compute_type = CppTypeToScalarType<CTYPE_COMPUTE>::value;
   const auto check_input_dtype = [](auto input, auto compute_type) {
     return internal::check_tensor_dtype(
@@ -75,7 +70,30 @@
       ctx,
       (check_input_dtype(inputs, compute_type) && ...) &&
           internal::check_tensor_dtype(out, out_dtypes, compute_type),
-      InvalidArgument, );
+      InvalidArgument,
+      false);
+
+  return true;
+}
+
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    typename Op,
+    typename... Args>
+inline void apply_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes,
+    Args... inputs) {
+  const bool inputs_valid = validate_elementwise_fn_inputs<CTYPE_COMPUTE>(
+      compute_fun, ctx, out, out_dtypes, inputs...);
+  if (!inputs_valid) {
+    return;
+  }
+
+  constexpr auto kNumInputs = sizeof...(inputs);
 
   struct InputInfo {
     load_to_compute_fn<CTYPE_COMPUTE> load_to_compute;
@@ -120,6 +138,7 @@
   });
 }
 
+/// DEPRECATED: prefer the variant with out_dtypes in the template argument.
 template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
 inline void apply_unitensor_elementwise_fn(
     const Op& compute_fun,
@@ -132,12 +151,53 @@
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
+inline void apply_unitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+      compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+inline void apply_bitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
 /**
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
  */
-template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
 inline void apply_bitensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
@@ -145,6 +205,29 @@ inline void apply_bitensor_elementwise_fn(
     SupportedTensorDtypes a_dtypes,
     const Tensor& b,
     SupportedTensorDtypes b_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+inline void apply_tritensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& c,
+    SupportedTensorDtypes c_dtypes,
     const Tensor& out,
     SupportedTensorDtypes out_dtypes) {
   internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
@@ -153,7 +236,8 @@
       out,
       out_dtypes,
       std::make_pair(&a, a_dtypes),
-      std::make_pair(&b, b_dtypes));
+      std::make_pair(&b, b_dtypes),
+      std::make_pair(&c, c_dtypes));
 }
 
 /**
@@ -176,7 +260,11 @@
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMPUTE, op_name>.
  */
-template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
 inline void apply_tritensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
@@ -186,8 +274,7 @@ inline void apply_tritensor_elementwise_fn(
     SupportedTensorDtypes b_dtypes,
     const Tensor& c,
     SupportedTensorDtypes c_dtypes,
-    const Tensor& out,
-    SupportedTensorDtypes out_dtypes) {
+    const Tensor& out) {
   internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
       compute_fun,
       ctx,
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index dbe35f8eefd..d9d72b5be3f 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -73,6 +73,7 @@ def define_common_targets():
             # -Wmacro-redefined, and we only care about getting
             # reasonable vectorization and Sleef support.
             "-DCPU_CAPABILITY_AVX2",
+            "-DET_USE_PYTORCH_HEADERS",
             "-DHAVE_AVX2_CPU_DEFINITION",
             "-DSTANDALONE_TORCH_HEADER",
         ] + get_sleef_preprocessor_flags(),
@@ -86,5 +87,5 @@
             # linker failure.
             "ovr_config//cpu:arm64": get_sleef_preprocessor_flags(),
             "DEFAULT": [],
-        }) + ["-DSTANDALONE_TORCH_HEADER"],
+        }) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]),
     )
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3932f1097e1..812e8e4a67a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release")
   target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections")
 endif()
 
+#
+# size_test_all_optimized_ops: binary with optimized ops and no delegate backend
+#
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  add_executable(size_test_all_optimized_ops ${_size_test__srcs})
+  target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+  target_link_libraries(
+    size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib)
+  if(CMAKE_BUILD_TYPE STREQUAL "Release")
+    target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections")
+  endif()
+endif()
+
 # Print all summary
 executorch_print_configuration_summary()
diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh
new file mode 100644
index 00000000000..181c2ce617d
--- /dev/null
+++ b/test/build_optimized_size_test.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Unlike build_size_test.sh, this script:
+# - does not attempt to disable exceptions and RTTI
+# - as a consequence, is able to build optimized kernels
+# - uses MinSizeRel builds
+# - is not currently intended to run in CI
+# - sets -g to make it easier to use tools like bloaty to investigate size
+
+set -e
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"
+
+cmake_install_executorch_lib() {
+  echo "Installing libexecutorch.a"
+  clean_executorch_install_folders
+  update_tokenizers_git_submodule
+  CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \
+          -DCMAKE_CXX_STANDARD_REQUIRED=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=MinSizeRel \
+          -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DOPTIMIZE_SIZE=ON \
+          -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+          -Bcmake-out .
+  cmake --build cmake-out -j9 --target install --config MinSizeRel
+}
+
+test_cmake_size_test() {
+  CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
+
+  echo "Build size test"
+  cmake --build cmake-out/test -j9 --config MinSizeRel
+
+  echo 'ExecuTorch with no ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test
+
+  echo 'ExecuTorch with portable ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_ops
+
+  echo 'ExecuTorch with optimized ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_optimized_ops
+}
+
+if [[ -z $PYTHON_EXECUTABLE ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+cmake_install_executorch_lib
+test_cmake_size_test
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 49aa6cf08af..56c7fa2d7d4 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -149,7 +149,7 @@ endif()
 
 if(TARGET coremldelegate)
   set_target_properties(
     coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
-                              "coreml_inmemoryfs;coreml_util"
+    "coreml_inmemoryfs;coreml_util"
   )
 endif()
@@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
 endif()
 if(TARGET extension_threadpool)
   target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+  set_target_properties(
+    extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES
+    "cpuinfo;pthreadpool"
+  )
 endif()
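---

Notes (reviewer commentary, not part of the patch):

The new apply_unitensor/bitensor/tritensor_elementwise_fn overloads move
out_dtypes from a runtime parameter into the template argument list, so the
output-store path can be chosen at compile time; the runtime-parameter forms
remain as DEPRECATED shims. A minimal sketch of a caller against the new
interface -- the op name, the lambda, the dtype sets, and the surrounding
function are illustrative assumptions, not taken from this diff:

    // Hypothetical kernel body; only the shape of the new
    // apply_unitensor_elementwise_fn overload comes from this patch.
    Tensor& my_square_out(
        KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
      static constexpr const char op_name[] = "my_square.out";
      const ScalarType compute_type = in.scalar_type();
      ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
        utils::apply_unitensor_elementwise_fn<
            CTYPE_COMPUTE,
            op_name,
            utils::SupportedTensorDtypes::SAME_AS_COMMON>(
            [](const CTYPE_COMPUTE val) { return val * val; },  // per element
            ctx,
            in,
            utils::SupportedTensorDtypes::REALHBBF16,
            out);
      });
      return out;
    }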
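On the build side, optimized_portable_kernels is now gated on both
EXECUTORCH_BUILD_PTHREADPOOL and EXECUTORCH_BUILD_KERNELS_OPTIMIZED (the old
set()/if() pair always evaluated true, because it tested the literal string
rather than the flag's value), and both optimized kernel targets now see
TORCH_INCLUDE_DIRS plus the ET_USE_PYTORCH_HEADERS define. An illustrative
configure line exercising the new path (flag values are assumptions, not
mandated by the patch):

    cmake -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
          -DEXECUTORCH_BUILD_PTHREADPOOL=ON \
          -Bcmake-out .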
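build_optimized_size_test.sh compiles with -g precisely so that size tools
can attribute bytes. A typical follow-up, assuming bloaty is installed (the
binary paths match the script's output; the command itself is not part of
the patch):

    # Size diff: optimized-ops binary vs. the portable-ops binary.
    bloaty cmake-out/test/size_test_all_optimized_ops \
        -- cmake-out/test/size_test_all_ops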