From 9fcd8857fb0e00bee0b401f5e25f1fd081fe3c9c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:12 -0700 Subject: [PATCH 01/10] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 11 ---------- kernels/portable/cpu/util/elementwise_util.h | 23 ++++++++++++++++---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 2bbd5de4577..59b82cdc51b 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -86,12 +86,6 @@ load_to_common_fn get_load_to_common_fn_bool_or_byte( template load_to_common_fn get_load_to_common_fn_same_as_compute( const Tensor& t) { - constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::load_and_convert; } @@ -180,11 +174,6 @@ template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::convert_and_store; } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index f5932069005..021ec42bf27 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -51,6 +51,13 @@ inline int64_t scalar_to(const Scalar& s) { } namespace internal { +template +using ignore_first_yield_second = T; + +template +using op_call_result = + std::invoke_result_t...>; + template < typename CTYPE_COMMON, const char* op_name, @@ -89,9 +96,16 @@ inline void apply_elementwise_fn( inputs.first->element_size(), })...}; - const auto store_common_to_out = 
- internal::get_store_common_to_tensor_fn( - out, out_dtypes); + // NOTE: the result of compute_fun is not necessarily CTYPE_COMMON! + // For example, consider the possibility that compute_fun is a + // trigonometric function like acos, the common input type is bool, + // and the output type is float -- we would truncate acos(0) ~= 1.57 + // to just 1. Conveniently, it costs us nothing at runtime to handle + // this correctly. + const auto store_compute_result_to_out = + internal::get_store_common_to_tensor_fn< + op_call_result, + op_name>(out, out_dtypes); char* const data_out = reinterpret_cast(out.mutable_data_ptr()); const auto out_element_size = out.element_size(); @@ -114,7 +128,8 @@ inline void apply_elementwise_fn( .data_ptr[indexes[idx + 1] * input_info.element_size]); } auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[indexes[0] * out_element_size]); + store_compute_result_to_out( + result, &data_out[indexes[0] * out_element_size]); } }); } From 29d6de9d2e63b567e242aea0b7949d7250f12b34 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:16 -0700 Subject: [PATCH 02/10] Update [ghstack-poisoned] --- .../cpu/pattern/unary_ufunc_realh.cpp | 19 ++++--- .../pattern/unary_ufunc_realhb_to_bool.cpp | 26 +++++----- .../unary_ufunc_realhbbf16_to_floathbf16.cpp | 27 +++++----- kernels/portable/cpu/util/dtype_util.cpp | 4 ++ kernels/portable/cpu/util/dtype_util.h | 50 +++++++++++++++++++ 5 files changed, 94 insertions(+), 32 deletions(-) diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp index 16d847ace31..f7050e8410b 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp @@ -7,7 +7,7 @@ */ #include -#include +#include #include namespace torch { @@ -36,12 +36,19 @@ Tensor& unary_ufunc_realh( ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); 
- ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] { - apply_unary_map_fn( + // TODO: this is broken for dtype_selective_build: this was + // __func__, which isn't the operator name. + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "unary_ufunc_realh"; + + ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE val_in) { return static_cast(fn(val_in)); }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); + ctx, + in, + utils::SupportedTensorDtypes::REALH, + out, + utils::SupportedTensorDtypes::SAME_AS_COMMON); }); return out; diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp index 367137ad02c..5a7332efc07 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp @@ -7,7 +7,7 @@ */ #include -#include +#include #include namespace torch { @@ -30,25 +30,23 @@ Tensor& unary_ufunc_realhb_to_bool( out, "Failed to resize output tensor."); - ET_KERNEL_CHECK_MSG( - ctx, - out.scalar_type() == executorch::aten::ScalarType::Bool, - InvalidArgument, - out, - "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.", - static_cast(out.scalar_type())); - ET_KERNEL_CHECK( ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); const auto in_type = in.scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { - apply_unary_map_fn( + // TODO: this is broken for dtype_selective_build: this was + // __func__, which isn't the operator name. 
+ // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "unary_ufunc_realhb_to_bool"; + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] { + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); + ctx, + in, + utils::SupportedTensorDtypes::REALHBBF16, + out, + utils::SupportedTensorDtypes::BOOL); }); return out; diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp index 602b5b1bfd2..3dcdbd4050c 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp @@ -7,7 +7,7 @@ */ #include -#include +#include #include namespace torch { @@ -38,17 +38,20 @@ Tensor& unary_ufunc_realhbbf16_to_floathbf16( const auto in_type = in.scalar_type(); const auto out_type = out.scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { - ET_SWITCH_FLOATHBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] { - apply_unary_map_fn( - [fn](const CTYPE_IN val_in) { - CTYPE_OUT xi = static_cast(val_in); - return static_cast(fn(xi)); - }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); - }); + // TODO: this is broken for dtype_selective_build: this was + // __func__, which isn't the operator name. 
+ // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = + "unary_ufunc_realhbbf16_to_floathbf16"; + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] { + utils::apply_unitensor_elementwise_fn( + [fn](const CTYPE_IN val_in) { return fn(val_in); }, + ctx, + in, + utils::SupportedTensorDtypes::REALHBBF16, + out, + utils::SupportedTensorDtypes::FLOATHBF16); }); return out; diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index d240b9f83bc..81b1b203a54 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -23,10 +23,14 @@ bool check_tensor_dtype( return executorch::runtime::tensor_is_realhbbf16_type(t); case SupportedTensorDtypes::REALHBF16: return executorch::runtime::tensor_is_realhbf16_type(t); + case SupportedTensorDtypes::REALH: + return executorch::runtime::tensor_is_realh_type(t); case SupportedTensorDtypes::FLOATHBF16: return executorch::runtime::tensor_is_floating_type(t); case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); + case SupportedTensorDtypes::BOOL: + return executorch::runtime::tensor_is_type(t, ScalarType::Bool); case SupportedTensorDtypes::BOOL_OR_BYTE: return (executorch::runtime::tensor_is_type( t, ScalarType::Bool, ScalarType::Byte)); diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 59b82cdc51b..19bee220005 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -51,6 +51,15 @@ load_to_common_fn get_load_to_common_fn_realhbf16( return result; } +template +load_to_common_fn get_load_to_common_fn_realh(const Tensor& t) { + CTYPE_COMMON (*result)(const void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::load_and_convert; + }); + return result; +} + template load_to_common_fn 
get_load_to_common_fn_floathbf16( const Tensor& t) { @@ -72,6 +81,16 @@ load_to_common_fn get_load_to_common_fn_intb(const Tensor& t) { return result; } +template +load_to_common_fn get_load_to_common_fn_bool(const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::load_and_convert; +} + template load_to_common_fn get_load_to_common_fn_bool_or_byte( const Tensor& t) { @@ -137,6 +156,16 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_realhbf16( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_realh( + const Tensor& t) { + void (*result)(CTYPE_COMMON, void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::convert_and_store; + }); + return result; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_floathbf16(const Tensor& t) { @@ -159,6 +188,17 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_intb( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_bool( + const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::convert_and_store; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { @@ -206,8 +246,10 @@ get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { enum class SupportedTensorDtypes { REALHBBF16, REALHBF16, + REALH, FLOATHBF16, INTB, + BOOL, BOOL_OR_BYTE, SAME_AS_COMPUTE, SAME_AS_COMMON, @@ -224,10 +266,14 @@ load_to_common_fn get_load_to_common_fn( return get_load_to_common_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_load_to_common_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_load_to_common_fn_realh(t); case 
SupportedTensorDtypes::FLOATHBF16: return get_load_to_common_fn_realhbf16(t); case SupportedTensorDtypes::INTB: return get_load_to_common_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_load_to_common_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_load_to_common_fn_bool_or_byte(t); case SupportedTensorDtypes::SAME_AS_COMPUTE: @@ -248,10 +294,14 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn( return get_store_common_to_tensor_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_store_common_to_tensor_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_store_common_to_tensor_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_store_common_to_tensor_fn_floathbf16(t); case SupportedTensorDtypes::INTB: return get_store_common_to_tensor_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_store_common_to_tensor_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_store_common_to_tensor_fn_bool_or_byte( t); From 79b908c798961ff453b71594793586b309641702 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:21 -0700 Subject: [PATCH 03/10] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 101 +++++++++++++++++-- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 021ec42bf27..aa9883a0b26 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -60,10 +60,9 @@ using op_call_result = template < typename CTYPE_COMMON, - const char* op_name, typename Op, - typename... Args> -inline void apply_elementwise_fn( + typename... 
Args> +inline bool validate_elementwise_fn_inputs( const Op& compute_fun, KernelRuntimeContext& ctx, const Tensor& out, @@ -72,7 +71,6 @@ inline void apply_elementwise_fn( static_assert( (std::is_same_v> && ...)); - constexpr auto kNumInputs = sizeof...(inputs); constexpr auto compute_type = CppTypeToScalarType::value; const auto check_input_dtype = [](auto input, auto compute_type) { return internal::check_tensor_dtype( @@ -82,7 +80,33 @@ inline void apply_elementwise_fn( ctx, (check_input_dtype(inputs, compute_type) && ...) && internal::check_tensor_dtype(out, out_dtypes, compute_type), - InvalidArgument, ); + InvalidArgument, false); + + return true; +} + +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + SupportedTensorDtypes out_dtypes, + Args... inputs) { + const bool inputs_valid = validate_elementwise_fn_inputs( + compute_fun, + ctx, + out, + out_dtypes, + inputs...); + if (!inputs_valid) { + return; + } + + constexpr auto kNumInputs = sizeof...(inputs); struct InputInfo { load_to_common_fn load_to_common; @@ -135,6 +159,7 @@ inline void apply_elementwise_fn( } } // namespace internal +/// DEPRECATED: prefer the variant with out_dtypes in the template argument. template inline void apply_unitensor_elementwise_fn( const Op& compute_fun, @@ -147,12 +172,45 @@ inline void apply_unitensor_elementwise_fn( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } +template +inline void apply_unitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& out) { + internal::apply_elementwise_fn( + compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); +} + +/** + * DEPRECATED: prefer the variant with out_dtypes in the template argument list. 
+ */ +template +inline void apply_bitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& b, + SupportedTensorDtypes b_dtypes, + const Tensor& out, + SupportedTensorDtypes out_dtypes) { + internal::apply_elementwise_fn( + compute_fun, + ctx, + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); +} + /** * Useful for bi-tensor elementwise operators. For each element of the inputs, * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. */ -template +template inline void apply_bitensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -160,6 +218,29 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes a_dtypes, const Tensor& b, SupportedTensorDtypes b_dtypes, + const Tensor& out) { + internal::apply_elementwise_fn( + compute_fun, + ctx, + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); +} + +/** + * DEPRECATED: prefer the variant with out_dtypes in the template argument list. + */ +template +inline void apply_tritensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& b, + SupportedTensorDtypes b_dtypes, + const Tensor& c, + SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { internal::apply_elementwise_fn( @@ -168,7 +249,8 @@ inline void apply_bitensor_elementwise_fn( out, out_dtypes, std::make_pair(&a, a_dtypes), - std::make_pair(&b, b_dtypes)); + std::make_pair(&b, b_dtypes), + std::make_pair(&c, c_dtypes)); } /** @@ -191,7 +273,7 @@ inline void apply_bitensor_elementwise_fn( * static constexpr const char op_name[] = "my_op"; * apply_ternary_elementwise_fn. 
*/ -template +template inline void apply_tritensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -201,8 +283,7 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& c, SupportedTensorDtypes c_dtypes, - const Tensor& out, - SupportedTensorDtypes out_dtypes) { + const Tensor& out) { internal::apply_elementwise_fn( compute_fun, ctx, From 40c1b1be46d2ad91f6ca39fe3008d9b685d3f45b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 09:58:10 -0700 Subject: [PATCH 04/10] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 76579301850..1f0e3403e82 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -173,27 +173,13 @@ get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { - return internal::convert_and_store; + // We already validate tensor types earlier in the process, so at + // this phase, treat same_as_compute the same as our widest + // SupportedTensorDtypes set. 
+ return get_store_common_to_tensor_fn_realhbf16(t); } -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> -store_common_to_tensor_fn -get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { - void (*result)(CTYPE_COMMON, void*) = nullptr; - ET_SWITCH_THREE_TYPES( - Float, Half, BFloat16, t.scalar_type(), unused, op_name, CTYPE, [&]() { - result = internal::convert_and_store; - }); - return result; -} - -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> +template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { return get_store_common_to_tensor_fn_same_as_compute( From 4553283773f0a5fb325a1a3eac54e26835327cbd Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 26 Mar 2025 16:20:19 -0700 Subject: [PATCH 05/10] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From 943ab82be758baadbafc4287756fa6dbe904f6df Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:22 -0700 Subject: [PATCH 06/10] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From f22d039d23db4103dc0697b638813ce2e4bc4a5f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:28 -0700 Subject: [PATCH 07/10] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops 
PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . 
+ cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test From 7f2bbdb098596d232cd1193ea76422308ab74dc3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:32:41 -0700 Subject: [PATCH 08/10] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From 9e42e93a96531d2daa189ac66b796aa515e56cd4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:32:46 -0700 Subject: [PATCH 09/10] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From 96d258eb5cb1601283f636f6dd8a046ae5c9e4ae Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:33:00 -0700 Subject: [PATCH 10/10] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops 
PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . 
+ cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test