From 31a49e0212d88b6b16979f926f437beed9cde1dc Mon Sep 17 00:00:00 2001
From: Scott Wolchok 
Date: Tue, 18 Mar 2025 17:32:07 -0700
Subject: [PATCH 01/27] Update [ghstack-poisoned]

---
 CMakeLists.txt                                 | 9 +++++++--
 kernels/optimized/CMakeLists.txt               | 1 +
 kernels/portable/CMakeLists.txt                | 2 ++
 runtime/core/portable_type/c10/c10/targets.bzl | 3 ++-
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3a53b8a6e2a..8fe08a2c25e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # Find the PyTorch lib here to make it available to all
+  # sub-directories. Find it before including portable so that
+  # optimized_portable_kernels can use it.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
 
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 7cba9e91fe5..693be68c35e 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index e27ba12ac0d..53ad88880d6 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -73,6 +73,8 @@ if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
+  target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
+  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
   install(
     TARGETS optimized_portable_kernels
     DESTINATION lib
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index a727cb19ac1..b8883c75bfe 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -73,6 +73,7 @@ def define_common_targets():
                 # -Wmacro-redefined, and we only care about getting
                 # reasonable vectorization and Sleef support.
                 "-DCPU_CAPABILITY_AVX2",
+                "-DET_USE_PYTORCH_HEADERS",
                 "-DHAVE_AVX2_CPU_DEFINITION",
                 "-DSTANDALONE_TORCH_HEADER",
             ] + get_sleef_preprocessor_flags(),
@@ -87,5 +88,5 @@ def define_common_targets():
             # linker failure.
"ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DSTANDALONE_TORCH_HEADER"], + }) + ["-DET_USE_PYTORCH_HEADERS", "-DSTANDALONE_TORCH_HEADER"], ) From 9fcd8857fb0e00bee0b401f5e25f1fd081fe3c9c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:12 -0700 Subject: [PATCH 02/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 11 ---------- kernels/portable/cpu/util/elementwise_util.h | 23 ++++++++++++++++---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 2bbd5de4577..59b82cdc51b 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -86,12 +86,6 @@ load_to_common_fn get_load_to_common_fn_bool_or_byte( template load_to_common_fn get_load_to_common_fn_same_as_compute( const Tensor& t) { - constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::load_and_convert; } @@ -180,11 +174,6 @@ template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::convert_and_store; } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index f5932069005..021ec42bf27 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -51,6 +51,13 @@ inline int64_t scalar_to(const Scalar& s) { } namespace internal { +template +using ignore_first_yield_second = T; + +template +using op_call_result = + std::invoke_result_t...>; + template < typename CTYPE_COMMON, const char* op_name, @@ -89,9 +96,16 @@ inline void apply_elementwise_fn( inputs.first->element_size(), })...}; - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); + // NOTE: the result of compute_fun is not necessarily CTYPE_COMMON! + // For example, consider the possibility that compute_fun is a + // trigonometric function like acos, the common input type is bool, + // and the output type is float -- we would truncate acos(0) ~= 1.67 + // to just 1. Conveniently, it costs us nothing at runtime to handle + // this correctly. 
+  const auto store_compute_result_to_out =
+      internal::get_store_common_to_tensor_fn<
+          op_call_result<CTYPE_COMMON, Op, Args...>,
+          op_name>(out, out_dtypes);
   char* const data_out = reinterpret_cast<char*>(out.mutable_data_ptr());
   const auto out_element_size = out.element_size();
 
@@ -114,7 +128,8 @@ inline void apply_elementwise_fn(
               .data_ptr[indexes[idx + 1] * input_info.element_size]);
         }
         auto result = std::apply(compute_fun, loaded_inputs);
-        store_common_to_out(result, &data_out[indexes[0] * out_element_size]);
+        store_compute_result_to_out(
+            result, &data_out[indexes[0] * out_element_size]);
       }
     });
 }

From 29d6de9d2e63b567e242aea0b7949d7250f12b34 Mon Sep 17 00:00:00 2001
From: Scott Wolchok 
Date: Tue, 18 Mar 2025 17:32:16 -0700
Subject: [PATCH 03/27] Update [ghstack-poisoned]

---
 .../cpu/pattern/unary_ufunc_realh.cpp         | 19 ++++---
 .../pattern/unary_ufunc_realhb_to_bool.cpp    | 26 +++++-----
 .../unary_ufunc_realhbbf16_to_floathbf16.cpp  | 27 +++++-----
 kernels/portable/cpu/util/dtype_util.cpp      |  4 ++
 kernels/portable/cpu/util/dtype_util.h        | 50 +++++++++++++++++++
 5 files changed, 94 insertions(+), 32 deletions(-)

diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
index 16d847ace31..f7050e8410b 100644
--- a/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
+++ b/kernels/portable/cpu/pattern/unary_ufunc_realh.cpp
@@ -7,7 +7,7 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -36,12 +36,19 @@ Tensor& unary_ufunc_realh(
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
-  ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, __func__, CTYPE, [&] {
-    apply_unary_map_fn(
+  // TODO: this is broken for dtype_selective_build: this was
+  // __func__, which isn't the operator name.
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "unary_ufunc_realh";
+
+  ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] {
+    utils::apply_unitensor_elementwise_fn<CTYPE, op_name>(
         [fn](const CTYPE val_in) { return static_cast<CTYPE>(fn(val_in)); },
-        in.const_data_ptr<CTYPE>(),
-        out.mutable_data_ptr<CTYPE>(),
-        in.numel());
+        ctx,
+        in,
+        utils::SupportedTensorDtypes::REALH,
+        out,
+        utils::SupportedTensorDtypes::SAME_AS_COMMON);
   });
 
   return out;
diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
index 367137ad02c..5a7332efc07 100644
--- a/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
+++ b/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_bool.cpp
@@ -7,7 +7,7 @@
  */
 
 #include <executorch/kernels/portable/cpu/pattern/pattern.h>
-#include <executorch/kernels/portable/cpu/util/functional_util.h>
+#include <executorch/kernels/portable/cpu/util/elementwise_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -30,25 +30,23 @@ Tensor& unary_ufunc_realhb_to_bool(
       out,
       "Failed to resize output tensor.");
 
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      out.scalar_type() == executorch::aten::ScalarType::Bool,
-      InvalidArgument,
-      out,
-      "Expected out tensor to have dtype Bool, but got %" PRId8 " instead.",
-      static_cast<int8_t>(out.scalar_type()));
-
   ET_KERNEL_CHECK(
       ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out);
 
   const auto in_type = in.scalar_type();
 
-  ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] {
-    apply_unary_map_fn(
+  // TODO: this is broken for dtype_selective_build: this was
+  // __func__, which isn't the operator name.
+ // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "unary_ufunc_realhb_to_bool"; + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] { + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); + ctx, + in, + utils::SupportedTensorDtypes::REALHBBF16, + out, + utils::SupportedTensorDtypes::BOOL); }); return out; diff --git a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp index 602b5b1bfd2..3dcdbd4050c 100644 --- a/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp +++ b/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp @@ -7,7 +7,7 @@ */ #include -#include +#include #include namespace torch { @@ -38,17 +38,20 @@ Tensor& unary_ufunc_realhbbf16_to_floathbf16( const auto in_type = in.scalar_type(); const auto out_type = out.scalar_type(); - ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, __func__, CTYPE_IN, [&] { - ET_SWITCH_FLOATHBF16_TYPES(out_type, ctx, __func__, CTYPE_OUT, [&] { - apply_unary_map_fn( - [fn](const CTYPE_IN val_in) { - CTYPE_OUT xi = static_cast(val_in); - return static_cast(fn(xi)); - }, - in.const_data_ptr(), - out.mutable_data_ptr(), - in.numel()); - }); + // TODO: this is broken for dtype_selective_build: this was + // __func__, which isn't the operator name. + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = + "unary_ufunc_realhbbf16_to_floathbf16"; + + ET_SWITCH_REALHBBF16_TYPES(in_type, ctx, op_name, CTYPE_IN, [&] { + utils::apply_unitensor_elementwise_fn( + [fn](const CTYPE_IN val_in) { return fn(val_in); }, + ctx, + in, + utils::SupportedTensorDtypes::REALHBBF16, + out, + utils::SupportedTensorDtypes::FLOATHBF16); }); return out; diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index d240b9f83bc..81b1b203a54 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -23,10 +23,14 @@ bool check_tensor_dtype( return executorch::runtime::tensor_is_realhbbf16_type(t); case SupportedTensorDtypes::REALHBF16: return executorch::runtime::tensor_is_realhbf16_type(t); + case SupportedTensorDtypes::REALH: + return executorch::runtime::tensor_is_realh_type(t); case SupportedTensorDtypes::FLOATHBF16: return executorch::runtime::tensor_is_floating_type(t); case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); + case SupportedTensorDtypes::BOOL: + return executorch::runtime::tensor_is_type(t, ScalarType::Bool); case SupportedTensorDtypes::BOOL_OR_BYTE: return (executorch::runtime::tensor_is_type( t, ScalarType::Bool, ScalarType::Byte)); diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 59b82cdc51b..19bee220005 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -51,6 +51,15 @@ load_to_common_fn get_load_to_common_fn_realhbf16( return result; } +template +load_to_common_fn get_load_to_common_fn_realh(const Tensor& t) { + CTYPE_COMMON (*result)(const void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::load_and_convert; + }); + return result; +} + template load_to_common_fn get_load_to_common_fn_floathbf16( const Tensor& t) { @@ -72,6 +81,16 @@ load_to_common_fn 
get_load_to_common_fn_intb(const Tensor& t) { return result; } +template +load_to_common_fn get_load_to_common_fn_bool(const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::load_and_convert; +} + template load_to_common_fn get_load_to_common_fn_bool_or_byte( const Tensor& t) { @@ -137,6 +156,16 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_realhbf16( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_realh( + const Tensor& t) { + void (*result)(CTYPE_COMMON, void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::convert_and_store; + }); + return result; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_floathbf16(const Tensor& t) { @@ -159,6 +188,17 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_intb( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_bool( + const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::convert_and_store; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { @@ -206,8 +246,10 @@ get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { enum class SupportedTensorDtypes { REALHBBF16, REALHBF16, + REALH, FLOATHBF16, INTB, + BOOL, BOOL_OR_BYTE, SAME_AS_COMPUTE, SAME_AS_COMMON, @@ -224,10 +266,14 @@ load_to_common_fn get_load_to_common_fn( return get_load_to_common_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_load_to_common_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_load_to_common_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_load_to_common_fn_realhbf16(t); case SupportedTensorDtypes::INTB: return get_load_to_common_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_load_to_common_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_load_to_common_fn_bool_or_byte(t); case SupportedTensorDtypes::SAME_AS_COMPUTE: @@ -248,10 +294,14 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn( return get_store_common_to_tensor_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_store_common_to_tensor_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_store_common_to_tensor_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_store_common_to_tensor_fn_floathbf16(t); case SupportedTensorDtypes::INTB: return get_store_common_to_tensor_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_store_common_to_tensor_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_store_common_to_tensor_fn_bool_or_byte( t); From 79b908c798961ff453b71594793586b309641702 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:21 -0700 Subject: [PATCH 04/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 101 +++++++++++++++++-- 1 file changed, 91 insertions(+), 10 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 021ec42bf27..aa9883a0b26 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -60,10 +60,9 @@ using op_call_result = template < typename CTYPE_COMMON, - const char* op_name, 
typename Op, - typename... Args> -inline void apply_elementwise_fn( + typename... Args> +inline bool validate_elementwise_fn_inputs( const Op& compute_fun, KernelRuntimeContext& ctx, const Tensor& out, @@ -72,7 +71,6 @@ inline void apply_elementwise_fn( static_assert( (std::is_same_v> && ...)); - constexpr auto kNumInputs = sizeof...(inputs); constexpr auto compute_type = CppTypeToScalarType::value; const auto check_input_dtype = [](auto input, auto compute_type) { return internal::check_tensor_dtype( @@ -82,7 +80,33 @@ inline void apply_elementwise_fn( ctx, (check_input_dtype(inputs, compute_type) && ...) && internal::check_tensor_dtype(out, out_dtypes, compute_type), - InvalidArgument, ); + InvalidArgument, false); + + return true; +} + +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + SupportedTensorDtypes out_dtypes, + Args... inputs) { + const bool inputs_valid = validate_elementwise_fn_inputs( + compute_fun, + ctx, + out, + out_dtypes, + inputs...); + if (!inputs_valid) { + return; + } + + constexpr auto kNumInputs = sizeof...(inputs); struct InputInfo { load_to_common_fn load_to_common; @@ -135,6 +159,7 @@ inline void apply_elementwise_fn( } } // namespace internal +/// DEPRECATED: prefer the variant with out_dtypes in the template argument. template inline void apply_unitensor_elementwise_fn( const Op& compute_fun, @@ -147,12 +172,45 @@ inline void apply_unitensor_elementwise_fn( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } +template +inline void apply_unitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& out) { + internal::apply_elementwise_fn( + compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); +} + +/** + * DEPRECATED: prefer the variant with out_dtypes in the template argument list. + */ +template +inline void apply_bitensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& b, + SupportedTensorDtypes b_dtypes, + const Tensor& out, + SupportedTensorDtypes out_dtypes) { + internal::apply_elementwise_fn( + compute_fun, + ctx, + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); +} + /** * Useful for bi-tensor elementwise operators. For each element of the inputs, * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. */ -template +template inline void apply_bitensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -160,6 +218,29 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes a_dtypes, const Tensor& b, SupportedTensorDtypes b_dtypes, + const Tensor& out) { + internal::apply_elementwise_fn( + compute_fun, + ctx, + out, + out_dtypes, + std::make_pair(&a, a_dtypes), + std::make_pair(&b, b_dtypes)); +} + +/** + * DEPRECATED: prefer the variant with out_dtypes in the template argument list. 
+ */ +template +inline void apply_tritensor_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& a, + SupportedTensorDtypes a_dtypes, + const Tensor& b, + SupportedTensorDtypes b_dtypes, + const Tensor& c, + SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { internal::apply_elementwise_fn( @@ -168,7 +249,8 @@ inline void apply_bitensor_elementwise_fn( out, out_dtypes, std::make_pair(&a, a_dtypes), - std::make_pair(&b, b_dtypes)); + std::make_pair(&b, b_dtypes), + std::make_pair(&c, c_dtypes)); } /** @@ -191,7 +273,7 @@ inline void apply_bitensor_elementwise_fn( * static constexpr const char op_name[] = "my_op"; * apply_ternary_elementwise_fn. */ -template +template inline void apply_tritensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -201,8 +283,7 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& c, SupportedTensorDtypes c_dtypes, - const Tensor& out, - SupportedTensorDtypes out_dtypes) { + const Tensor& out) { internal::apply_elementwise_fn( compute_fun, ctx, From fd62a079438de94ac23de004c2a562e982d8689c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:25 -0700 Subject: [PATCH 05/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_mul.cpp | 8 +- kernels/portable/cpu/util/dtype_util.h | 22 +++ kernels/portable/cpu/util/elementwise_util.h | 143 +++++++++++++++---- 3 files changed, 146 insertions(+), 27 deletions(-) diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 1ee73d342ca..114e60ff171 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -52,7 +52,10 @@ Tensor& mul_out( out); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return val_a * val_b; }, @@ -61,8 +64,7 @@ Tensor& mul_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 19bee220005..9a75432e184 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -324,6 +324,28 @@ bool check_tensor_dtype( SupportedTensorDtypes dtypes, const ScalarType compute_type); +/// Return the one output type we are willing to emit specialized code +/// to handle, given a compute type of CTYPE_COMMON and supported +/// output types of out_dtypes. 
+template +inline constexpr ScalarType specialized_output_scalar_type( + SupportedTensorDtypes out_dtypes) { + switch (out_dtypes) { + case SupportedTensorDtypes::BOOL: + return ScalarType::Bool; + case SupportedTensorDtypes::BOOL_OR_BYTE: + return ScalarType::Bool; + case SupportedTensorDtypes::REALHBBF16: + case SupportedTensorDtypes::REALHBF16: + case SupportedTensorDtypes::REALH: + case SupportedTensorDtypes::FLOATHBF16: + case SupportedTensorDtypes::INTB: + case SupportedTensorDtypes::SAME_AS_COMPUTE: + case SupportedTensorDtypes::SAME_AS_COMMON: + return CppTypeToScalarType::value; + } +} + } // namespace internal } // namespace utils } // namespace native diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index aa9883a0b26..14bf9293957 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -60,8 +60,43 @@ using op_call_result = template < typename CTYPE_COMMON, + typename CTYPE_OUT, typename Op, - typename... Args> + typename... Args> +inline void dtype_specialized_elementwise_fn_impl( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + Args... inputs) { + constexpr auto kNumInputs = sizeof...(inputs); + ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMMON)) && ...)); + + std::array inputs_data_ptrs = { + inputs.first->template const_data_ptr()...}; + + CTYPE_OUT* const data_out = out.mutable_data_ptr(); + + ::executorch::extension::parallel_for( + 0, + out.numel(), + ::executorch::extension::internal::GRAIN_SIZE, + [&](const auto begin, const auto end) { + const auto range = + BroadcastIndexesRange(out, (*inputs.first)...); + auto begin_it = range.begin(); + begin_it += begin; + for (; (*begin_it)[0] < end; ++begin_it) { + const auto& indexes = *begin_it; + std::array loaded_inputs; + for (const auto idx : c10::irange(kNumInputs)) { + loaded_inputs[idx] = inputs_data_ptrs[idx][indexes[idx + 1]]; + } + data_out[indexes[0]] = std::apply(compute_fun, loaded_inputs); + } + }); +} + +template inline bool validate_elementwise_fn_inputs( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -80,7 +115,8 @@ inline bool validate_elementwise_fn_inputs( ctx, (check_input_dtype(inputs, compute_type) && ...) && internal::check_tensor_dtype(out, out_dtypes, compute_type), - InvalidArgument, false); + InvalidArgument, + false); return true; } @@ -90,22 +126,12 @@ template < const char* op_name, typename Op, typename... Args> -inline void apply_elementwise_fn( +inline void apply_elementwise_fn_generic_impl( const Op& compute_fun, KernelRuntimeContext& ctx, const Tensor& out, SupportedTensorDtypes out_dtypes, Args... inputs) { - const bool inputs_valid = validate_elementwise_fn_inputs( - compute_fun, - ctx, - out, - out_dtypes, - inputs...); - if (!inputs_valid) { - return; - } - constexpr auto kNumInputs = sizeof...(inputs); struct InputInfo { @@ -157,6 +183,65 @@ inline void apply_elementwise_fn( } }); } + +template < + typename CTYPE_COMMON, + const char* op_name, + typename Op, + typename... Args> +inline void apply_elementwise_fn_runtime_out_dtypes( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + SupportedTensorDtypes out_dtypes, + Args... 
inputs) { + const bool inputs_valid = validate_elementwise_fn_inputs( + compute_fun, ctx, out, out_dtypes, inputs...); + if (!inputs_valid) { + return; + } + + apply_elementwise_fn_generic_impl( + compute_fun, ctx, out, out_dtypes, inputs...); +} + +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op, + typename... Args> +inline void apply_elementwise_fn( + const Op& compute_fun, + KernelRuntimeContext& ctx, + const Tensor& out, + Args... inputs) { + const bool inputs_valid = validate_elementwise_fn_inputs( + compute_fun, ctx, out, out_dtypes, inputs...); + if (!inputs_valid) { + return; + } + + constexpr auto kNumInputs = sizeof...(inputs); + + constexpr auto compute_type = CppTypeToScalarType::value; + const bool all_inputs_compute_dtype = + ((inputs.first->scalar_type() == compute_type) && ...); + + constexpr ScalarType out_specialized_scalar_type = + specialized_output_scalar_type(out_dtypes); + if (all_inputs_compute_dtype && + out.scalar_type() == out_specialized_scalar_type) { + using CTYPE_OUT = + typename ScalarTypeToCppType::type; + dtype_specialized_elementwise_fn_impl( + compute_fun, ctx, out, inputs...); + return; + } + + apply_elementwise_fn_generic_impl( + compute_fun, ctx, out, out_dtypes, inputs...); +} } // namespace internal /// DEPRECATED: prefer the variant with out_dtypes in the template argument. @@ -168,18 +253,22 @@ inline void apply_unitensor_elementwise_fn( SupportedTensorDtypes a_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn_runtime_out_dtypes( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } -template +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op> inline void apply_unitensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, const Tensor& a, SupportedTensorDtypes a_dtypes, const Tensor& out) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn( compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes)); } @@ -196,7 +285,7 @@ inline void apply_bitensor_elementwise_fn( SupportedTensorDtypes b_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn_runtime_out_dtypes( compute_fun, ctx, out, @@ -210,7 +299,11 @@ inline void apply_bitensor_elementwise_fn( * perform a computation and write to the corresponding element of the output. * Tensor broadcasting is applied wherever it is required. */ -template +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op> inline void apply_bitensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -219,11 +312,10 @@ inline void apply_bitensor_elementwise_fn( const Tensor& b, SupportedTensorDtypes b_dtypes, const Tensor& out) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn( compute_fun, ctx, out, - out_dtypes, std::make_pair(&a, a_dtypes), std::make_pair(&b, b_dtypes)); } @@ -243,7 +335,7 @@ inline void apply_tritensor_elementwise_fn( SupportedTensorDtypes c_dtypes, const Tensor& out, SupportedTensorDtypes out_dtypes) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn_runtime_out_dtypes( compute_fun, ctx, out, @@ -273,7 +365,11 @@ inline void apply_tritensor_elementwise_fn( * static constexpr const char op_name[] = "my_op"; * apply_ternary_elementwise_fn. 
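 *
 * A minimal usage sketch (ours; the op name and lambda are illustrative
 * assumptions, not code from this patch):
 *
 *   static constexpr const char op_name[] = "my_op";
 *   utils::apply_tritensor_elementwise_fn<
 *       CTYPE_COMPUTE,
 *       op_name,
 *       utils::SupportedTensorDtypes::REALHBBF16>(
 *       [](const auto a, const auto b, const auto c) { return a * b + c; },
 *       ctx,
 *       a, utils::SupportedTensorDtypes::REALHBBF16,
 *       b, utils::SupportedTensorDtypes::REALHBBF16,
 *       c, utils::SupportedTensorDtypes::REALHBBF16,
 *       out);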
*/ -template +template < + typename CTYPE_COMMON, + const char* op_name, + SupportedTensorDtypes out_dtypes, + typename Op> inline void apply_tritensor_elementwise_fn( const Op& compute_fun, KernelRuntimeContext& ctx, @@ -284,11 +380,10 @@ inline void apply_tritensor_elementwise_fn( const Tensor& c, SupportedTensorDtypes c_dtypes, const Tensor& out) { - internal::apply_elementwise_fn( + internal::apply_elementwise_fn( compute_fun, ctx, out, - out_dtypes, std::make_pair(&a, a_dtypes), std::make_pair(&b, b_dtypes), std::make_pair(&c, c_dtypes)); From 40c1b1be46d2ad91f6ca39fe3008d9b685d3f45b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 09:58:10 -0700 Subject: [PATCH 06/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 76579301850..1f0e3403e82 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -173,27 +173,13 @@ get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { - return internal::convert_and_store; + // We already validate tensor types earlier in the process, so at + // this phase, treat same_as_compute the same as our widest + // SupportedTensorDtypes set. + return get_store_common_to_tensor_fn_realhbf16(t); } -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> -store_common_to_tensor_fn -get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { - void (*result)(CTYPE_COMMON, void*) = nullptr; - ET_SWITCH_THREE_TYPES( - Float, Half, BFloat16, t.scalar_type(), unused, op_name, CTYPE, [&]() { - result = internal::convert_and_store; - }); - return result; -} - -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> +template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { return get_store_common_to_tensor_fn_same_as_compute( From 8782a900668b190307a16a97e485b0c350b96e8f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 17:36:12 -0700 Subject: [PATCH 07/27] Update [ghstack-poisoned] --- kernels/optimized/cpu/binary_ops.h | 25 +---------------- kernels/optimized/cpu/targets.bzl | 5 +++- .../cpu/util/broadcast_indexes_range.h | 27 ++++++++++++++++++- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/kernels/optimized/cpu/binary_ops.h b/kernels/optimized/cpu/binary_ops.h index f59c9fd5d76..dd4425e4ce6 100644 --- a/kernels/optimized/cpu/binary_ops.h +++ b/kernels/optimized/cpu/binary_ops.h @@ -10,34 +10,11 @@ #include #include +#include #include namespace torch { namespace executor { -namespace internal { -// NOTE: we bake ArrayRef iterators being pointers into the return -// type here because we assume that iterators are portable across -// ArrayRef copies. 
inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> arr) {
-  return std::find_if(
-      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
-}
-
-inline bool sizes_match_ignoring_leading_1s(
-    ArrayRef<Tensor::SizesType> lhs,
-    ArrayRef<Tensor::SizesType> rhs) {
-  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
-  auto lhs_end = lhs.end();
-
-  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
-  auto rhs_end = rhs.end();
-
-  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
-      std::equal(lhs_begin, lhs_end, rhs_begin);
-}
-} // namespace internal
-
 enum class ElementwiseOptimizedPath {
   kNone,
   kTreatAs1d,
diff --git a/kernels/optimized/cpu/targets.bzl b/kernels/optimized/cpu/targets.bzl
index b868a5901fd..edf01ca110c 100644
--- a/kernels/optimized/cpu/targets.bzl
+++ b/kernels/optimized/cpu/targets.bzl
@@ -130,7 +130,10 @@ def define_common_targets():
         srcs = [],
         exported_headers = ["op_add_sub_impl.h"],
         visibility = ["//executorch/kernels/optimized/cpu/..."],
-        exported_deps = ["//executorch/runtime/core:core"],
+        exported_deps = [
+            "//executorch/runtime/core:core",
+            "//executorch/kernels/portable/cpu/util:broadcast_indexes_range",
+        ],
     )
 
     runtime.cxx_library(
diff --git a/kernels/portable/cpu/util/broadcast_indexes_range.h b/kernels/portable/cpu/util/broadcast_indexes_range.h
index aaf7207d0c9..4d3ba46b51b 100644
--- a/kernels/portable/cpu/util/broadcast_indexes_range.h
+++ b/kernels/portable/cpu/util/broadcast_indexes_range.h
@@ -21,6 +21,28 @@
 namespace torch::executor {
 
 namespace internal {
+// NOTE: we bake ArrayRef iterators being pointers into the return
+// type here because we assume that iterators are portable across
+// ArrayRef copies.
+inline const Tensor::SizesType* arrayref_begin_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> arr) {
+  return std::find_if(
+      arr.begin(), arr.end(), [](Tensor::SizesType x) { return x != 1; });
+}
+
+inline bool sizes_match_ignoring_leading_1s(
+    ArrayRef<Tensor::SizesType> lhs,
+    ArrayRef<Tensor::SizesType> rhs) {
+  auto lhs_begin = arrayref_begin_ignoring_leading_1s(lhs);
+  auto lhs_end = lhs.end();
+
+  auto rhs_begin = arrayref_begin_ignoring_leading_1s(rhs);
+  auto rhs_end = rhs.end();
+
+  return ((lhs_end - lhs_begin) == (rhs_end - rhs_begin)) &&
+      std::equal(lhs_begin, lhs_end, rhs_begin);
+}
+
 template <std::size_t kNumInputs>
 class BroadcastIndexesIterator {
  public:
@@ -35,7 +57,10 @@ class BroadcastIndexesIterator {
   template <typename... Args>
   explicit BroadcastIndexesIterator(const Tensor& output, const Args&... args)
       : output_dim_or_zero_if_no_broadcasting_(
-            ((args.sizes() == output.sizes()) && ...) ? 0 : output.dim()),
+            (sizes_match_ignoring_leading_1s(args.sizes(), output.sizes()) &&
+             ...)
+                ?
0 + : output.dim()), output_shape_(output.sizes()) { static_assert( sizeof...(args) == kNumInputs && (std::is_same_v && ...), From 75f8970ef85c122b82467b2c6f89d090d8e5c0a8 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 17:36:18 -0700 Subject: [PATCH 08/27] Update [ghstack-poisoned] --- .lintrunner.toml | 2 + kernels/portable/cpu/op_mul.cpp | 4 +- kernels/portable/cpu/pattern/pattern.h | 15 ++- kernels/portable/cpu/util/elementwise_util.h | 96 ++++++++++++++++++- kernels/portable/cpu/util/targets.bzl | 1 + .../core/portable_type/c10/c10/targets.bzl | 5 +- 6 files changed, 109 insertions(+), 14 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 842b4b1c6cb..1f3d128dd60 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -264,6 +264,8 @@ exclude_patterns = [ 'examples/**', 'exir/verification/bindings.cpp', 'extension/**', + # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include. + 'kernels/portable/cpu/util/elementwise_util.h', 'kernels/optimized/**', 'runtime/core/exec_aten/**', # Want to be able to keep c10 in sync with PyTorch core. diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..e97263ef1bf 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -56,9 +56,7 @@ Tensor& mul_out( CTYPE_COMPUTE, op_name, utils::SupportedTensorDtypes::REALHBBF16>( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a * val_b; - }, + [](const auto val_a, const auto val_b) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, diff --git a/kernels/portable/cpu/pattern/pattern.h b/kernels/portable/cpu/pattern/pattern.h index eae8a746d0e..1596a518b33 100644 --- a/kernels/portable/cpu/pattern/pattern.h +++ b/kernels/portable/cpu/pattern/pattern.h @@ -80,13 +80,12 @@ Tensor& unary_ufunc_realh( ctx, tensors_have_same_shape_and_dtype(in, out), InvalidArgument, out); ET_SWITCH_REALH_TYPES(in.scalar_type(), ctx, op_name, CTYPE, [&] { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn( fn, ctx, in, utils::SupportedTensorDtypes::REALH, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } @@ -107,13 +106,12 @@ Tensor& unary_ufunc_realhb_to_bool( return out; } ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::BOOL); + out); }); return out; @@ -138,13 +136,12 @@ Tensor& unary_ufunc_realhbbf16_to_floathbf16( } ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn( [fn](const CTYPE_IN val_in) { return fn(val_in); }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 83289664cbb..28aab92fceb 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -15,6 +15,10 @@ #include #include +#ifdef ET_USE_PYTORCH_HEADERS +#include +#endif // ET_USE_PYTORCH_HEADERS + #include #include @@ -58,6 +62,38 @@ template using op_call_result = std::invoke_result_t...>; +#ifdef ET_USE_PYTORCH_HEADERS +template +struct is_vectorized : public 
std::false_type {};
+
+template <typename T>
+struct is_vectorized<at::vec::Vectorized<T>> : public std::true_type {};
+
+// TODO: can_use_vectorized and can_use_vectorized_impl are a failed
+// attempt to use SFINAE to detect whether our generic lambda argument
+// with deduced return type would compile if it was passed
+// Vectorized<CTYPE_COMMON> instead of CTYPE_COMMON. SFINAE does not
+// work that way (see
+// e.g. https://stackoverflow.com/questions/53344484/hard-error-when-using-stdinvoke-result-t-with-a-generic-lambda,
+// https://stackoverflow.com/questions/31368601/how-to-detect-if-a-generic-lambda-is-uncompilable-in-c-14);
+// if we really want to do it then we need to at least require that
+// our lambdas actively participate in being SFINAE-friendly, as in
+// https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable.
+template <typename CTYPE_COMMON, typename Op, typename Enable = void, typename... Args>
+struct can_use_vectorized_impl : std::false_type {};
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+struct can_use_vectorized_impl<CTYPE_COMMON, Op, std::void_t<decltype(std::declval<std::invoke_result_t<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>().store(std::declval<CTYPE_COMMON*>()))>, Args...> : public std::true_type {}; // std::bool_constant<std::is_invocable_v<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>::value> {};
+
+// Can I call a function of type Op with sizeof...(Args) arguments of type
+// at::vec::Vectorized<CTYPE_COMMON>?
+// This is not possible in C++17 as the code is currently set up; see TODO above.
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+struct can_use_vectorized : public can_use_vectorized_impl<CTYPE_COMMON, Op, void, Args...> {};
+
+#endif // ET_USE_PYTORCH_HEADERS
+
 template <
     typename CTYPE_COMMON,
     typename CTYPE_OUT,
     typename Op,
     typename... Args>
 inline void dtype_specialized_elementwise_fn_impl(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
     const Tensor& out,
     Args... inputs) {
+  static_assert(
+      (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
+       ...));
   constexpr auto kNumInputs = sizeof...(inputs);
-  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMMON)) && ...));
+  // All inputs must be of type CTYPE_COMMON.
+  ET_DCHECK(
+      ((inputs.first->scalar_type() ==
+        CppTypeToScalarType<CTYPE_COMMON>::value) &&
+       ...));
 
   std::array<const CTYPE_COMMON*, kNumInputs> inputs_data_ptrs = {
       inputs.first->template const_data_ptr<CTYPE_COMMON>()...};
 
   CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();
 
+#ifdef ET_USE_PYTORCH_HEADERS
+  if constexpr (can_use_vectorized<CTYPE_COMMON, Op, Args...>::value) {
+    const bool any_is_broadcasted =
+        !(torch::executor::internal::sizes_match_ignoring_leading_1s(
+              inputs.first->sizes(), out.sizes()) &&
+          ...);
+    if (!any_is_broadcasted) {
+      using Vec = at::vec::Vectorized<CTYPE_COMMON>;
+      ::executorch::extension::parallel_for(
+          0,
+          out.numel(),
+          ::executorch::extension::internal::GRAIN_SIZE,
+          [&](const auto begin, const auto end) {
+            const auto vectorized_begin =
+                begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+            const auto vectorized_end = end - (end % Vec::size());
+            // Scalar prologue.
+            for (const auto idx : c10::irange(begin, vectorized_begin)) {
+              std::array<CTYPE_COMMON, kNumInputs> loaded_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+            }
+
+            // Main vectorized loop.
+            for (auto idx = vectorized_begin; idx < vectorized_end;
+                 idx += Vec::size()) {
+              std::array<Vec, kNumInputs> loaded_vec_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_vec_inputs[input_idx] =
+                    Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
+              }
+              auto result_vec = std::apply(compute_fun, loaded_vec_inputs);
+              result_vec.store(&data_out[idx]);
+            }
+
+            // Scalar epilogue.
+              for (const auto idx : c10::irange(vectorized_end, end)) {
+                std::array<CTYPE_COMMON, kNumInputs> loaded_inputs;
+                for (const auto input_idx : c10::irange(kNumInputs)) {
+                  loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+                }
+                data_out[idx] = std::apply(compute_fun, loaded_inputs);
+              }
+          });
+      return;
+    }
+  }
+#endif
+
   ::executorch::extension::parallel_for(
       0,
       out.numel(),
diff --git a/kernels/portable/cpu/util/targets.bzl b/kernels/portable/cpu/util/targets.bzl
index a623b9d4d7a..4a53041041e 100644
--- a/kernels/portable/cpu/util/targets.bzl
+++ b/kernels/portable/cpu/util/targets.bzl
@@ -110,6 +110,7 @@ def define_common_targets():
             ":broadcast_indexes_range",
             ":broadcast_util",
             ":dtype_util",
+            "//executorch/runtime/core/portable_type/c10/c10:aten_headers_for_executorch",
             "//executorch/runtime/kernel:kernel_runtime_context",
            "//executorch/runtime/kernel:thread_parallel_interface",
         ],
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index b8883c75bfe..8403f092de4 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -49,7 +49,10 @@ def define_common_targets():
     runtime.cxx_library(
         name = "aten_headers_for_executorch",
         srcs = [],
-        visibility = ["//executorch/kernels/optimized/..."],
+        visibility = [
+            "//executorch/kernels/optimized/...",
+            "//executorch/kernels/portable/cpu/util/...",
+        ],
         exported_deps = select({
             "DEFAULT": [],
             "ovr_config//cpu:arm64": [

From 2d19e75d70e62056bab1426db4f6b7ffd10d5fc4 Mon Sep 17 00:00:00 2001
From: Scott Wolchok 
Date: Wed, 19 Mar 2025 19:57:07 -0700
Subject: [PATCH 09/27] Update [ghstack-poisoned]

---
 kernels/portable/cpu/util/elementwise_util.h | 49 +++++++++----------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h
index 28aab92fceb..3f1f712bec0 100644
--- a/kernels/portable/cpu/util/elementwise_util.h
+++ b/kernels/portable/cpu/util/elementwise_util.h
@@ -63,35 +63,16 @@ using op_call_result =
     std::invoke_result_t<Op, ignore_first_yield_second<Args, CTYPE_COMMON>...>;
 
 #ifdef ET_USE_PYTORCH_HEADERS
-template <typename T>
-struct is_vectorized : public std::false_type {};
-
-template <typename T>
-struct is_vectorized<at::vec::Vectorized<T>> : public std::true_type {};
-
-// TODO: can_use_vectorized and can_use_vectorized_impl are a failed
-// attempt to use SFINAE to detect whether our generic lambda argument
-// with deduced return type would compile if it was passed
-// Vectorized<CTYPE_COMMON> instead of CTYPE_COMMON. SFINAE does not
-// work that way (see
-// e.g. https://stackoverflow.com/questions/53344484/hard-error-when-using-stdinvoke-result-t-with-a-generic-lambda,
-// https://stackoverflow.com/questions/31368601/how-to-detect-if-a-generic-lambda-is-uncompilable-in-c-14);
-// if we really want to do it then we need to at least require that
-// our lambdas actively participate in being SFINAE-friendly, as in
-// https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable.
-template <typename CTYPE_COMMON, typename Op, typename Enable = void, typename... Args>
-struct can_use_vectorized_impl : std::false_type {};
-template <typename CTYPE_COMMON, typename Op, typename... Args>
-struct can_use_vectorized_impl<CTYPE_COMMON, Op, std::void_t<decltype(std::declval<std::invoke_result_t<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>().store(std::declval<CTYPE_COMMON*>()))>, Args...> : public std::true_type {}; // std::bool_constant<std::is_invocable_v<Op, ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>::value> {};
-
 // Can I call a function of type Op with sizeof...(Args) arguments of type
 // at::vec::Vectorized<CTYPE_COMMON>?
-// This is not possible in C++17 as the code is currently set up; see TODO above.
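+//
+// For instance (our example, not from the original patch): a generic lambda
+// such as [](const auto a, const auto b) { return a * b; } is invocable with
+// at::vec::Vectorized<float> arguments and passes this check, while a lambda
+// taking (float, float) is not invocable that way and falls back to the
+// scalar path.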
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+constexpr bool can_use_vectorized() {
+  return std::is_invocable_v<
+      Op,
+      ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>;
+}
 #endif // ET_USE_PYTORCH_HEADERS
 
 template <
     typename CTYPE_COMMON,
@@ -349,6 +330,17 @@ inline void apply_unitensor_elementwise_fn(
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
+/**
+ * Useful for unary elementwise operators. For each element of the
+ * input, call Op and write to the corresponding element of the
+ * output. Tensor broadcasting is applied wherever it is required.
+ *
+ * [NOTE: Generic lambdas]: If Op is a *generic* lambda (i.e., one with `auto`
+ * parameters; normal lambdas are fine), it must fulfill one of the
+ * following conditions. Either:
+ * 1) It must in fact compile when passed at::vec::Vectorized<CTYPE_COMMON>, or
+ * 2) It must be actively SFINAE-friendly, as per the C++17 examples in https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable .
+ */
 template <
     typename CTYPE_COMMON,
     const char* op_name,
@@ -390,6 +382,7 @@ inline void apply_bitensor_elementwise_fn(
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for compute_fun.
  */
 template <
     typename CTYPE_COMMON,
@@ -456,6 +449,8 @@ inline void apply_tritensor_elementwise_fn(
  *
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMMON, op_name>.
+ *
+ * See [NOTE: Generic lambdas] if you want to pass a generic lambda for compute_fun.
*/ template < typename CTYPE_COMMON, From 44ee51ad2f209d9981c78c069d0138d64de6481c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 25 Mar 2025 10:05:24 -0700 Subject: [PATCH 10/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.cpp | 4 ++ kernels/portable/cpu/util/dtype_util.h | 50 ++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/kernels/portable/cpu/util/dtype_util.cpp b/kernels/portable/cpu/util/dtype_util.cpp index d240b9f83bc..81b1b203a54 100644 --- a/kernels/portable/cpu/util/dtype_util.cpp +++ b/kernels/portable/cpu/util/dtype_util.cpp @@ -23,10 +23,14 @@ bool check_tensor_dtype( return executorch::runtime::tensor_is_realhbbf16_type(t); case SupportedTensorDtypes::REALHBF16: return executorch::runtime::tensor_is_realhbf16_type(t); + case SupportedTensorDtypes::REALH: + return executorch::runtime::tensor_is_realh_type(t); case SupportedTensorDtypes::FLOATHBF16: return executorch::runtime::tensor_is_floating_type(t); case SupportedTensorDtypes::INTB: return executorch::runtime::tensor_is_integral_type(t, true); + case SupportedTensorDtypes::BOOL: + return executorch::runtime::tensor_is_type(t, ScalarType::Bool); case SupportedTensorDtypes::BOOL_OR_BYTE: return (executorch::runtime::tensor_is_type( t, ScalarType::Bool, ScalarType::Byte)); diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 71d3625052d..df92428004e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -51,6 +51,15 @@ load_to_common_fn get_load_to_common_fn_realhbf16( return result; } +template +load_to_common_fn get_load_to_common_fn_realh(const Tensor& t) { + CTYPE_COMMON (*result)(const void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::load_and_convert; + }); + return result; +} + template load_to_common_fn get_load_to_common_fn_floathbf16( const Tensor& t) { @@ -72,6 +81,16 @@ load_to_common_fn get_load_to_common_fn_intb(const Tensor& t) { return result; } +template +load_to_common_fn get_load_to_common_fn_bool(const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::load_and_convert; +} + template load_to_common_fn get_load_to_common_fn_bool_or_byte( const Tensor& t) { @@ -137,6 +156,16 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_realhbf16( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_realh( + const Tensor& t) { + void (*result)(CTYPE_COMMON, void*) = nullptr; + ET_SWITCH_REALH_TYPES(t.scalar_type(), unused, op_name, TENSOR_CTYPE, [&]() { + result = internal::convert_and_store; + }); + return result; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_floathbf16(const Tensor& t) { @@ -159,6 +188,17 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn_intb( return result; } +template +store_common_to_tensor_fn get_store_common_to_tensor_fn_bool( + const Tensor& t) { + ET_CHECK_MSG( + t.scalar_type() == ScalarType::Bool, + "Unhandled dtype %s for %s", + ::executorch::runtime::toString(t.scalar_type()), + op_name); + return internal::convert_and_store; +} + template store_common_to_tensor_fn get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { @@ -191,8 +231,10 @@ get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { enum class SupportedTensorDtypes { REALHBBF16, REALHBF16, + REALH, 
FLOATHBF16, INTB, + BOOL, BOOL_OR_BYTE, SAME_AS_COMPUTE, SAME_AS_COMMON, @@ -209,10 +251,14 @@ load_to_common_fn get_load_to_common_fn( return get_load_to_common_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_load_to_common_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_load_to_common_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_load_to_common_fn_realhbf16(t); case SupportedTensorDtypes::INTB: return get_load_to_common_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_load_to_common_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_load_to_common_fn_bool_or_byte(t); case SupportedTensorDtypes::SAME_AS_COMPUTE: @@ -233,10 +279,14 @@ store_common_to_tensor_fn get_store_common_to_tensor_fn( return get_store_common_to_tensor_fn_realhbbf16(t); case SupportedTensorDtypes::REALHBF16: return get_store_common_to_tensor_fn_realhbf16(t); + case SupportedTensorDtypes::REALH: + return get_store_common_to_tensor_fn_realh(t); case SupportedTensorDtypes::FLOATHBF16: return get_store_common_to_tensor_fn_floathbf16(t); case SupportedTensorDtypes::INTB: return get_store_common_to_tensor_fn_intb(t); + case SupportedTensorDtypes::BOOL: + return get_store_common_to_tensor_fn_bool(t); case SupportedTensorDtypes::BOOL_OR_BYTE: return get_store_common_to_tensor_fn_bool_or_byte( t); From 7f57a19d5135cd8e1b0689e737b1189d80f47065 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 26 Mar 2025 15:54:09 -0700 Subject: [PATCH 11/27] Update [ghstack-poisoned] --- runtime/core/portable_type/c10/c10/targets.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index a49f30c72a0..d9d72b5be3f 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -87,5 +87,5 @@ def define_common_targets(): # linker failure. "ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DET_USE_PYTORCH_HEADERS", "-DSTANDALONE_TORCH_HEADER"], + }) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]), ) From 4553283773f0a5fb325a1a3eac54e26835327cbd Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 26 Mar 2025 16:20:19 -0700 Subject: [PATCH 12/27] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
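 # (As an illustrative configure step, ours: passing
 #  -DEXECUTORCH_BUILD_PTHREADPOOL=ON -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON
 #  at CMake time selects this optimized variant after the change below.)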
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From ff2c3580800e0380c38e0fd0e9ec76a05164f557 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 26 Mar 2025 16:55:40 -0700 Subject: [PATCH 13/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/elementwise_util.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index 040ebea0b4c..08d54e11090 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -64,16 +64,16 @@ inline void dtype_specialized_elementwise_fn_impl( constexpr auto kNumInputs = sizeof...(inputs); ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMMON)) && ...)); - std::array inputs_data_ptrs = { - inputs.first->template const_data_ptr()...}; - - CTYPE_OUT* const data_out = out.mutable_data_ptr(); - ::executorch::extension::parallel_for( 0, out.numel(), ::executorch::extension::internal::GRAIN_SIZE, [&](const auto begin, const auto end) { + std::array inputs_data_ptrs = { + inputs.first->template const_data_ptr()...}; + + CTYPE_OUT* const data_out = out.mutable_data_ptr(); + const auto range = BroadcastIndexesRange(out, (*inputs.first)...); auto begin_it = range.begin(); From 943ab82be758baadbafc4287756fa6dbe904f6df Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:22 -0700 Subject: [PATCH 14/27] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From f22d039d23db4103dc0697b638813ce2e4bc4a5f Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:28 -0700 Subject: [PATCH 15/27] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) 
+target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test From d5dfe2f230e42139ec78e054e8376765acfe736a Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:37 -0700 Subject: [PATCH 16/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_add.cpp | 20 ++++++++----- kernels/portable/cpu/op_addmm.cpp | 10 ++++--- kernels/portable/cpu/op_atan2.cpp | 10 ++++--- kernels/portable/cpu/op_clamp.cpp | 18 ++++++++---- kernels/portable/cpu/op_copy.cpp | 20 ++++++++----- kernels/portable/cpu/op_div.cpp | 31 ++++++++++++-------- kernels/portable/cpu/op_elu.cpp | 11 ++++--- kernels/portable/cpu/op_floor_divide.cpp | 9 ++++-- kernels/portable/cpu/op_fmod.cpp | 18 ++++++++---- kernels/portable/cpu/op_maximum.cpp | 8 +++-- kernels/portable/cpu/op_minimum.cpp | 9 ++++-- kernels/portable/cpu/op_mul.cpp | 10 ++++--- kernels/portable/cpu/op_pow.cpp | 27 +++++++++++------ kernels/portable/cpu/op_remainder.cpp | 18 ++++++++---- kernels/portable/cpu/op_rsub.cpp | 10 ++++--- kernels/portable/cpu/op_sigmoid.cpp | 11 ++++--- kernels/portable/cpu/op_sub.cpp | 20 ++++++++----- kernels/portable/cpu/op_where.cpp | 14 +++++---- kernels/portable/cpu/pattern/bitwise_op.h | 18 ++++++++---- 
kernels/portable/cpu/pattern/comparison_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/logical_op.h | 9 ++++-- 21 files changed, 201 insertions(+), 118 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index adb9d4ea723..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -52,8 +52,11 @@ Tensor& add_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a + val_alpha * val_b; }, ctx, @@ -61,8 +64,7 @@ Tensor& add_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -100,8 +102,11 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [b, alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [b, alpha](const auto val_a) { CTYPE_COMPUTE val_b = utils::scalar_to(b); CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); return val_a + val_alpha * val_b; @@ -109,8 +114,7 @@ Tensor& add_scalar_out( ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index d1df5818cd8..440a8b2c0fa 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -88,8 +88,11 @@ Tensor& addmm_out( n, p); - utils::apply_bitensor_elementwise_fn( - [alpha_val, beta_val](const CTYPE val_a, const CTYPE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [alpha_val, beta_val](const auto val_a, const auto val_b) { return val_a * alpha_val + val_b * beta_val; }, ctx, @@ -97,8 +100,7 @@ Tensor& addmm_out( utils::SupportedTensorDtypes::REALHBF16, in, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); } }); diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 19267ef49dd..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -55,8 +55,11 @@ Tensor& atan2_out( static constexpr const char op_name[] = "atan2.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return std::atan2(val_a, val_b); }, ctx, @@ -64,8 +67,7 @@ Tensor& atan2_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c1c40a38f34..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ 
b/kernels/portable/cpu/op_clamp.cpp @@ -134,8 +134,12 @@ Tensor& clamp_out( static constexpr const char op_name[] = "clamp.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( @@ -150,8 +154,7 @@ Tensor& clamp_out( ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; @@ -210,11 +213,15 @@ Tensor& clamp_tensor_out( static constexpr const char op_name[] = "clamp.Tensor_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [has_min, has_max]( const CTYPE_COMPUTE val_in, const CTYPE_COMPUTE val_min, const CTYPE_COMPUTE val_max) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override(val_out, val_min); @@ -231,8 +238,7 @@ Tensor& clamp_tensor_out( utils::SupportedTensorDtypes::REALHBBF16, max, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 19b0c3a2f6a..30fff4d2c10 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -47,15 +47,17 @@ Tensor& copy_out( static constexpr const char op_name[] = "copy.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy.out", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -80,15 +82,17 @@ Tensor& copy_( static constexpr const char op_name[] = "copy_"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy_", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - in, - utils::SupportedTensorDtypes::REALHBBF16); + in); }); return in; diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 94cd9ea5011..70f9479c464 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -58,17 +58,17 @@ Tensor& div_out( static constexpr const char op_name[] = "div.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a / val_b; - }, + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + 
utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; @@ -122,9 +122,13 @@ Tensor& div_out_mode( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [mode_is_trunc, &div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -146,8 +150,7 @@ Tensor& div_out_mode( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -188,13 +191,15 @@ Tensor& div_scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d4846fb1bfb..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -44,8 +44,12 @@ Tensor& elu_out( ET_EXTRACT_SCALAR(scale, math_scale); ET_EXTRACT_SCALAR(input_scale, math_input_scale); const auto negcoef = math_alpha * math_scale; - utils::apply_unitensor_elementwise_fn( - [negcoef, math_scale, math_input_scale](auto x) { + utils::apply_unitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; @@ -53,8 +57,7 @@ Tensor& elu_out( ctx, in, utils::SupportedTensorDtypes::FLOATHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 85eb612ea1e..50723c3fa0a 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -53,9 +53,13 @@ Tensor& floor_divide_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
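+          // (The per-element branch that sets div_by_zero_error is what
+          // currently keeps this lambda scalar-only.)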
if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -69,8 +73,7 @@ Tensor& floor_divide_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 1e8cba0f1ae..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -55,9 +55,13 @@ Tensor& fmod_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -73,8 +77,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -131,16 +134,19 @@ Tensor& fmod_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = std::fmod(val_a, val_b); return value; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 5cf3b5a19f8..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -45,7 +45,10 @@ Tensor& maximum_out( static constexpr const char op_name[] = "maximum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, @@ -54,8 +57,7 @@ Tensor& maximum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index e2c641bdb22..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -45,8 +45,12 @@ Tensor& minimum_out( static constexpr const char op_name[] = "minimum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
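+          // (min_override is assumed to follow ATen's minimum semantics,
+          // e.g. NaN propagation, rather than plain std::min.)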
return utils::min_override(val_a, val_b); }, ctx, @@ -54,8 +58,7 @@ Tensor& minimum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..6156227732d 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -97,13 +97,15 @@ Tensor& mul_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a * val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index 81319b03d9f..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -53,8 +53,12 @@ Tensor& pow_Tensor_Tensor_out( static constexpr const char op_name[] = "pow.Tensor_Tensor_out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. return std::pow(val_a, val_b); }, ctx, @@ -62,8 +66,7 @@ Tensor& pow_Tensor_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -104,13 +107,16 @@ Tensor& pow_Tensor_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -151,13 +157,16 @@ Tensor& pow_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_a = utils::scalar_to(a); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. 
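+          // (std::pow here resolves only for scalar arguments; switching the
+          // lambda to auto parameters would require a vectorized pow.)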
[val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index d34c34a0380..01a5d72de01 100644 --- a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -53,9 +53,13 @@ Tensor& remainder_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -71,8 +75,7 @@ Tensor& remainder_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -126,15 +129,18 @@ Tensor& remainder_Scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return utils::remainder_override(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 46af021efda..6a0a77b6596 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -52,15 +52,17 @@ Tensor& rsub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_b - val_alpha * val_a; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 09cfed524f9..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -45,8 +45,12 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { static constexpr const char op_name[] = "sigmoid.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_in) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable CTYPE_COMPUTE out_val = static_cast(1.0) / (static_cast(1.0) + exp(-val_in)); return out_val; @@ -54,8 +58,7 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - 
utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index 6217f82c3b1..aa90df8dee4 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -56,8 +56,11 @@ Tensor& sub_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a - val_alpha * val_b; }, ctx, @@ -65,8 +68,7 @@ Tensor& sub_out( utils::SupportedTensorDtypes::REALHBF16, b, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -110,15 +112,17 @@ Tensor& sub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_a - val_alpha * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b455c45c2d1..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -43,10 +43,13 @@ Tensor& where_out( static constexpr const char op_name[] = "where.self_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -54,8 +57,7 @@ Tensor& where_out( utils::SupportedTensorDtypes::REALHBBF16, cond, utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h index 6e4c111b8f2..f78ce796e6c 100644 --- a/kernels/portable/cpu/pattern/bitwise_op.h +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -80,15 +80,18 @@ Tensor& bitwise_tensor_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
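+              // (BitwiseFnForOp resolves to a scalar functor along the lines
+              // of std::bit_and, so it cannot take vectorized operands yet.)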
BitwiseFnForOp::value, ctx, a, utils::SupportedTensorDtypes::INTB, b, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -121,16 +124,19 @@ Tensor& bitwise_scalar_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return BitwiseFnForOp::value( val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/comparison_op.h b/kernels/portable/cpu/pattern/comparison_op.h index e0d9bf4dcab..643d7623922 100644 --- a/kernels/portable/cpu/pattern/comparison_op.h +++ b/kernels/portable/cpu/pattern/comparison_op.h @@ -91,15 +91,18 @@ Tensor& comparison_tensor_out( ScalarType compute_type = utils::get_compute_type(common_type); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. ComparisonFnForOp::value, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -127,15 +130,18 @@ Tensor& comparison_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return ComparisonFnForOp::value(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/logical_op.h b/kernels/portable/cpu/pattern/logical_op.h index 017822a85a6..4547d3df51b 100644 --- a/kernels/portable/cpu/pattern/logical_op.h +++ b/kernels/portable/cpu/pattern/logical_op.h @@ -34,15 +34,18 @@ Tensor& logical_tensor_out( InvalidArgument, out); - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + bool, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
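+      // (fn is an element-wise bool(bool, bool) functor, hence the scalar
+      // bool compute type above.)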
fn, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); return out; } From 3f1b775fe481d9d9d88896c913f7033dc3cfd21d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Fri, 28 Mar 2025 09:51:43 -0700 Subject: [PATCH 17/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 35 ++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index b5cd980b085..eb1ee83111e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -228,7 +228,7 @@ enum class SupportedTensorDtypes { namespace internal { template -load_to_compute_fn get_load_to_compute_fn( +load_to_compute_fn get_load_to_compute_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -252,7 +252,7 @@ load_to_compute_fn get_load_to_compute_fn( } template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn( +store_compute_to_tensor_fn get_store_compute_to_tensor_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -285,6 +285,37 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( return nullptr; } +#ifndef EXECUTORCH_SELECTIVE_BUILD_DTYPE +constexpr const char kGenericElementwiseOpName[] = "generic_elementwise_op"; +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + +template +load_to_compute_fn get_load_to_compute_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_load_to_compute_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} + +template +store_compute_to_tensor_fn get_store_compute_to_tensor_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_store_compute_to_tensor_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} bool check_tensor_dtype( const Tensor t, SupportedTensorDtypes dtypes, From 7f2bbdb098596d232cd1193ea76422308ab74dc3 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:32:41 -0700 Subject: [PATCH 18/27] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From 9e42e93a96531d2daa189ac66b796aa515e56cd4 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:32:46 -0700 Subject: [PATCH 19/27] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From 96d258eb5cb1601283f636f6dd8a046ae5c9e4ae Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:33:00 -0700 Subject: [PATCH 20/27] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
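+#
+# Usage (assumed): run from the repository root, e.g.
+#   bash test/build_optimized_size_test.sh
+# The cmake invocations below use the current directory as the source root.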
+ +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test From a7562543a4fec8abd42492ae80c86acb9918d6cf Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:33:07 -0700 Subject: [PATCH 21/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_add.cpp | 20 ++++++++----- kernels/portable/cpu/op_addmm.cpp | 10 ++++--- kernels/portable/cpu/op_atan2.cpp | 10 ++++--- kernels/portable/cpu/op_clamp.cpp | 18 ++++++++---- kernels/portable/cpu/op_copy.cpp | 20 ++++++++----- kernels/portable/cpu/op_div.cpp | 31 ++++++++++++-------- kernels/portable/cpu/op_elu.cpp | 11 ++++--- kernels/portable/cpu/op_floor_divide.cpp | 9 ++++-- kernels/portable/cpu/op_fmod.cpp | 18 ++++++++---- kernels/portable/cpu/op_maximum.cpp | 8 +++-- kernels/portable/cpu/op_minimum.cpp | 9 ++++-- kernels/portable/cpu/op_mul.cpp | 10 ++++--- kernels/portable/cpu/op_pow.cpp | 27 +++++++++++------ kernels/portable/cpu/op_remainder.cpp | 18 ++++++++---- kernels/portable/cpu/op_rsub.cpp | 10 ++++--- kernels/portable/cpu/op_sigmoid.cpp | 11 ++++--- kernels/portable/cpu/op_sub.cpp | 20 ++++++++----- kernels/portable/cpu/op_where.cpp | 14 +++++---- kernels/portable/cpu/pattern/bitwise_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/comparison_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/logical_op.h | 9 ++++-- 21 files changed, 201 insertions(+), 118 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index adb9d4ea723..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -52,8 +52,11 @@ Tensor& add_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + 
[val_alpha](const auto val_a, const auto val_b) { return val_a + val_alpha * val_b; }, ctx, @@ -61,8 +64,7 @@ Tensor& add_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -100,8 +102,11 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [b, alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [b, alpha](const auto val_a) { CTYPE_COMPUTE val_b = utils::scalar_to(b); CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); return val_a + val_alpha * val_b; @@ -109,8 +114,7 @@ Tensor& add_scalar_out( ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index d1df5818cd8..440a8b2c0fa 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -88,8 +88,11 @@ Tensor& addmm_out( n, p); - utils::apply_bitensor_elementwise_fn( - [alpha_val, beta_val](const CTYPE val_a, const CTYPE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [alpha_val, beta_val](const auto val_a, const auto val_b) { return val_a * alpha_val + val_b * beta_val; }, ctx, @@ -97,8 +100,7 @@ Tensor& addmm_out( utils::SupportedTensorDtypes::REALHBF16, in, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); } }); diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 19267ef49dd..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -55,8 +55,11 @@ Tensor& atan2_out( static constexpr const char op_name[] = "atan2.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return std::atan2(val_a, val_b); }, ctx, @@ -64,8 +67,7 @@ Tensor& atan2_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c1c40a38f34..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -134,8 +134,12 @@ Tensor& clamp_out( static constexpr const char op_name[] = "clamp.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. 
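+            // (The has_min/has_max branches below keep this lambda scalar; a
+            // vectorization-capable rewrite would need branch-free clamping.)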
CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( @@ -150,8 +154,7 @@ Tensor& clamp_out( ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; @@ -210,11 +213,15 @@ Tensor& clamp_tensor_out( static constexpr const char op_name[] = "clamp.Tensor_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [has_min, has_max]( const CTYPE_COMPUTE val_in, const CTYPE_COMPUTE val_min, const CTYPE_COMPUTE val_max) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override(val_out, val_min); @@ -231,8 +238,7 @@ Tensor& clamp_tensor_out( utils::SupportedTensorDtypes::REALHBBF16, max, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 19b0c3a2f6a..30fff4d2c10 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -47,15 +47,17 @@ Tensor& copy_out( static constexpr const char op_name[] = "copy.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy.out", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -80,15 +82,17 @@ Tensor& copy_( static constexpr const char op_name[] = "copy_"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy_", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - in, - utils::SupportedTensorDtypes::REALHBBF16); + in); }); return in; diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 94cd9ea5011..70f9479c464 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -58,17 +58,17 @@ Tensor& div_out( static constexpr const char op_name[] = "div.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a / val_b; - }, + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; @@ -122,9 +122,13 @@ Tensor& div_out_mode( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + 
utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [mode_is_trunc, &div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -146,8 +150,7 @@ Tensor& div_out_mode( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -188,13 +191,15 @@ Tensor& div_scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d4846fb1bfb..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -44,8 +44,12 @@ Tensor& elu_out( ET_EXTRACT_SCALAR(scale, math_scale); ET_EXTRACT_SCALAR(input_scale, math_input_scale); const auto negcoef = math_alpha * math_scale; - utils::apply_unitensor_elementwise_fn( - [negcoef, math_scale, math_input_scale](auto x) { + utils::apply_unitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; @@ -53,8 +57,7 @@ Tensor& elu_out( ctx, in, utils::SupportedTensorDtypes::FLOATHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 85eb612ea1e..50723c3fa0a 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -53,9 +53,13 @@ Tensor& floor_divide_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -69,8 +73,7 @@ Tensor& floor_divide_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 1e8cba0f1ae..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -55,9 +55,13 @@ Tensor& fmod_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -73,8 +77,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -131,16 +134,19 @@ Tensor& fmod_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = std::fmod(val_a, val_b); return value; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 5cf3b5a19f8..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -45,7 +45,10 @@ Tensor& maximum_out( static constexpr const char op_name[] = "maximum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, @@ -54,8 +57,7 @@ Tensor& maximum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index e2c641bdb22..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -45,8 +45,12 @@ Tensor& minimum_out( static constexpr const char op_name[] = "minimum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
return utils::min_override(val_a, val_b); }, ctx, @@ -54,8 +58,7 @@ Tensor& minimum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..6156227732d 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -97,13 +97,15 @@ Tensor& mul_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a * val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index 81319b03d9f..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -53,8 +53,12 @@ Tensor& pow_Tensor_Tensor_out( static constexpr const char op_name[] = "pow.Tensor_Tensor_out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. return std::pow(val_a, val_b); }, ctx, @@ -62,8 +66,7 @@ Tensor& pow_Tensor_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -104,13 +107,16 @@ Tensor& pow_Tensor_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -151,13 +157,16 @@ Tensor& pow_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_a = utils::scalar_to(a); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. 
[val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index d34c34a0380..01a5d72de01 100644 --- a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -53,9 +53,13 @@ Tensor& remainder_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -71,8 +75,7 @@ Tensor& remainder_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -126,15 +129,18 @@ Tensor& remainder_Scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return utils::remainder_override(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 46af021efda..6a0a77b6596 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -52,15 +52,17 @@ Tensor& rsub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_b - val_alpha * val_a; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 09cfed524f9..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -45,8 +45,12 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { static constexpr const char op_name[] = "sigmoid.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_in) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable CTYPE_COMPUTE out_val = static_cast(1.0) / (static_cast(1.0) + exp(-val_in)); return out_val; @@ -54,8 +58,7 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - 
utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index 6217f82c3b1..aa90df8dee4 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -56,8 +56,11 @@ Tensor& sub_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a - val_alpha * val_b; }, ctx, @@ -65,8 +68,7 @@ Tensor& sub_out( utils::SupportedTensorDtypes::REALHBF16, b, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -110,15 +112,17 @@ Tensor& sub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_a - val_alpha * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b455c45c2d1..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -43,10 +43,13 @@ Tensor& where_out( static constexpr const char op_name[] = "where.self_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -54,8 +57,7 @@ Tensor& where_out( utils::SupportedTensorDtypes::REALHBBF16, cond, utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h index 6e4c111b8f2..f78ce796e6c 100644 --- a/kernels/portable/cpu/pattern/bitwise_op.h +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -80,15 +80,18 @@ Tensor& bitwise_tensor_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
BitwiseFnForOp::value, ctx, a, utils::SupportedTensorDtypes::INTB, b, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -121,16 +124,19 @@ Tensor& bitwise_scalar_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return BitwiseFnForOp::value( val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/comparison_op.h b/kernels/portable/cpu/pattern/comparison_op.h index e0d9bf4dcab..643d7623922 100644 --- a/kernels/portable/cpu/pattern/comparison_op.h +++ b/kernels/portable/cpu/pattern/comparison_op.h @@ -91,15 +91,18 @@ Tensor& comparison_tensor_out( ScalarType compute_type = utils::get_compute_type(common_type); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. ComparisonFnForOp::value, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -127,15 +130,18 @@ Tensor& comparison_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return ComparisonFnForOp::value(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/logical_op.h b/kernels/portable/cpu/pattern/logical_op.h index 017822a85a6..4547d3df51b 100644 --- a/kernels/portable/cpu/pattern/logical_op.h +++ b/kernels/portable/cpu/pattern/logical_op.h @@ -34,15 +34,18 @@ Tensor& logical_tensor_out( InvalidArgument, out); - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + bool, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
fn, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); return out; } From ef74fe1ea9a9afc5ae255c7879da251ad1146ef2 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 1 Apr 2025 19:33:13 -0700 Subject: [PATCH 22/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 35 ++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index b5cd980b085..eb1ee83111e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -228,7 +228,7 @@ enum class SupportedTensorDtypes { namespace internal { template -load_to_compute_fn get_load_to_compute_fn( +load_to_compute_fn get_load_to_compute_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -252,7 +252,7 @@ load_to_compute_fn get_load_to_compute_fn( } template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn( +store_compute_to_tensor_fn get_store_compute_to_tensor_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -285,6 +285,37 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( return nullptr; } +#ifndef EXECUTORCH_SELECTIVE_BUILD_DTYPE +constexpr const char kGenericElementwiseOpName[] = "generic_elementwise_op"; +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + +template +load_to_compute_fn get_load_to_compute_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_load_to_compute_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} + +template +store_compute_to_tensor_fn get_store_compute_to_tensor_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_store_compute_to_tensor_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} bool check_tensor_dtype( const Tensor t, SupportedTensorDtypes dtypes, From 3aa266d6537815d70e5332e45c160c9a43346158 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:37 -0700 Subject: [PATCH 23/27] Update [ghstack-poisoned] --- kernels/portable/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index 53ad88880d6..edea045d65f 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -66,9 +66,7 @@ gen_operators_lib( # Portable kernels support optional parallelization (and, in the # future, perhaps other performance features). If support is present, # produce an optimized version. 
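# Note that the set()/if() pair removed below always evaluated to true:
# set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) stores
# the literal variable name, a non-false string, so the if() never reflected
# whether the pthreadpool option was actually enabled. Testing the two
# options directly gives the intended gating.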
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL) - -if(BUILD_OPTIMIZED_PORTABLE_KERNELS) +if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED) add_library(optimized_portable_kernels ${_portable_kernels__srcs}) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) From 3c88a5662e950077bf471a91168ee69c28752af0 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:41 -0700 Subject: [PATCH 24/27] Update [ghstack-poisoned] --- tools/cmake/executorch-config.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake index 49aa6cf08af..56c7fa2d7d4 100644 --- a/tools/cmake/executorch-config.cmake +++ b/tools/cmake/executorch-config.cmake @@ -149,7 +149,7 @@ endif() if(TARGET coremldelegate) set_target_properties( coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES - "coreml_inmemoryfs;coreml_util" + "coreml_inmemoryfs;coreml_util" ) endif() @@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib) endif() if(TARGET extension_threadpool) target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL) + set_target_properties( + extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES + "cpuinfo;pthreadpool" + ) endif() From 153735d91465623e35c9394c0b4a0a282eb35327 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:46 -0700 Subject: [PATCH 25/27] Update [ghstack-poisoned] --- test/CMakeLists.txt | 13 +++++++ test/build_optimized_size_test.sh | 57 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 test/build_optimized_size_test.sh diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3932f1097e1..812e8e4a67a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release") target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections") endif() +# +# size_test_all_optimized_ops: binary with optimized ops and no delegate backend +# +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) +add_executable(size_test_all_optimized_ops ${_size_test__srcs}) +target_link_options_shared_lib(optimized_native_cpu_ops_lib) +target_link_libraries( + size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib) +if(CMAKE_BUILD_TYPE EQUAL "Release") + target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections") +endif() +endif() + # Print all summary executorch_print_configuration_summary() diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh new file mode 100644 index 00000000000..181c2ce617d --- /dev/null +++ b/test/build_optimized_size_test.sh @@ -0,0 +1,57 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# Unlike build_size_test.sh, this script: +# - does not attempt to disable exceptions and RTTI +# - as a consequence, is able to build optimized kernels +# - uses MinSizeRel builds +# - is not currently intended to run in CI +# - sets -g to make it easier to use tools like bloaty to investigate size + +set -e + +# shellcheck source=/dev/null +source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh" + +cmake_install_executorch_lib() { + echo "Installing libexecutorch.a" + clean_executorch_install_folders + update_tokenizers_git_submodule + CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \ + -DCMAKE_CXX_STANDARD_REQUIRED=ON \ + -DCMAKE_INSTALL_PREFIX=cmake-out \ + -DCMAKE_BUILD_TYPE=MinSizeRel \ + -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \ + -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ + -DOPTIMIZE_SIZE=ON \ + -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ + -Bcmake-out . + cmake --build cmake-out -j9 --target install --config MinSizeRel +} + +test_cmake_size_test() { + CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test + + echo "Build size test" + cmake --build cmake-out/test -j9 --config MinSizeRel + + echo 'ExecuTorch with no ops binary size, unstripped:' + ls -al cmake-out/test/size_test + + echo 'ExecuTorch with portable ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_ops + + echo 'ExecuTorch with optimized ops binary size, unstripped:' + ls -al cmake-out/test/size_test_all_optimized_ops +} + +if [[ -z $PYTHON_EXECUTABLE ]]; then + PYTHON_EXECUTABLE=python3 +fi + +cmake_install_executorch_lib +test_cmake_size_test From 77a4fc6fc5e4280e394f0fb3f45e360099b0c519 Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:53 -0700 Subject: [PATCH 26/27] Update [ghstack-poisoned] --- kernels/portable/cpu/op_add.cpp | 20 ++++++++----- kernels/portable/cpu/op_addmm.cpp | 10 ++++--- kernels/portable/cpu/op_atan2.cpp | 10 ++++--- kernels/portable/cpu/op_clamp.cpp | 18 ++++++++---- kernels/portable/cpu/op_copy.cpp | 20 ++++++++----- kernels/portable/cpu/op_div.cpp | 31 ++++++++++++-------- kernels/portable/cpu/op_elu.cpp | 11 ++++--- kernels/portable/cpu/op_floor_divide.cpp | 9 ++++-- kernels/portable/cpu/op_fmod.cpp | 18 ++++++++---- kernels/portable/cpu/op_maximum.cpp | 8 +++-- kernels/portable/cpu/op_minimum.cpp | 9 ++++-- kernels/portable/cpu/op_mul.cpp | 10 ++++--- kernels/portable/cpu/op_pow.cpp | 27 +++++++++++------ kernels/portable/cpu/op_remainder.cpp | 18 ++++++++---- kernels/portable/cpu/op_rsub.cpp | 10 ++++--- kernels/portable/cpu/op_sigmoid.cpp | 11 ++++--- kernels/portable/cpu/op_sub.cpp | 20 ++++++++----- kernels/portable/cpu/op_where.cpp | 14 +++++---- kernels/portable/cpu/pattern/bitwise_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/comparison_op.h | 18 ++++++++---- kernels/portable/cpu/pattern/logical_op.h | 9 ++++-- 21 files changed, 201 insertions(+), 118 deletions(-) diff --git a/kernels/portable/cpu/op_add.cpp b/kernels/portable/cpu/op_add.cpp index adb9d4ea723..555341b3447 100644 --- a/kernels/portable/cpu/op_add.cpp +++ b/kernels/portable/cpu/op_add.cpp @@ -52,8 +52,11 @@ Tensor& add_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + 
[val_alpha](const auto val_a, const auto val_b) { return val_a + val_alpha * val_b; }, ctx, @@ -61,8 +64,7 @@ Tensor& add_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -100,8 +102,11 @@ Tensor& add_scalar_out( static constexpr const char op_name[] = "add.Scalar_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [b, alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [b, alpha](const auto val_a) { CTYPE_COMPUTE val_b = utils::scalar_to(b); CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); return val_a + val_alpha * val_b; @@ -109,8 +114,7 @@ Tensor& add_scalar_out( ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_addmm.cpp b/kernels/portable/cpu/op_addmm.cpp index d1df5818cd8..440a8b2c0fa 100644 --- a/kernels/portable/cpu/op_addmm.cpp +++ b/kernels/portable/cpu/op_addmm.cpp @@ -88,8 +88,11 @@ Tensor& addmm_out( n, p); - utils::apply_bitensor_elementwise_fn( - [alpha_val, beta_val](const CTYPE val_a, const CTYPE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [alpha_val, beta_val](const auto val_a, const auto val_b) { return val_a * alpha_val + val_b * beta_val; }, ctx, @@ -97,8 +100,7 @@ Tensor& addmm_out( utils::SupportedTensorDtypes::REALHBF16, in, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); } }); diff --git a/kernels/portable/cpu/op_atan2.cpp b/kernels/portable/cpu/op_atan2.cpp index 19267ef49dd..33d66cf2ad7 100644 --- a/kernels/portable/cpu/op_atan2.cpp +++ b/kernels/portable/cpu/op_atan2.cpp @@ -55,8 +55,11 @@ Tensor& atan2_out( static constexpr const char op_name[] = "atan2.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return std::atan2(val_a, val_b); }, ctx, @@ -64,8 +67,7 @@ Tensor& atan2_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_clamp.cpp b/kernels/portable/cpu/op_clamp.cpp index c1c40a38f34..6974789eccf 100644 --- a/kernels/portable/cpu/op_clamp.cpp +++ b/kernels/portable/cpu/op_clamp.cpp @@ -134,8 +134,12 @@ Tensor& clamp_out( static constexpr const char op_name[] = "clamp.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + // TODO: rewrite this to be vectorization-capable. 
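+          // The typed CTYPE_COMPUTE parameter keeps this lambda scalar-only;
+          // the lambdas rewritten to take `const auto` elsewhere in this
+          // patch are the ones the elementwise utility can also instantiate
+          // with vector types.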
CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override( @@ -150,8 +154,7 @@ Tensor& clamp_out( ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; @@ -210,11 +213,15 @@ Tensor& clamp_tensor_out( static constexpr const char op_name[] = "clamp.Tensor_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [has_min, has_max]( const CTYPE_COMPUTE val_in, const CTYPE_COMPUTE val_min, const CTYPE_COMPUTE val_max) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE val_out = val_in; if (has_min) { val_out = utils::max_override(val_out, val_min); @@ -231,8 +238,7 @@ Tensor& clamp_tensor_out( utils::SupportedTensorDtypes::REALHBBF16, max, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_copy.cpp b/kernels/portable/cpu/op_copy.cpp index 19b0c3a2f6a..30fff4d2c10 100644 --- a/kernels/portable/cpu/op_copy.cpp +++ b/kernels/portable/cpu/op_copy.cpp @@ -47,15 +47,17 @@ Tensor& copy_out( static constexpr const char op_name[] = "copy.out"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy.out", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -80,15 +82,17 @@ Tensor& copy_( static constexpr const char op_name[] = "copy_"; ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "copy_", CTYPE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](ET_UNUSED const CTYPE _, const CTYPE val_src) { return val_src; }, + utils::apply_bitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + [](ET_UNUSED const auto _, const auto val_src) { return val_src; }, ctx, in, utils::SupportedTensorDtypes::REALHBBF16, src, utils::SupportedTensorDtypes::REALHBBF16, - in, - utils::SupportedTensorDtypes::REALHBBF16); + in); }); return in; diff --git a/kernels/portable/cpu/op_div.cpp b/kernels/portable/cpu/op_div.cpp index 94cd9ea5011..70f9479c464 100644 --- a/kernels/portable/cpu/op_div.cpp +++ b/kernels/portable/cpu/op_div.cpp @@ -58,17 +58,17 @@ Tensor& div_out( static constexpr const char op_name[] = "div.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { - return val_a / val_b; - }, + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_a, const auto val_b) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; @@ -122,9 +122,13 @@ Tensor& div_out_mode( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + 
utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [mode_is_trunc, &div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -146,8 +150,7 @@ Tensor& div_out_mode( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -188,13 +191,15 @@ Tensor& div_scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a / val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a / val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_elu.cpp b/kernels/portable/cpu/op_elu.cpp index d4846fb1bfb..d6533642860 100644 --- a/kernels/portable/cpu/op_elu.cpp +++ b/kernels/portable/cpu/op_elu.cpp @@ -44,8 +44,12 @@ Tensor& elu_out( ET_EXTRACT_SCALAR(scale, math_scale); ET_EXTRACT_SCALAR(input_scale, math_input_scale); const auto negcoef = math_alpha * math_scale; - utils::apply_unitensor_elementwise_fn( - [negcoef, math_scale, math_input_scale](auto x) { + utils::apply_unitensor_elementwise_fn< + CTYPE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [negcoef, math_scale, math_input_scale](const auto x) { + // TODO: rewrite this to be vectorization-capable. return MathT(x) <= MathT(0) ? std::expm1(MathT(x) * math_input_scale) * negcoef : MathT(x) * math_scale; @@ -53,8 +57,7 @@ Tensor& elu_out( ctx, in, utils::SupportedTensorDtypes::FLOATHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; } diff --git a/kernels/portable/cpu/op_floor_divide.cpp b/kernels/portable/cpu/op_floor_divide.cpp index 85eb612ea1e..50723c3fa0a 100644 --- a/kernels/portable/cpu/op_floor_divide.cpp +++ b/kernels/portable/cpu/op_floor_divide.cpp @@ -53,9 +53,13 @@ Tensor& floor_divide_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
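+              // This lambda is hard to vectorize as-is: it branches on
+              // element values and records div_by_zero_error through a
+              // capture, a side effect with no per-lane equivalent.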
if (is_integral_type::value) { if (val_b == 0) { div_by_zero_error = true; @@ -69,8 +73,7 @@ Tensor& floor_divide_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( diff --git a/kernels/portable/cpu/op_fmod.cpp b/kernels/portable/cpu/op_fmod.cpp index 1e8cba0f1ae..96a971b166a 100644 --- a/kernels/portable/cpu/op_fmod.cpp +++ b/kernels/portable/cpu/op_fmod.cpp @@ -55,9 +55,13 @@ Tensor& fmod_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -73,8 +77,7 @@ Tensor& fmod_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -131,16 +134,19 @@ Tensor& fmod_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = std::fmod(val_a, val_b); return value; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_maximum.cpp b/kernels/portable/cpu/op_maximum.cpp index 5cf3b5a19f8..3a84095a4df 100644 --- a/kernels/portable/cpu/op_maximum.cpp +++ b/kernels/portable/cpu/op_maximum.cpp @@ -45,7 +45,10 @@ Tensor& maximum_out( static constexpr const char op_name[] = "maximum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { return utils::max_override(val_a, val_b); }, @@ -54,8 +57,7 @@ Tensor& maximum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_minimum.cpp b/kernels/portable/cpu/op_minimum.cpp index e2c641bdb22..5c0e79eb9bb 100644 --- a/kernels/portable/cpu/op_minimum.cpp +++ b/kernels/portable/cpu/op_minimum.cpp @@ -45,8 +45,12 @@ Tensor& minimum_out( static constexpr const char op_name[] = "minimum.out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. 
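+            // utils::min_override implements minimum's NaN-propagation
+            // semantics for floating-point inputs; a vectorized rewrite
+            // needs a counterpart that accepts vector operands as well.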
return utils::min_override(val_a, val_b); }, ctx, @@ -54,8 +58,7 @@ Tensor& minimum_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp index 114e60ff171..6156227732d 100644 --- a/kernels/portable/cpu/op_mul.cpp +++ b/kernels/portable/cpu/op_mul.cpp @@ -97,13 +97,15 @@ Tensor& mul_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( - [val_b](const CTYPE_COMPUTE val_a) { return val_a * val_b; }, + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b](const auto val_a) { return val_a * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_pow.cpp b/kernels/portable/cpu/op_pow.cpp index 81319b03d9f..4d2673cb72d 100644 --- a/kernels/portable/cpu/op_pow.cpp +++ b/kernels/portable/cpu/op_pow.cpp @@ -53,8 +53,12 @@ Tensor& pow_Tensor_Tensor_out( static constexpr const char op_name[] = "pow.Tensor_Tensor_out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. return std::pow(val_a, val_b); }, ctx, @@ -62,8 +66,7 @@ Tensor& pow_Tensor_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -104,13 +107,16 @@ Tensor& pow_Tensor_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -151,13 +157,16 @@ Tensor& pow_Scalar_out( ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_a = utils::scalar_to(a); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + // TODO: rewrite this to be vectorization-capable. 
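+        // As with the Tensor variants above, the typed parameter and the
+        // direct std::pow call keep this lambda scalar-only.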
[val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, ctx, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_remainder.cpp b/kernels/portable/cpu/op_remainder.cpp index d34c34a0380..01a5d72de01 100644 --- a/kernels/portable/cpu/op_remainder.cpp +++ b/kernels/portable/cpu/op_remainder.cpp @@ -53,9 +53,13 @@ Tensor& remainder_Tensor_out( bool div_by_zero_error = false; ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [&div_by_zero_error]( const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + // TODO: rewrite this to be vectorization-capable. CTYPE_COMPUTE value = 0; if (is_integral_type::value) { if (val_b == 0) { @@ -71,8 +75,7 @@ Tensor& remainder_Tensor_out( utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); ET_KERNEL_CHECK_MSG( @@ -126,15 +129,18 @@ Tensor& remainder_Scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return utils::remainder_override(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_rsub.cpp b/kernels/portable/cpu/op_rsub.cpp index 46af021efda..6a0a77b6596 100644 --- a/kernels/portable/cpu/op_rsub.cpp +++ b/kernels/portable/cpu/op_rsub.cpp @@ -52,15 +52,17 @@ Tensor& rsub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_b - val_alpha * val_a; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_sigmoid.cpp b/kernels/portable/cpu/op_sigmoid.cpp index 09cfed524f9..acb743a2db6 100644 --- a/kernels/portable/cpu/op_sigmoid.cpp +++ b/kernels/portable/cpu/op_sigmoid.cpp @@ -45,8 +45,12 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { static constexpr const char op_name[] = "sigmoid.out"; ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_unitensor_elementwise_fn( - [](const CTYPE_COMPUTE val_in) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::FLOATHBF16>( + [](const auto val_in) -> CTYPE_COMPUTE { + // TODO: rewrite this to be vectorization-capable CTYPE_COMPUTE out_val = static_cast(1.0) / (static_cast(1.0) + exp(-val_in)); return out_val; @@ -54,8 +58,7 @@ Tensor& sigmoid_out(KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) { ctx, in, utils::SupportedTensorDtypes::REALHBBF16, - out, - 
utils::SupportedTensorDtypes::FLOATHBF16); + out); }); return out; diff --git a/kernels/portable/cpu/op_sub.cpp b/kernels/portable/cpu/op_sub.cpp index 6217f82c3b1..aa90df8dee4 100644 --- a/kernels/portable/cpu/op_sub.cpp +++ b/kernels/portable/cpu/op_sub.cpp @@ -56,8 +56,11 @@ Tensor& sub_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_bitensor_elementwise_fn( - [val_alpha](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBF16>( + [val_alpha](const auto val_a, const auto val_b) { return val_a - val_alpha * val_b; }, ctx, @@ -65,8 +68,7 @@ Tensor& sub_out( utils::SupportedTensorDtypes::REALHBF16, b, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::REALHBF16); + out); }); return out; @@ -110,15 +112,17 @@ Tensor& sub_scalar_out( ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); const CTYPE_COMPUTE val_alpha = utils::scalar_to(alpha); - utils::apply_unitensor_elementwise_fn( - [val_b, val_alpha](const CTYPE_COMPUTE val_a) { + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [val_b, val_alpha](const auto val_a) { return val_a - val_alpha * val_b; }, ctx, a, utils::SupportedTensorDtypes::REALHBF16, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/op_where.cpp b/kernels/portable/cpu/op_where.cpp index b455c45c2d1..692e296ee00 100644 --- a/kernels/portable/cpu/op_where.cpp +++ b/kernels/portable/cpu/op_where.cpp @@ -43,10 +43,13 @@ Tensor& where_out( static constexpr const char op_name[] = "where.self_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_tritensor_elementwise_fn( - [](const CTYPE_COMPUTE val_a, - const CTYPE_COMPUTE val_b, - const CTYPE_COMPUTE val_c) { return val_c ? val_a : val_b; }, + utils::apply_tritensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::SAME_AS_COMMON>( + [](const auto val_a, const auto val_b, const auto val_c) { + return val_c ? val_a : val_b; + }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, @@ -54,8 +57,7 @@ Tensor& where_out( utils::SupportedTensorDtypes::REALHBBF16, cond, utils::SupportedTensorDtypes::BOOL_OR_BYTE, - out, - utils::SupportedTensorDtypes::SAME_AS_COMMON); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/bitwise_op.h b/kernels/portable/cpu/pattern/bitwise_op.h index 6e4c111b8f2..f78ce796e6c 100644 --- a/kernels/portable/cpu/pattern/bitwise_op.h +++ b/kernels/portable/cpu/pattern/bitwise_op.h @@ -80,15 +80,18 @@ Tensor& bitwise_tensor_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
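+              // BitwiseFnForOp resolves to a functor (std::bit_and and
+              // friends) with a fixed scalar call signature, so there is
+              // nothing generic here for the utility to instantiate with
+              // vector types.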
BitwiseFnForOp::value, ctx, a, utils::SupportedTensorDtypes::INTB, b, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -121,16 +124,19 @@ Tensor& bitwise_scalar_out( ET_SWITCH_INT_TYPES_AND( Bool, compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return BitwiseFnForOp::value( val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::INTB, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/comparison_op.h b/kernels/portable/cpu/pattern/comparison_op.h index e0d9bf4dcab..643d7623922 100644 --- a/kernels/portable/cpu/pattern/comparison_op.h +++ b/kernels/portable/cpu/pattern/comparison_op.h @@ -91,15 +91,18 @@ Tensor& comparison_tensor_out( ScalarType compute_type = utils::get_compute_type(common_type); ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. ComparisonFnForOp::value, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; @@ -127,15 +130,18 @@ Tensor& comparison_scalar_out( ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { const CTYPE_COMPUTE val_b = utils::scalar_to(b); - utils::apply_unitensor_elementwise_fn( + utils::apply_unitensor_elementwise_fn< + CTYPE_COMPUTE, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( [val_b](const CTYPE_COMPUTE val_a) { + // TODO: rewrite this to be vectorization-capable. return ComparisonFnForOp::value(val_a, val_b); }, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); }); return out; diff --git a/kernels/portable/cpu/pattern/logical_op.h b/kernels/portable/cpu/pattern/logical_op.h index 017822a85a6..4547d3df51b 100644 --- a/kernels/portable/cpu/pattern/logical_op.h +++ b/kernels/portable/cpu/pattern/logical_op.h @@ -34,15 +34,18 @@ Tensor& logical_tensor_out( InvalidArgument, out); - utils::apply_bitensor_elementwise_fn( + utils::apply_bitensor_elementwise_fn< + bool, + op_name, + utils::SupportedTensorDtypes::REALHBBF16>( + // TODO: rewrite this to be vectorization-capable. 
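+      // `fn` is passed in at runtime rather than as a generic callable,
+      // so the utility can only ever invoke it with scalar bools.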
fn, ctx, a, utils::SupportedTensorDtypes::REALHBBF16, b, utils::SupportedTensorDtypes::REALHBBF16, - out, - utils::SupportedTensorDtypes::REALHBBF16); + out); return out; } From 21ae5da3534db7560d72fcec2f327167dd08240d Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 2 Apr 2025 10:09:57 -0700 Subject: [PATCH 27/27] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 35 ++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index b5cd980b085..eb1ee83111e 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -228,7 +228,7 @@ enum class SupportedTensorDtypes { namespace internal { template -load_to_compute_fn get_load_to_compute_fn( +load_to_compute_fn get_load_to_compute_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -252,7 +252,7 @@ load_to_compute_fn get_load_to_compute_fn( } template -store_compute_to_tensor_fn get_store_compute_to_tensor_fn( +store_compute_to_tensor_fn get_store_compute_to_tensor_fn_impl( const Tensor& t, SupportedTensorDtypes dtypes) { switch (dtypes) { @@ -285,6 +285,37 @@ store_compute_to_tensor_fn get_store_compute_to_tensor_fn( return nullptr; } +#ifndef EXECUTORCH_SELECTIVE_BUILD_DTYPE +constexpr const char kGenericElementwiseOpName[] = "generic_elementwise_op"; +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + +template +load_to_compute_fn get_load_to_compute_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_load_to_compute_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} + +template +store_compute_to_tensor_fn get_store_compute_to_tensor_fn( + const Tensor& t, + SupportedTensorDtypes dtypes) { + return get_store_compute_to_tensor_fn_impl< + CTYPE_COMPUTE, +#ifdef EXECUTORCH_SELECTIVE_BUILD_DTYPE + op_name +#else // EXECUTORCH_SELECTIVE_BUILD_DTYPE + kGenericElementwiseOpName +#endif // EXECUTORCH_SELECTIVE_BUILD_DTYPE + >(t, dtypes); +} bool check_tensor_dtype( const Tensor t, SupportedTensorDtypes dtypes,
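Taken together, the recurring change in these patches moves the output dtype list into the template parameter list of the apply_*_elementwise_fn utilities and, where possible, loosens lambdas to take `const auto` parameters. The sketch below is a self-contained toy model of why the generic parameters matter for the vectorization TODOs above; Vec4 and apply_elementwise are illustrative stand-ins, not the ExecuTorch API.

#include <cstddef>
#include <iostream>

// Toy stand-in for a SIMD type like at::vec::Vectorized<float>: same
// element-wise operations, different type, so only a generic callable
// can accept both it and plain float.
struct Vec4 {
  float v[4];
  Vec4 operator*(float s) const {
    Vec4 r;
    for (int i = 0; i < 4; ++i) {
      r.v[i] = v[i] * s;
    }
    return r;
  }
};

// Toy elementwise applier: instantiates `op` twice, once for the
// vectorized main loop and once for the scalar tail. A lambda with a
// typed float parameter would fail to compile for the Vec4 call, which
// is the situation the "rewrite this to be vectorization-capable"
// TODOs flag.
template <typename Op>
void apply_elementwise(Op op, const float* in, float* out, std::size_t n) {
  std::size_t i = 0;
  for (; i + 4 <= n; i += 4) {
    Vec4 x{{in[i], in[i + 1], in[i + 2], in[i + 3]}};
    const Vec4 y = op(x);  // requires op to accept Vec4
    for (int k = 0; k < 4; ++k) {
      out[i + k] = y.v[k];
    }
  }
  for (; i < n; ++i) {
    out[i] = op(in[i]);  // scalar tail: requires op to accept float
  }
}

int main() {
  const float alpha = 2.0f;
  const float in[6] = {1, 2, 3, 4, 5, 6};
  float out[6] = {};
  // Generic lambda, like the rewritten op_add/op_mul/op_sub lambdas:
  // `const auto` lets one body instantiate for float and Vec4 alike.
  apply_elementwise([alpha](const auto x) { return x * alpha; }, in, out, 6);
  for (const float f : out) {
    std::cout << f << ' ';
  }
  std::cout << '\n';
  return 0;
}

The same shape explains why the TODO-marked lambdas were left with typed parameters for now: bodies that branch per element, write captured flags, or call scalar-only helpers cannot simply be re-declared with `auto` until those helpers grow vector-capable overloads.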