From 31a49e0212d88b6b16979f926f437beed9cde1dc Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:07 -0700 Subject: [PATCH 1/3] Update [ghstack-poisoned] --- CMakeLists.txt | 9 +++++++-- kernels/optimized/CMakeLists.txt | 1 + kernels/portable/CMakeLists.txt | 2 ++ runtime/core/portable_type/c10/c10/targets.bzl | 3 ++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3a53b8a6e2a..8fe08a2c25e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch) # Real integrations should supply their own YAML file that only lists the # operators necessary for the models that will run. # +if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) + # find pytorch lib here to make it available to all + # sub-directories. Find it before including portable so that + # optimized_portable_kernels can use it. + find_package_torch_headers() +endif() + if(BUILD_EXECUTORCH_PORTABLE_OPS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable) endif() if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED) - # find pytorch lib here to make it available to all sub-directories - find_package_torch_headers() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized) endif() diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt index 7cba9e91fe5..693be68c35e 100644 --- a/kernels/optimized/CMakeLists.txt +++ b/kernels/optimized/CMakeLists.txt @@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}") list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/") add_library(optimized_kernels ${_optimized_kernels__srcs}) target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft") +target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS) target_link_libraries( optimized_kernels PUBLIC executorch_core cpublas extension_threadpool ) diff --git 
a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt index e27ba12ac0d..53ad88880d6 100644 --- a/kernels/portable/CMakeLists.txt +++ b/kernels/portable/CMakeLists.txt @@ -73,6 +73,8 @@ if(BUILD_OPTIMIZED_PORTABLE_KERNELS) target_link_libraries(optimized_portable_kernels PRIVATE executorch) target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool) target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options}) + target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS}) + target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS) install( TARGETS optimized_portable_kernels DESTINATION lib diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl index a727cb19ac1..b8883c75bfe 100644 --- a/runtime/core/portable_type/c10/c10/targets.bzl +++ b/runtime/core/portable_type/c10/c10/targets.bzl @@ -73,6 +73,7 @@ def define_common_targets(): # -Wmacro-redefined, and we only care about getting # reasonable vectorization and Sleef support. "-DCPU_CAPABILITY_AVX2", + "-DET_USE_PYTORCH_HEADERS", "-DHAVE_AVX2_CPU_DEFINITION", "-DSTANDALONE_TORCH_HEADER", ] + get_sleef_preprocessor_flags(), @@ -87,5 +88,5 @@ def define_common_targets(): # linker failure. 
"ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), "DEFAULT": [], - }) + ["-DSTANDALONE_TORCH_HEADER"], + }) + ["-DET_USE_PYTORCH_HEADERS", "-DSTANDALONE_TORCH_HEADER"], ) From 9fcd8857fb0e00bee0b401f5e25f1fd081fe3c9c Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 18 Mar 2025 17:32:12 -0700 Subject: [PATCH 2/3] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 11 ---------- kernels/portable/cpu/util/elementwise_util.h | 23 ++++++++++++++++---- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 2bbd5de4577..59b82cdc51b 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -86,12 +86,6 @@ load_to_common_fn get_load_to_common_fn_bool_or_byte( template load_to_common_fn get_load_to_common_fn_same_as_compute( const Tensor& t) { - constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::load_and_convert; } @@ -180,11 +174,6 @@ template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { constexpr auto common_scalar_type = CppTypeToScalarType::value; - ET_CHECK_MSG( - t.scalar_type() == common_scalar_type, - "Unhandled dtype %s for %s", - ::executorch::runtime::toString(common_scalar_type), - op_name); return internal::convert_and_store; } diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h index f5932069005..021ec42bf27 100644 --- a/kernels/portable/cpu/util/elementwise_util.h +++ b/kernels/portable/cpu/util/elementwise_util.h @@ -51,6 +51,13 @@ inline int64_t scalar_to(const Scalar& s) { } namespace internal { +template +using ignore_first_yield_second = T; + +template +using op_call_result = + std::invoke_result_t...>; + 
template < typename CTYPE_COMMON, const char* op_name, @@ -89,9 +96,16 @@ inline void apply_elementwise_fn( inputs.first->element_size(), })...}; - const auto store_common_to_out = - internal::get_store_common_to_tensor_fn( - out, out_dtypes); + // NOTE: the result of compute_fun is not necessarily CTYPE_COMMON! + // For example, consider the possibility that compute_fun is a + // trigonometric function like acos, the common input type is bool, + // and the output type is float -- we would truncate acos(0) ~= 1.57 + // to just 1. Conveniently, it costs us nothing at runtime to handle + // this correctly. + const auto store_compute_result_to_out = + internal::get_store_common_to_tensor_fn< + op_call_result, + op_name>(out, out_dtypes); char* const data_out = reinterpret_cast(out.mutable_data_ptr()); const auto out_element_size = out.element_size(); @@ -114,7 +128,8 @@ inline void apply_elementwise_fn( .data_ptr[indexes[idx + 1] * input_info.element_size]); } auto result = std::apply(compute_fun, loaded_inputs); - store_common_to_out(result, &data_out[indexes[0] * out_element_size]); + store_compute_result_to_out( + result, &data_out[indexes[0] * out_element_size]); } }); } From 40c1b1be46d2ad91f6ca39fe3008d9b685d3f45b Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Wed, 19 Mar 2025 09:58:10 -0700 Subject: [PATCH 3/3] Update [ghstack-poisoned] --- kernels/portable/cpu/util/dtype_util.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/kernels/portable/cpu/util/dtype_util.h b/kernels/portable/cpu/util/dtype_util.h index 76579301850..1f0e3403e82 100644 --- a/kernels/portable/cpu/util/dtype_util.h +++ b/kernels/portable/cpu/util/dtype_util.h @@ -173,27 +173,13 @@ get_store_common_to_tensor_fn_bool_or_byte(const Tensor& t) { template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_compute(const Tensor& t) { - return internal::convert_and_store; + // We already validate tensor types earlier in the process, so at 
+ // this phase, treat same_as_compute the same as our widest + // SupportedTensorDtypes set. + return get_store_common_to_tensor_fn_realhbf16(t); } -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> -store_common_to_tensor_fn -get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { - void (*result)(CTYPE_COMMON, void*) = nullptr; - ET_SWITCH_THREE_TYPES( - Float, Half, BFloat16, t.scalar_type(), unused, op_name, CTYPE, [&]() { - result = internal::convert_and_store; - }); - return result; -} - -template < - typename CTYPE_COMMON, - const char* op_name, - std::enable_if_t, bool> = true> +template store_common_to_tensor_fn get_store_common_to_tensor_fn_same_as_common(const Tensor& t) { return get_store_common_to_tensor_fn_same_as_compute(