diff --git a/CMakeLists.txt b/CMakeLists.txt
index b34ed07a10e..6dbb66afdaa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -645,13 +645,18 @@ target_link_options_shared_lib(executorch)
 # Real integrations should supply their own YAML file that only lists the
 # operators necessary for the models that will run.
 #
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # Find the PyTorch headers here to make them available to all
+  # sub-directories. Find them before including portable so that
+  # optimized_portable_kernels can use them.
+  find_package_torch_headers()
+endif()
+
 if(BUILD_EXECUTORCH_PORTABLE_OPS)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
-  # find pytorch lib here to make it available to all sub-directories
-  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
index 7cba9e91fe5..693be68c35e 100644
--- a/kernels/optimized/CMakeLists.txt
+++ b/kernels/optimized/CMakeLists.txt
@@ -62,6 +62,7 @@ message("Generated files ${gen_command_sources}")
 list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
 add_library(optimized_kernels ${_optimized_kernels__srcs})
 target_include_directories(optimized_kernels PRIVATE ${TORCH_INCLUDE_DIRS} "${EXECUTORCH_ROOT}/third-party/pocketfft")
+target_compile_definitions(optimized_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
 target_link_libraries(
   optimized_kernels PUBLIC executorch_core cpublas extension_threadpool
 )
diff --git a/kernels/portable/CMakeLists.txt b/kernels/portable/CMakeLists.txt
index e27ba12ac0d..edea045d65f 100644
--- a/kernels/portable/CMakeLists.txt
+++ b/kernels/portable/CMakeLists.txt
@@ -66,13 +66,13 @@ gen_operators_lib(
 # Portable kernels support optional parallelization (and, in the
 # future, perhaps other performance features). If support is present,
 # produce an optimized version.
-set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)
-
-if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
+if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
   add_library(optimized_portable_kernels ${_portable_kernels__srcs})
   target_link_libraries(optimized_portable_kernels PRIVATE executorch)
   target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
   target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
+  target_include_directories(optimized_portable_kernels PRIVATE ${TORCH_INCLUDE_DIRS})
+  target_compile_definitions(optimized_portable_kernels PRIVATE ET_USE_PYTORCH_HEADERS)
   install(
     TARGETS optimized_portable_kernels
     DESTINATION lib
diff --git a/kernels/portable/cpu/util/elementwise_util.h b/kernels/portable/cpu/util/elementwise_util.h
index 206be87f98e..4e0718bc522 100644
--- a/kernels/portable/cpu/util/elementwise_util.h
+++ b/kernels/portable/cpu/util/elementwise_util.h
@@ -51,12 +51,8 @@ inline int64_t scalar_to(const Scalar& s) {
 }
 
 namespace internal {
-template <
-    typename CTYPE_COMPUTE,
-    const char* op_name,
-    typename Op,
-    typename... Args>
-inline void apply_elementwise_fn(
+template <typename CTYPE_COMPUTE, typename Op, typename... Args>
+inline bool validate_elementwise_fn_inputs(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
     const Tensor& out,
@@ -65,7 +61,6 @@
   static_assert(
       (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
        ...));
-  constexpr auto kNumInputs = sizeof...(inputs);
   constexpr auto compute_type = CppTypeToScalarType<CTYPE_COMPUTE>::value;
   const auto check_input_dtype = [](auto input, auto compute_type) {
     return internal::check_tensor_dtype(
@@ -75,7 +70,30 @@
       ctx,
       (check_input_dtype(inputs, compute_type) && ...) &&
           internal::check_tensor_dtype(out, out_dtypes, compute_type),
-      InvalidArgument, );
+      InvalidArgument,
+      false);
+
+  return true;
+}
+
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    typename Op,
+    typename... Args>
+inline void apply_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes,
+    Args... inputs) {
+  const bool inputs_valid = validate_elementwise_fn_inputs<CTYPE_COMPUTE>(
+      compute_fun, ctx, out, out_dtypes, inputs...);
+  if (!inputs_valid) {
+    return;
+  }
+
+  constexpr auto kNumInputs = sizeof...(inputs);
 
   struct InputInfo {
     load_to_compute_fn<CTYPE_COMPUTE> load_to_compute;
@@ -120,6 +138,7 @@
   });
 }
 
+/// DEPRECATED: prefer the variant with out_dtypes in the template argument.
 template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
 inline void apply_unitensor_elementwise_fn(
     const Op& compute_fun,
@@ -132,12 +151,53 @@
       compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
 }
 
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
+inline void apply_unitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+      compute_fun, ctx, out, out_dtypes, std::make_pair(&a, a_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+inline void apply_bitensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& out,
+    SupportedTensorDtypes out_dtypes) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
 /**
  * Useful for bi-tensor elementwise operators. For each element of the inputs,
  * perform a computation and write to the corresponding element of the output.
  * Tensor broadcasting is applied wherever it is required.
  */
-template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
 inline void apply_bitensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
@@ -145,6 +205,29 @@ inline void apply_bitensor_elementwise_fn(
     SupportedTensorDtypes a_dtypes,
     const Tensor& b,
     SupportedTensorDtypes b_dtypes,
+    const Tensor& out) {
+  internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
+      compute_fun,
+      ctx,
+      out,
+      out_dtypes,
+      std::make_pair(&a, a_dtypes),
+      std::make_pair(&b, b_dtypes));
+}
+
+/**
+ * DEPRECATED: prefer the variant with out_dtypes in the template argument list.
+ */
+template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+inline void apply_tritensor_elementwise_fn(
+    const Op& compute_fun,
+    KernelRuntimeContext& ctx,
+    const Tensor& a,
+    SupportedTensorDtypes a_dtypes,
+    const Tensor& b,
+    SupportedTensorDtypes b_dtypes,
+    const Tensor& c,
+    SupportedTensorDtypes c_dtypes,
     const Tensor& out,
     SupportedTensorDtypes out_dtypes) {
   internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
@@ -153,7 +236,8 @@
       out,
       out_dtypes,
       std::make_pair(&a, a_dtypes),
-      std::make_pair(&b, b_dtypes));
+      std::make_pair(&b, b_dtypes),
+      std::make_pair(&c, c_dtypes));
 }
 
 /**
@@ -176,7 +260,11 @@
  * static constexpr const char op_name[] = "my_op";
  * apply_ternary_elementwise_fn<CTYPE_COMPUTE, op_name>.
  */
-template <typename CTYPE_COMPUTE, const char* op_name, typename Op>
+template <
+    typename CTYPE_COMPUTE,
+    const char* op_name,
+    SupportedTensorDtypes out_dtypes,
+    typename Op>
 inline void apply_tritensor_elementwise_fn(
     const Op& compute_fun,
     KernelRuntimeContext& ctx,
@@ -186,8 +274,7 @@ inline void apply_tritensor_elementwise_fn(
     SupportedTensorDtypes b_dtypes,
     const Tensor& c,
     SupportedTensorDtypes c_dtypes,
-    const Tensor& out,
-    SupportedTensorDtypes out_dtypes) {
+    const Tensor& out) {
   internal::apply_elementwise_fn<CTYPE_COMPUTE, op_name>(
       compute_fun,
       ctx,
diff --git a/runtime/core/portable_type/c10/c10/targets.bzl b/runtime/core/portable_type/c10/c10/targets.bzl
index dbe35f8eefd..d9d72b5be3f 100644
--- a/runtime/core/portable_type/c10/c10/targets.bzl
+++ b/runtime/core/portable_type/c10/c10/targets.bzl
@@ -73,6 +73,7 @@ def define_common_targets():
             # -Wmacro-redefined, and we only care about getting
             # reasonable vectorization and Sleef support.
             "-DCPU_CAPABILITY_AVX2",
+            "-DET_USE_PYTORCH_HEADERS",
             "-DHAVE_AVX2_CPU_DEFINITION",
             "-DSTANDALONE_TORCH_HEADER",
         ] + get_sleef_preprocessor_flags(),
@@ -86,5 +87,5 @@
             # linker failure.
             "ovr_config//cpu:arm64": get_sleef_preprocessor_flags(),
             "DEFAULT": [],
-        }) + ["-DSTANDALONE_TORCH_HEADER"],
+        }) + ["-DSTANDALONE_TORCH_HEADER"] + ([] if runtime.is_oss else ["-DET_USE_PYTORCH_HEADERS"]),
     )
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 3932f1097e1..812e8e4a67a 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -68,5 +68,18 @@ if(CMAKE_BUILD_TYPE EQUAL "Release")
   target_link_options(size_test_all_ops PRIVATE "LINKER:--gc-sections")
 endif()
 
+#
+# size_test_all_optimized_ops: binary with optimized ops and no delegate backend
+#
+if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  add_executable(size_test_all_optimized_ops ${_size_test__srcs})
+  target_link_options_shared_lib(optimized_native_cpu_ops_lib)
+  target_link_libraries(
+    size_test_all_optimized_ops executorch optimized_native_cpu_ops_lib)
+  if(CMAKE_BUILD_TYPE STREQUAL "Release")
+    target_link_options(size_test_all_optimized_ops PRIVATE "LINKER:--gc-sections")
+  endif()
+endif()
+
 # Print all summary
 executorch_print_configuration_summary()
diff --git a/test/build_optimized_size_test.sh b/test/build_optimized_size_test.sh
new file mode 100644
index 00000000000..181c2ce617d
--- /dev/null
+++ b/test/build_optimized_size_test.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Unlike build_size_test.sh, this script:
+# - does not attempt to disable exceptions and RTTI
+# - as a consequence, is able to build optimized kernels
+# - uses MinSizeRel builds
+# - is not currently intended to run in CI
+# - sets -g to make it easier to use tools like bloaty to investigate size
+
+set -e
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/../.ci/scripts/utils.sh"
+
+cmake_install_executorch_lib() {
+  echo "Installing libexecutorch.a"
+  clean_executorch_install_folders
+  update_tokenizers_git_submodule
+  CXXFLAGS="-g" retry cmake -DBUCK2="$BUCK2" \
+          -DCMAKE_CXX_STANDARD_REQUIRED=ON \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_BUILD_TYPE=MinSizeRel \
+          -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DOPTIMIZE_SIZE=ON \
+          -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+          -Bcmake-out .
+  cmake --build cmake-out -j9 --target install --config MinSizeRel
+}
+
+test_cmake_size_test() {
+  CXXFLAGS="-g" retry cmake -DCMAKE_BUILD_TYPE=MinSizeRel -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON -DCMAKE_INSTALL_PREFIX=cmake-out -Bcmake-out/test test
+
+  echo "Build size test"
+  cmake --build cmake-out/test -j9 --config MinSizeRel
+
+  echo 'ExecuTorch with no ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test
+
+  echo 'ExecuTorch with portable ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_ops
+
+  echo 'ExecuTorch with optimized ops binary size, unstripped:'
+  ls -al cmake-out/test/size_test_all_optimized_ops
+}
+
+if [[ -z $PYTHON_EXECUTABLE ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+cmake_install_executorch_lib
+test_cmake_size_test
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index 49aa6cf08af..56c7fa2d7d4 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -149,7 +149,7 @@ endif()
 
 if(TARGET coremldelegate)
   set_target_properties(
     coremldelegate PROPERTIES INTERFACE_LINK_LIBRARIES
-                              "coreml_inmemoryfs;coreml_util"
+    "coreml_inmemoryfs;coreml_util"
   )
 endif()
@@ -167,4 +167,8 @@ if(TARGET optimized_native_cpu_ops_lib)
 endif()
 if(TARGET extension_threadpool)
   target_compile_definitions(extension_threadpool INTERFACE ET_USE_THREADPOOL)
+  set_target_properties(
+    extension_threadpool PROPERTIES INTERFACE_LINK_LIBRARIES
+    "cpuinfo;pthreadpool"
+  )
 endif()
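---

Notes (reviewer commentary, not part of the patch):

The new apply_unitensor/bitensor/tritensor_elementwise_fn overloads move
out_dtypes from a runtime parameter into the template argument list, so the
output-store path can be chosen at compile time; the runtime-parameter forms
remain as DEPRECATED shims. A minimal sketch of a caller against the new
interface -- the op name, the lambda, the dtype sets, and the surrounding
function are illustrative assumptions, not taken from this diff:

    // Hypothetical kernel body; only the shape of the new
    // apply_unitensor_elementwise_fn overload comes from this patch.
    Tensor& my_square_out(
        KernelRuntimeContext& ctx, const Tensor& in, Tensor& out) {
      static constexpr const char op_name[] = "my_square.out";
      const ScalarType compute_type = in.scalar_type();
      ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
        utils::apply_unitensor_elementwise_fn<
            CTYPE_COMPUTE,
            op_name,
            utils::SupportedTensorDtypes::SAME_AS_COMMON>(
            [](const CTYPE_COMPUTE val) { return val * val; },  // per element
            ctx,
            in,
            utils::SupportedTensorDtypes::REALHBBF16,
            out);
      });
      return out;
    }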
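On the build side, optimized_portable_kernels is now gated on both
EXECUTORCH_BUILD_PTHREADPOOL and EXECUTORCH_BUILD_KERNELS_OPTIMIZED (the old
set()/if() pair always evaluated true, because it tested the literal string
rather than the flag's value), and both optimized kernel targets now see
TORCH_INCLUDE_DIRS plus the ET_USE_PYTORCH_HEADERS define. An illustrative
configure line exercising the new path (flag values are assumptions, not
mandated by the patch):

    cmake -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
          -DEXECUTORCH_BUILD_PTHREADPOOL=ON \
          -Bcmake-out .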
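build_optimized_size_test.sh compiles with -g precisely so that size tools
can attribute bytes. A typical follow-up, assuming bloaty is installed (the
binary paths match the script's output; the command itself is not part of
the patch):

    # Size diff: optimized-ops binary vs. the portable-ops binary.
    bloaty cmake-out/test/size_test_all_optimized_ops \
        -- cmake-out/test/size_test_all_ops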