Skip to content

ggml: GGML_NATIVE uses -mcpu=native on ARM #10752

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,10 @@ if (NOT GGML_CUDA_GRAPHS_DEFAULT)
endif()

# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)

# debug
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
Expand Down
105 changes: 44 additions & 61 deletions ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,70 +111,53 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
endif ()

set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
elseif (APPLE)
if (GGML_NATIVE)
set(USER_PROVIDED_MARCH FALSE)
foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS)
if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+")
set(USER_PROVIDED_MARCH TRUE)
break()
endif()
endforeach()

if (NOT USER_PROVIDED_MARCH)
set(MARCH_FLAGS "-march=armv8.2a")

check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
if (GGML_COMPILER_SUPPORT_DOTPROD)
set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD)

message(STATUS "ARM feature DOTPROD enabled")
endif ()

set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm")

set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}")

check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm")
list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8)

message(STATUS "ARM feature MATMUL_INT8 enabled")
endif ()

set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})

list(APPEND ARCH_FLAGS "${MARCH_FLAGS}")
endif ()
endif ()
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
if (GGML_NATIVE)
list(APPEND ARCH_FLAGS -mcpu=native)

# Show enabled features
execute_process(
COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E -
INPUT_FILE "/dev/null"
OUTPUT_VARIABLE ARM_FEATURE
RESULT_VARIABLE ARM_FEATURE_RESULT
)
if (ARM_FEATURE_RESULT)
message(WARNING "Failed to get ARM features")
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC)
string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos)
if (NOT ${feature_pos} EQUAL -1)
message(STATUS "ARM feature ${feature} enabled")
endif()
endforeach()
endif()
else()
Copy link
Contributor Author

@angt angt Dec 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Putting all the old code in an else branch might be too drastic, but I guess the other cases are only relevant when cross-compiling.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the other code in fact seems to be doing the same thing that -march=native would do. GGML_NATIVE disabled should generate a consistent build depending on the flags specified during compilation, which is not the case at the moment.

This needs to be completely revamped, and as it is, this PR is just adding to the mess that will need to be cleaned up later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can help with revamping but I need some clarification first :)

Today, building on generic ARM gives very poor performance because the build system completely ignores the GGML_NATIVE directive, so I just aligned the current code with the current description of GGML_NATIVE found in ggml/CMakeLists.txt:

option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})

I completely agree that it's not the best way to get performance but it's better than nothing and it already fixes many modern setups.

So, do we want to relax the definition of GGML_NATIVE and allow using, for example, -mcpu=native on ARM, which would be much better for performance?

The old code was clearly aimed at small devices, like Android and Raspberry Pi, and it also used CMAKE_SYSTEM_PROCESSOR. So, for me, it wasn't used as a way to fix -march=native at all, but rather as a way to find acceptable flags when cross-compiling, and in that case you really don't want GGML_NATIVE (hence the move to the else branch).

Maybe @ggerganov has some memories to share about that?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I believe these flags were mostly set by trial and error, back when we were running whisper.cpp on some Raspberry Pi boards. But this is very likely wrong, as I didn't really understand the specifics, and it should be revamped. I'm not really an expert and I still get quite confused with all the different Arm architectures, so whatever you think makes sense to improve this is welcome. I can test changes on the entire spectrum of Apple Silicon if necessary.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I understand correctly, with gcc/clang it is enough to set the correct architecture flags with -march, and -march=native should work in the same way as on x86. The exception is likely to be MSVC once again, because it does not set the preprocessor definitions of the enabled ARM features. In which case, we may consider just dropping support for MSVC with ARM entirely, because it is a constant source of problems, doesn't work with the inline asm kernels, and doesn't really add anything beyond clang or possibly MINGW.

I believe this should work:

  • Set -march=native if GGML_NATIVE is enabled
  • Add a parameter GGML_CPU_ARCH to the build to set the architecture, so that if GGML_NATIVE is disabled and this parameter is provided, then -march=${GGML_CPU_ARCH} is used.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On ARM, to build for local use (i.e. using GGML_NATIVE), -mcpu=native alone should be the best option as far as I know. Using -march=native will often miss some opportunities, and -mtune=native will optimize for the current microarchitecture (so the build is still not fully optimized for the CPU).

So I think redefining GGML_NATIVE to something like "Try to optimize builds for the current cpu" and using -march=native on x86_64 and -mcpu=native on ARM would already be much simpler and an improvement.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this sounds good to you, I can adapt this PR in this direction so we can see how it works in practice.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, sounds good.

check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
# Android armeabi-v7a
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
else()
# Raspberry Pi 2
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (GGML_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Android arm64-v8a
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (GGML_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
Expand Down
Loading