diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a6d25b50bd..6c1eed33f3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -229,3 +229,6 @@ In chronological order: * Christopher Daley * [2024-01-24] Optimize GEMV forwarding on ARM64 systems + +* Aymen Qader + * [2024-12-09] Add Arm®v9-A architecture SME2 SGEMM kernels diff --git a/Makefile.arm64 b/Makefile.arm64 index fccc0d0d0f..f0a6ef2cbd 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve endif endif +ifeq ($(CORE), ARMV9SME) +CCOMMON_OPT += -march=armv9-a+sme2 -O3 +FCOMMON_OPT += -march=armv9-a+sve2 -O3 +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) diff --git a/Makefile.system b/Makefile.system index 29ea819f13..b9b2453f5b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 ifeq ($(C_COMPILER), GCC) export NO_SVE = 1 +export NO_SME = 1 endif else export MACOSX_DEPLOYMENT_TARGET=10.8 @@ -709,6 +710,11 @@ DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE DYNAMIC_CORE += A64FX endif +# Disabled by default while ARMV9SME is WIP +NO_SME ?= 1 +ifneq ($(NO_SME), 1) +DYNAMIC_CORE += ARMV9SME +endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 @@ -1474,6 +1480,10 @@ ifeq ($(NO_SVE), 1) CCOMMON_OPT += -DNO_SVE endif +ifeq ($(NO_SME), 1) +CCOMMON_OPT += -DNO_SME +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/README.md b/README.md index d8e73b2022..0857194245 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Apple Vortex**: preliminary support based on ThunderX2/3 - **A64FX**: preliminary support, optimized Level-3 BLAS - **ARMV8SVE**: any ARMV8 cpu with SVE extensions +- **ARMV9SME**: WIP target, any Arm®v9-A core with SME2 support. Only functional for GEMM. 
#### PPC/PPC64 diff --git a/TargetList.txt b/TargetList.txt index 25eeddfb00..232e12ffa6 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -111,6 +111,7 @@ THUNDERX3T110 VORTEX A64FX ARMV8SVE +ARMV9SME FT2000 9.System Z: diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 27ba6f8727..ec91a2d598 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,9 +44,21 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() + elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 775239e1cd..2a48ba5ab5 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL ARMV9SME) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme2") + endif () +endif () + if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} 
-march=armv8-a+sve") diff --git a/cmake/prebuild.cmake index 53a78d782f..f6ca73b7b6 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1014,7 +1014,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 4ac244e3ea..d49de6f7c2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -310,6 +310,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") endif() endif() + if (${TARGET} STREQUAL ARMV9SME) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme2 -O3") + endif() if (${TARGET} STREQUAL A64FX) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") diff --git a/common_arm64.h b/common_arm64.h index 595a01995a..5856898a2b 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) #define BUFFER_SIZE (32 << 22) #else #define BUFFER_SIZE (32 << 20) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index b7328876b4..e22bcb1079 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -345,6 +345,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires 
n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#elif defined(ARMV9SME) && !defined(DOUBLE) && !defined(COMPLEX) + /* the current SME SGEMM kernel requires n>=8*GEMM_UNROLL_N to achieve best performance */ + if (min_jj >= 8*GEMM_UNROLL_N) min_jj = 8*GEMM_UNROLL_N; #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index dc88d816fb..828eccd138 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif +#ifdef DYN_ARMV9SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 #endif + +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#endif + extern gotoblas_t gotoblas_THUNDERX3T110; #endif #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 @@ -393,6 +405,13 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } + +#if !defined(NO_SME) && defined(HWCAP2_SME2) + if ((getauxval(AT_HWCAP2) & HWCAP2_SME2)) { + return &gotoblas_ARMV9SME; + } +#endif + #ifndef NO_SVE if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { return &gotoblas_ARMV8SVE; diff --git a/getarch.c b/getarch.c index 826dd1ce0a..2097f230f0 100644 --- a/getarch.c +++ b/getarch.c @@ -1289,6 +1289,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "ARMV8SVE" #endif +#ifdef FORCE_ARMV9SME +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV9SME" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV9SME " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" +#define LIBNAME "armv9sme" +#define CORENAME "ARMV9SME" +#endif + #ifdef FORCE_ARMV8 #define FORCE diff --git a/kernel/arm64/KERNEL.ARMV9SME b/kernel/arm64/KERNEL.ARMV9SME new file mode 100644 index 0000000000..2e00ae4a70 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV9SME @@ -0,0 +1,8 @@ +include $(KERNELDIR)/KERNEL.ARMV8SVE + +SGEMMKERNEL = sgemm_kernel_sme.c + +SGEMMINCOPY = sgemm_ncopy_sme.c +SGEMMITCOPY = sgemm_tcopy_sme.c +SGEMMONCOPY = sgemm_ncopy_sme.c +SGEMMOTCOPY = sgemm_tcopy_sme.c diff --git a/kernel/arm64/sgemm_kernel_sme.c b/kernel/arm64/sgemm_kernel_sme.c new file mode 100644 index 0000000000..073c97f5e0 --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sme.c @@ -0,0 +1,188 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <arm_sme.h> + +#include "common.h" +#include "sme_abi.h" + +// Outer product kernel. +// Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA. +// This kernel is unpredicated, and assumes a full 2SVL x 2SVL block. 
+__attribute__((always_inline)) inline void +kernel_2x2(const float *A, const float *B, float *C, float alpha, + size_t shared_dim, size_t a_step, size_t b_step, size_t c_step) + __arm_out("za") __arm_streaming { + const size_t svl = svcntw(); + + // Predicate set-up + svbool_t ptrue = svptrue_b32(); + + // Load from C into ZA + for (size_t i = 0; i < (svl >> 1); i++) { + svld1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]); + svld1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]); + svld1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]); + svld1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]); + } + + svfloat32_t alpha_vec = svdup_f32(alpha); + + // Iterate through shared dimension (K) + for (size_t k = 0; k < shared_dim; k++) { + // Load column of A + svfloat32x2_t cols_a = svld1_x2(svptrue_c32(), &A[k * a_step]); + + // Load row of B + svfloat32x2_t rows_b = svld1_x2(svptrue_c32(), &B[k * b_step]); + + // Multiply B through by alpha + svfloat32_t row_b_0 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 0)); + svfloat32_t row_b_1 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 1)); + + // Perform outer products + svmopa_za32_m(0, ptrue, ptrue, svget2(cols_a, 0), row_b_0); + svmopa_za32_m(1, ptrue, ptrue, svget2(cols_a, 1), row_b_0); + svmopa_za32_m(2, ptrue, ptrue, svget2(cols_a, 0), row_b_1); + svmopa_za32_m(3, ptrue, ptrue, svget2(cols_a, 1), row_b_1); + } + + // Store out to C from ZA + for (size_t i = 0; i < (svl >> 1); i++) { + // Store out one row of C per tile + svst1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]); + svst1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]); + svst1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]); + svst1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]); + } +} + +// Outer product kernel. +// Computes an SVL x SVL block of C, utilizing a single FP32 tile of ZA (ZA0). +// This kernel is predicated, and can handle under-filled blocks. 
+__attribute__((always_inline)) inline void +kernel_1x1(const float *A, const float *B, float *C, float alpha, + size_t shared_dim, size_t a_len, size_t a_step, size_t b_len, + size_t b_step, size_t c_step, size_t c_rows, size_t c_cols) + __arm_out("za") __arm_streaming { + + // Predicate set-up + svbool_t pg = svptrue_b32(); + svbool_t pg_a = svwhilelt_b32_u64(0, a_len); + svbool_t pg_b = svwhilelt_b32_u64(0, b_len); + svbool_t pg_c = svwhilelt_b32_u64(0, c_rows); + + // Load from C into ZA + for (size_t i = 0; i < c_cols; i++) { + svld1_ver_za32(0, i, pg_c, &C[i * c_step]); + } + + svfloat32_t alpha_vec = svdup_f32_z(pg_b, alpha); + + // Iterate through shared dimension (K) + for (size_t k = 0; k < shared_dim; k++) { + // Load column of A + svfloat32_t col_a = svld1(pg_a, &A[k * a_step]); + // Load row of B + svfloat32_t row_b = svld1(pg_b, &B[k * b_step]); + // Multiply B through by alpha + row_b = svmul_x(pg_b, alpha_vec, row_b); + // Perform outer product + svmopa_za32_m(0, pg, pg, col_a, row_b); + } + + // Store out to C from ZA + for (size_t i = 0; i < c_cols; i++) { + svst1_ver_za32(0, i, pg_c, &C[i * c_step]); + } +} + +__arm_new("za") __arm_locally_streaming + int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT *ba, + FLOAT *bb, FLOAT *C, BLASLONG ldc) { + + const BLASLONG num_rows = bm; + const BLASLONG num_cols = bn; + + const FLOAT *a_ptr = ba; + const FLOAT *b_ptr = bb; + FLOAT *c_ptr = C; + + const BLASLONG svl = svcntw(); + + const BLASLONG a_step = bm; + const BLASLONG b_step = bn; + const BLASLONG c_step = ldc; + + // Block over rows of C (panels of A) + BLASLONG row_idx = 0; + + // 2x2 loop + BLASLONG row_batch = 2 * svl; + + // Block over row dimension of C + for (; row_idx + row_batch <= num_rows; row_idx += row_batch) { + BLASLONG col_idx = 0; + BLASLONG col_batch = 2 * svl; + + // Block over column dimension of C + for (; col_idx + col_batch <= num_cols; col_idx += col_batch) { + kernel_2x2(&a_ptr[row_idx], &b_ptr[col_idx], + 
&c_ptr[row_idx + col_idx * c_step], alpha0, bk, a_step, b_step, + c_step); + } + + // Handle under-filled blocks w/ 2x(1x1) kernels + col_batch = 1 * svl; + for (; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); + + kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx], + &c_ptr[row_idx + col_idx * c_step], alpha0, bk, svl, a_step, + col_batch, b_step, c_step, svl, col_batch); + + kernel_1x1(&a_ptr[row_idx + svl], &b_ptr[col_idx], + &c_ptr[(row_idx + svl) + col_idx * c_step], alpha0, bk, svl, + a_step, col_batch, b_step, c_step, svl, col_batch); + } + } + + // Handle under-filled blocks w/ 1x1 kernels + row_batch = 1 * svl; + for (; row_idx < num_rows; row_idx += row_batch) { + row_batch = MIN(row_batch, num_rows - row_idx); + // Block over column dimension of C + BLASLONG col_batch = svl; + for (BLASLONG col_idx = 0; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); + kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx], + &c_ptr[row_idx + col_idx * c_step], alpha0, bk, row_batch, + a_step, col_batch, b_step, c_step, row_batch, col_batch); + } + } + return 0; +} diff --git a/kernel/arm64/sgemm_ncopy_sme.c b/kernel/arm64/sgemm_ncopy_sme.c new file mode 100644 index 0000000000..8a2271b886 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_sme.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <arm_sme.h> + +#include "common.h" +#include "sme_abi.h" + +// Transpose 1SVL x N panel of A into B +__attribute__((always_inline)) inline static void +transpose_panel(const FLOAT *a, FLOAT *b, BLASLONG rows, BLASLONG cols, + BLASLONG a_step, BLASLONG b_step) + __arm_out("za") __arm_streaming { + BLASLONG col_batch = svcntsw(); + const svbool_t pg_a = svwhilelt_b32_u64(0, rows); + + for (BLASLONG k = 0; k < cols; k += col_batch) { + col_batch = MIN(col_batch, cols - k); + for (BLASLONG col = 0; col < col_batch; col++) { + svld1_ver_za32(0, col, pg_a, &a[(col + k) * a_step]); + } + + const svbool_t pg_b = svwhilelt_b32_u64(k, cols); + for (BLASLONG row = 0; row < rows; row++) { + svst1_hor_za32(0, row, pg_b, &b[row * b_step + k]); + } + } +} + +__arm_new("za") __arm_locally_streaming + int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) { + const BLASLONG num_rows = m; + BLASLONG row_batch = svcntsw(); + for (BLASLONG row_idx = 0; 
row_idx < num_rows; row_idx += row_batch) { + // Transpose 1xSVL panel + row_batch = MIN(row_batch, num_rows - row_idx); + transpose_panel(&a[row_idx], &b[row_idx * n], row_batch, n, lda, n); + } + return 0; +} diff --git a/kernel/arm64/sgemm_tcopy_sme.c b/kernel/arm64/sgemm_tcopy_sme.c new file mode 100644 index 0000000000..4687a73fa8 --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_sme.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <arm_sme.h> + +#include "common.h" +#include "sme_abi.h" + +__arm_locally_streaming int CNAME(BLASLONG m, BLASLONG n, FLOAT *restrict a, + BLASLONG lda, FLOAT *restrict b) { + for (BLASLONG i = 0; i < m; i++) { + for (BLASLONG j = 0; j < n; j += svcntw()) { + svbool_t pg = svwhilelt_b32_u64(j, n); + svst1(pg, &b[i * n + j], svld1(pg, &a[i * lda + j])); + } + } + return 0; +} diff --git a/kernel/arm64/sme_abi.h b/kernel/arm64/sme_abi.h new file mode 100644 index 0000000000..3e737ad6c9 --- /dev/null +++ b/kernel/arm64/sme_abi.h @@ -0,0 +1,45 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#pragma once + +#include <stdlib.h> + +/** + * These are SME ABI routines for saving & restoring SME state. + * They are typically provided by a compiler runtime library such + * as libgcc or compiler-rt, but support for these routines is not + * yet available on all platforms. + * + * Define these as aborting stubs so that we loudly fail on nested + * usage of SME state. + * + * These are defined as weak symbols so that a compiler runtime can + * override them if supported. + */ +__attribute__((weak)) void __arm_tpidr2_save() { abort(); } +__attribute__((weak)) void __arm_tpidr2_restore() { abort(); } diff --git a/param.h b/param.h index fee9195d02..4063fe71c1 100644 --- a/param.h +++ b/param.h @@ -3667,7 +3667,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE +#elif defined(ARMV8SVE) || defined(ARMV9SME) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8