diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a6d25b50bd..6c1eed33f3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -229,3 +229,6 @@ In chronological order: * Christopher Daley * [2024-01-24] Optimize GEMV forwarding on ARM64 systems + +* Aymen Qader + * [2024-12-09] Add Arm®v9-A architecture SME2 SGEMM kernels diff --git a/Makefile.arm64 b/Makefile.arm64 index fccc0d0d0f..f0a6ef2cbd 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve endif endif +ifeq ($(CORE), ARMV9SME) +CCOMMON_OPT += -march=armv9-a+sme2 -O3 +FCOMMON_OPT += -march=armv9-a+sve2 -O3 +endif + ifeq ($(CORE), CORTEXA53) CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 ifneq ($(F_COMPILER), NAG) diff --git a/Makefile.system b/Makefile.system index 29ea819f13..b9b2453f5b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64) export MACOSX_DEPLOYMENT_TARGET=11.0 ifeq ($(C_COMPILER), GCC) export NO_SVE = 1 +export NO_SME = 1 endif else export MACOSX_DEPLOYMENT_TARGET=10.8 @@ -709,6 +710,11 @@ DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += ARMV8SVE DYNAMIC_CORE += A64FX endif +# Disabled by default while ARMV9SME is WIP +NO_SME ?= 1 +ifneq ($(NO_SME), 1) +DYNAMIC_CORE += ARMV9SME +endif DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 @@ -1474,6 +1480,10 @@ ifeq ($(NO_SVE), 1) CCOMMON_OPT += -DNO_SVE endif +ifeq ($(NO_SME), 1) +CCOMMON_OPT += -DNO_SME +endif + ifdef SMP CCOMMON_OPT += -DSMP_SERVER diff --git a/README.md b/README.md index d8e73b2022..0857194245 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Apple Vortex**: preliminary support based on ThunderX2/3 - **A64FX**: preliminary support, optimized Level-3 BLAS - **ARMV8SVE**: any ARMV8 cpu with SVE extensions +- **ARMV9SME**: WIP target, any Arm®v9-A core with SME2 support. Only functional for GEMM. 
#### PPC/PPC64 diff --git a/TargetList.txt b/TargetList.txt index 25eeddfb00..232e12ffa6 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -111,6 +111,7 @@ THUNDERX3T110 VORTEX A64FX ARMV8SVE +ARMV9SME FT2000 9.System Z: diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 27ba6f8727..ec91a2d598 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -44,9 +44,21 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) - if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() + elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) + endif () + if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 + set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) + endif() endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 775239e1cd..2a48ba5ab5 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE) endif () endif () +if (${CORE} STREQUAL ARMV9SME) + if (NOT DYNAMIC_ARCH) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme2") + endif () +endif () + if (${CORE} STREQUAL CORTEXA510) if (NOT DYNAMIC_ARCH) set (CCOMMON_OPT "${CCOMMON_OPT} 
-march=armv8-a+sve") diff --git a/cmake/prebuild.cmake index 53a78d782f..f6ca73b7b6 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1014,7 +1014,7 @@ endif () set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) - elseif ("${TCORE}" STREQUAL "NEOVERSEN2") + elseif ("${TCORE}" STREQUAL "NEOVERSEN2" OR "${TCORE}" STREQUAL "ARMV9SME") file(APPEND ${TARGET_CONF_TEMP} "#define L1_CODE_SIZE\t65536\n" "#define L1_CODE_LINESIZE\t64\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 4ac244e3ea..d49de6f7c2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -310,6 +310,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") endif() endif() + if (${TARGET} STREQUAL ARMV9SME) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme2 -O3") + endif() if (${TARGET} STREQUAL A64FX) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") diff --git a/common_arm64.h b/common_arm64.h index 595a01995a..5856898a2b 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -175,7 +175,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) +#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) #define BUFFER_SIZE (32 << 22) #else #define BUFFER_SIZE (32 << 20) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index b7328876b4..e22bcb1079 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -345,6 +345,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) /* the current AVX512 s/d/c/z GEMM kernel requires 
n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; +#elif defined(ARMV9SME) && !defined(DOUBLE) && !defined(COMPLEX) + /* the current SME SGEMM kernel requires n>=8*GEMM_UNROLL_N to achieve best performance */ + if (min_jj >= 8*GEMM_UNROLL_N) min_jj = 8*GEMM_UNROLL_N; #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index dc88d816fb..828eccd138 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif +#ifdef DYN_ARMV9SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX; #define gotoblas_ARMV8SVE gotoblas_ARMV8 #define gotoblas_A64FX gotoblas_ARMV8 #endif + +#ifndef NO_SME +extern gotoblas_t gotoblas_ARMV9SME; +#else +#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#endif + extern gotoblas_t gotoblas_THUNDERX3T110; #endif #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 @@ -393,6 +405,13 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } + +#if !defined(NO_SME) && defined(HWCAP2_SME2) + if ((getauxval(AT_HWCAP2) & HWCAP2_SME2)) { + return &gotoblas_ARMV9SME; + } +#endif + #ifndef NO_SVE if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { return &gotoblas_ARMV8SVE; diff --git a/getarch.c b/getarch.c index 826dd1ce0a..2097f230f0 100644 --- a/getarch.c +++ b/getarch.c @@ -1289,6 +1289,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "ARMV8SVE" #endif +#ifdef FORCE_ARMV9SME +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV9SME" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DARMV9SME " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" +#define LIBNAME "armv9sme" +#define CORENAME "ARMV9SME" +#endif + #ifdef FORCE_ARMV8 #define FORCE diff --git a/kernel/arm64/KERNEL.ARMV9SME b/kernel/arm64/KERNEL.ARMV9SME new file mode 100644 index 0000000000..2e00ae4a70 --- /dev/null +++ b/kernel/arm64/KERNEL.ARMV9SME @@ -0,0 +1,8 @@ +include $(KERNELDIR)/KERNEL.ARMV8SVE + +SGEMMKERNEL = sgemm_kernel_sme.c + +SGEMMINCOPY = sgemm_ncopy_sme.c +SGEMMITCOPY = sgemm_tcopy_sme.c +SGEMMONCOPY = sgemm_ncopy_sme.c +SGEMMOTCOPY = sgemm_tcopy_sme.c diff --git a/kernel/arm64/sgemm_kernel_sme.c b/kernel/arm64/sgemm_kernel_sme.c new file mode 100644 index 0000000000..073c97f5e0 --- /dev/null +++ b/kernel/arm64/sgemm_kernel_sme.c @@ -0,0 +1,188 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <arm_sme.h> + +#include "common.h" +#include "sme_abi.h" + +// Outer product kernel. +// Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA. +// This kernel is unpredicated, and assumes a full 2SVL x 2SVL block. 
+__attribute__((always_inline)) inline void +kernel_2x2(const float *A, const float *B, float *C, float alpha, + size_t shared_dim, size_t a_step, size_t b_step, size_t c_step) + __arm_out("za") __arm_streaming { + const size_t svl = svcntw(); + + // Predicate set-up + svbool_t ptrue = svptrue_b32(); + + // Load from C into ZA + for (size_t i = 0; i < (svl >> 1); i++) { + svld1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]); + svld1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]); + svld1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]); + svld1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]); + } + + svfloat32_t alpha_vec = svdup_f32(alpha); + + // Iterate through shared dimension (K) + for (size_t k = 0; k < shared_dim; k++) { + // Load column of A + svfloat32x2_t cols_a = svld1_x2(svptrue_c32(), &A[k * a_step]); + + // Load row of B + svfloat32x2_t rows_b = svld1_x2(svptrue_c32(), &B[k * b_step]); + + // Multiply B through by alpha + svfloat32_t row_b_0 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 0)); + svfloat32_t row_b_1 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 1)); + + // Perform outer products + svmopa_za32_m(0, ptrue, ptrue, svget2(cols_a, 0), row_b_0); + svmopa_za32_m(1, ptrue, ptrue, svget2(cols_a, 1), row_b_0); + svmopa_za32_m(2, ptrue, ptrue, svget2(cols_a, 0), row_b_1); + svmopa_za32_m(3, ptrue, ptrue, svget2(cols_a, 1), row_b_1); + } + + // Store out to C from ZA + for (size_t i = 0; i < (svl >> 1); i++) { + // Store out one row of C per tile + svst1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]); + svst1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]); + svst1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]); + svst1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]); + } +} + +// Outer product kernel. +// Computes an SVL x SVL block of C, utilizing a single FP32 tile of ZA (ZA0). +// This kernel is predicated, and can handle under-filled blocks. 
+__attribute__((always_inline)) inline void +kernel_1x1(const float *A, const float *B, float *C, float alpha, + size_t shared_dim, size_t a_len, size_t a_step, size_t b_len, + size_t b_step, size_t c_step, size_t c_rows, size_t c_cols) + __arm_out("za") __arm_streaming { + + // Predicate set-up + svbool_t pg = svptrue_b32(); + svbool_t pg_a = svwhilelt_b32_u64(0, a_len); + svbool_t pg_b = svwhilelt_b32_u64(0, b_len); + svbool_t pg_c = svwhilelt_b32_u64(0, c_rows); + + // Load from C into ZA + for (size_t i = 0; i < c_cols; i++) { + svld1_ver_za32(0, i, pg_c, &C[i * c_step]); + } + + svfloat32_t alpha_vec = svdup_f32_z(pg_b, alpha); + + // Iterate through shared dimension (K) + for (size_t k = 0; k < shared_dim; k++) { + // Load column of A + svfloat32_t col_a = svld1(pg_a, &A[k * a_step]); + // Load row of B + svfloat32_t row_b = svld1(pg_b, &B[k * b_step]); + // Multiply B through by alpha + row_b = svmul_x(pg_b, alpha_vec, row_b); + // Perform outer product + svmopa_za32_m(0, pg, pg, col_a, row_b); + } + + // Store out to C from ZA + for (size_t i = 0; i < c_cols; i++) { + svst1_ver_za32(0, i, pg_c, &C[i * c_step]); + } +} + +__arm_new("za") __arm_locally_streaming + int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT *ba, + FLOAT *bb, FLOAT *C, BLASLONG ldc) { + + const BLASLONG num_rows = bm; + const BLASLONG num_cols = bn; + + const FLOAT *a_ptr = ba; + const FLOAT *b_ptr = bb; + FLOAT *c_ptr = C; + + const BLASLONG svl = svcntw(); + + const BLASLONG a_step = bm; + const BLASLONG b_step = bn; + const BLASLONG c_step = ldc; + + // Block over rows of C (panels of A) + BLASLONG row_idx = 0; + + // 2x2 loop + BLASLONG row_batch = 2 * svl; + + // Block over row dimension of C + for (; row_idx + row_batch <= num_rows; row_idx += row_batch) { + BLASLONG col_idx = 0; + BLASLONG col_batch = 2 * svl; + + // Block over column dimension of C + for (; col_idx + col_batch <= num_cols; col_idx += col_batch) { + kernel_2x2(&a_ptr[row_idx], &b_ptr[col_idx], + 
&c_ptr[row_idx + col_idx * c_step], alpha0, bk, a_step, b_step, + c_step); + } + + // Handle under-filled blocks w/ 2x(1x1) kernels + col_batch = 1 * svl; + for (; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); + + kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx], + &c_ptr[row_idx + col_idx * c_step], alpha0, bk, svl, a_step, + col_batch, b_step, c_step, svl, col_batch); + + kernel_1x1(&a_ptr[row_idx + svl], &b_ptr[col_idx], + &c_ptr[(row_idx + svl) + col_idx * c_step], alpha0, bk, svl, + a_step, col_batch, b_step, c_step, svl, col_batch); + } + } + + // Handle under-filled blocks w/ 1x1 kernels + row_batch = 1 * svl; + for (; row_idx < num_rows; row_idx += row_batch) { + row_batch = MIN(row_batch, num_rows - row_idx); + // Block over column dimension of C + BLASLONG col_batch = svl; + for (BLASLONG col_idx = 0; col_idx < num_cols; col_idx += col_batch) { + col_batch = MIN(col_batch, num_cols - col_idx); + kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx], + &c_ptr[row_idx + col_idx * c_step], alpha0, bk, row_batch, + a_step, col_batch, b_step, c_step, row_batch, col_batch); + } + } + return 0; +} diff --git a/kernel/arm64/sgemm_ncopy_sme.c b/kernel/arm64/sgemm_ncopy_sme.c new file mode 100644 index 0000000000..8a2271b886 --- /dev/null +++ b/kernel/arm64/sgemm_ncopy_sme.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <arm_sme.h> + +#include "common.h" +#include "sme_abi.h" + +// Transpose 1SVL x N panel of A into B +__attribute__((always_inline)) inline static void +transpose_panel(const FLOAT *a, FLOAT *b, BLASLONG rows, BLASLONG cols, + BLASLONG a_step, BLASLONG b_step) + __arm_out("za") __arm_streaming { + BLASLONG col_batch = svcntsw(); + const svbool_t pg_a = svwhilelt_b32_u64(0, rows); + + for (BLASLONG k = 0; k < cols; k += col_batch) { + col_batch = MIN(col_batch, cols - k); + for (BLASLONG col = 0; col < col_batch; col++) { + svld1_ver_za32(0, col, pg_a, &a[(col + k) * a_step]); + } + + const svbool_t pg_b = svwhilelt_b32_u64(k, cols); + for (BLASLONG row = 0; row < rows; row++) { + svst1_hor_za32(0, row, pg_b, &b[row * b_step + k]); + } + } +} + +__arm_new("za") __arm_locally_streaming + int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) { + const BLASLONG num_rows = m; + BLASLONG row_batch = svcntsw(); + for (BLASLONG row_idx = 0; 
row_idx < num_rows; row_idx += row_batch) { + // Transpose 1xSVL panel + row_batch = MIN(row_batch, num_rows - row_idx); + transpose_panel(&a[row_idx], &b[row_idx * n], row_batch, n, lda, n); + } + return 0; +} diff --git a/kernel/arm64/sgemm_tcopy_sme.c b/kernel/arm64/sgemm_tcopy_sme.c new file mode 100644 index 0000000000..4687a73fa8 --- /dev/null +++ b/kernel/arm64/sgemm_tcopy_sme.c @@ -0,0 +1,42 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include <arm_sme.h> + +#include "common.h" +#include "sme_abi.h" + +__arm_locally_streaming int CNAME(BLASLONG m, BLASLONG n, FLOAT *restrict a, + BLASLONG lda, FLOAT *restrict b) { + for (BLASLONG i = 0; i < m; i++) { + for (BLASLONG j = 0; j < n; j += svcntw()) { + svbool_t pg = svwhilelt_b32_u64(j, n); + svst1(pg, &b[i * n + j], svld1(pg, &a[i * lda + j])); + } + } + return 0; +} diff --git a/kernel/arm64/sme_abi.h b/kernel/arm64/sme_abi.h new file mode 100644 index 0000000000..3e737ad6c9 --- /dev/null +++ b/kernel/arm64/sme_abi.h @@ -0,0 +1,45 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF +THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#pragma once + +#include <stdlib.h> + +/** + * These are SME ABI routines for saving & restoring SME state. + * They are typically provided by a compiler runtime library such + * as libgcc or compiler-rt, but support for these routines is not + * yet available on all platforms. + * + * Define these as aborting stubs so that we loudly fail on nested + * usage of SME state. + * + * These are defined as weak symbols so that a compiler runtime can + * override them if supported. + */ +__attribute__((weak)) void __arm_tpidr2_save() { abort(); } +__attribute__((weak)) void __arm_tpidr2_restore() { abort(); } diff --git a/param.h b/param.h index fee9195d02..4063fe71c1 100644 --- a/param.h +++ b/param.h @@ -3667,7 +3667,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE +#elif defined(ARMV8SVE) || defined(ARMV9SME) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8