Skip to content
This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit 9652017

Browse files
authored
[BesTLA] Support fp16 for compute_dtype and scale_dtype (#292)
* split functions by isa * compiled * remove all intrinsics code from mha * add header * update target for ICX * add diagnostic back * update header * type conversion * fix * remove function for gcc11 * add fp16 conversion for avx2 * use one template function instead * add scale_dtype=fp16 * add comp_fp16 UT for low bits * fix warning * support f16 for quant api * add template for comp_fp16 * remove avx512_bf16 templates * fix gcc version
1 parent 4f645cf commit 9652017

27 files changed: +5060 −4398 lines changed

.github/workflows/unit-test-bestla.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@ on:
1313
description: 'compiler_version'
1414
required: false
1515
type: string
16-
default: '12.1.0'
16+
default: '13.2.0'
1717

1818
# If there is a new commit, the previous jobs will be canceled
1919
concurrency:
2020
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
2121
cancel-in-progress: true
2222

2323
env:
24-
INPUT_COMPILER_VERSION: ${{ inputs.compiler_version || '12.1.0' }}
24+
INPUT_COMPILER_VERSION: ${{ inputs.compiler_version || '13.2.0' }}
2525
WORKING_DIR: ${{ github.workspace }}
2626
CONTAINER_NAME: "utTest"
2727

bestla/bestla/bestla.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@ enum class BTLA_ISA : uint8_t {
2828
AVX512F,
2929
AVX512BW,
3030
AVX512_VNNI,
31+
AVX512_BF16,
32+
AVX512_FP16,
3133
AMX_BF16,
3234
AMX_INT8,
33-
AVX512_FP16,
34-
AVX512_BF16,
3535
AMX_FP16,
3636
ISA_COUNT,
3737
};

bestla/bestla/bestla_parallel.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#if BTLA_OPENMP
2020
#include <omp.h>
2121
#endif
22+
#include <immintrin.h>
2223
#include "bestla_utils.h"
2324
#include "bestla_device.h"
2425

bestla/bestla/bestla_prologue_b.h

Lines changed: 59 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -224,48 +224,47 @@ class WeightKBlockNInteger {
224224
int rawnk_scale = utils::updiv(K, stor->mBlockSize);
225225
int nk_scale = utils::updiv(stor->mKPad, stor->mBlockSize);
226226
parallel::Scheduler2D _para({threading->num_threads(), 1, nk_scale, 1, 1});
227-
if (stor->SDtype() == BTLA_DTYPE::F32) { // fp32 to fp32 direct copy
227+
if (stor->SDtype() == BTLA_DTYPE::BF16 || stor->SDtype() == BTLA_DTYPE::F16 || stor->SDtype() == BTLA_DTYPE::F32) {
228228
threading->parallel_for([&](int tidx) {
229229
parallel::ThreadProblem2D thdp{tidx};
230230
_para.getIndex(thdp);
231231
if (thdp.valid) {
232-
for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
233-
if (i < rawnk_scale) {
234-
if (scales != nullptr)
235-
std::memcpy(stor->template SPtr<float>() + i * stor->mNPad, scales + i * N, N * sizeof(scales[0]));
236-
if (zero_points != nullptr)
237-
std::memcpy(stor->template ZPtr<int8_t>() + i * stor->mNPad, zero_points + i * N,
238-
N * sizeof(zero_points[0]));
239-
} else {
240-
if (scales != nullptr)
241-
std::memset(stor->template SPtr<float>() + i * stor->mNPad, 0, stor->mNPad * sizeof(float));
242-
if (zero_points != nullptr)
243-
std::memset(stor->template ZPtr<int8_t>() + i * stor->mNPad, 0, stor->mNPad * sizeof(zero_points[0]));
232+
int rows = thdp.loc[1] + thdp.size[1] <= rawnk_scale ? thdp.size[1] : rawnk_scale - thdp.loc[1];
233+
if (scales) {
234+
if (stor->SDtype() == BTLA_DTYPE::BF16) {
235+
kernel::wrapper::Memcpy2DFp32TPadding<utils::bf16>::forward_auto(
236+
scales + thdp.loc[1] * N, stor->template SPtr<utils::bf16>() + thdp.loc[1] * stor->mNPad, rows, N,
237+
N * sizeof(scales[0]), stor->mNPad * sizeof(utils::bf16), true);
238+
} else if (stor->SDtype() == BTLA_DTYPE::F32) {
239+
kernel::wrapper::Memcpy2DPadding::forward(
240+
scales + thdp.loc[1] * N, stor->template SPtr<float>() + thdp.loc[1] * stor->mNPad, rows,
241+
N * sizeof(float), N * sizeof(scales[0]), stor->mNPad * sizeof(float), true);
242+
} else if (stor->SDtype() == BTLA_DTYPE::F16) {
243+
kernel::wrapper::Memcpy2DFp32TPadding<utils::fp16>::forward_auto(
244+
scales + thdp.loc[1] * N, stor->template SPtr<utils::fp16>() + thdp.loc[1] * stor->mNPad, rows, N,
245+
N * sizeof(scales[0]), stor->mNPad * sizeof(utils::fp16), true);
244246
}
245-
}
246-
}
247-
});
248-
} else if (stor->SDtype() == BTLA_DTYPE::BF16) {
249-
threading->parallel_for([&](int tidx) {
250-
parallel::ThreadProblem2D thdp{tidx};
251-
_para.getIndex(thdp);
252-
if (thdp.valid) {
253-
for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
254-
if (i < rawnk_scale) {
255-
if (scales != nullptr) {
256-
for (size_t j = 0; j < N; j++) {
257-
stor->template SPtr<utils::bf16>()[j + i * stor->mNPad] = static_cast<utils::bf16>(scales[i * N + j]);
258-
}
259-
}
260-
if (zero_points != nullptr) {
261-
std::memcpy(stor->template ZPtr<int8_t>() + i * stor->mNPad, zero_points + i * N,
262-
N * sizeof(zero_points[0]));
247+
if (rows < thdp.size[1]) {
248+
auto sb = bestla::utils::bestla_dtype_bytes(stor->SDtype());
249+
if (sb == 2) {
250+
std::memset(stor->template SPtr<utils::fp16>() + (thdp.loc[1] + rows) * stor->mNPad, 0,
251+
sb * (thdp.size[1] - rows) * stor->mNPad);
252+
} else if (sb == 4) {
253+
std::memset(stor->template SPtr<float>() + (thdp.loc[1] + rows) * stor->mNPad, 0,
254+
sb * (thdp.size[1] - rows) * stor->mNPad);
255+
} else {
256+
assert(0);
263257
}
264-
} else {
265-
if (scales != nullptr)
266-
std::memset(stor->template SPtr<utils::bf16>() + i * stor->mNPad, 0, stor->mNPad * sizeof(utils::bf16));
267-
if (zero_points != nullptr)
268-
std::memset(stor->template ZPtr<int8_t>() + i * stor->mNPad, 0, stor->mNPad * sizeof(zero_points[0]));
258+
}
259+
}
260+
if (zero_points) {
261+
kernel::wrapper::Memcpy2DPadding::forward(
262+
zero_points + thdp.loc[1] * N, stor->template ZPtr<int8_t>() + thdp.loc[1] * stor->mNPad, rows,
263+
N * sizeof(zero_points[0]), N * sizeof(zero_points[0]), sizeof(int8_t) * stor->mNPad, true);
264+
265+
if (rows < thdp.size[1]) {
266+
std::memset(stor->template ZPtr<int8_t>() + (thdp.loc[1] + rows) * stor->mNPad, 0,
267+
sizeof(int8_t) * (thdp.size[1] - rows) * stor->mNPad);
269268
}
270269
}
271270
}
@@ -334,84 +333,24 @@ class WeightKBlockNInteger {
334333
utils::afree(countptr);
335334
}
336335

337-
AUTOCALL void setTransposeQuantCorrection(const int N, const int K, const int8_t* zero_points, const float* scales,
336+
AUTOCALL void setTransposeQuantCorrection(const int N, const int K, const int8_t* zero_pointsT, const float* scalesT,
338337
StorageWeight* stor, parallel::IThreading* threading) {
339338
int rawnk_scale = utils::updiv(K, stor->mBlockSize);
340-
int nk_scale = utils::updiv(stor->mKPad, stor->mBlockSize);
341-
parallel::Scheduler2D _para({threading->num_threads(), 1, nk_scale, 1, 1});
342-
if (stor->SDtype() == BTLA_DTYPE::F32) { // fp32 to fp32 direct copy
343-
threading->parallel_for([&](int tidx) {
344-
parallel::ThreadProblem2D thdp{tidx};
345-
_para.getIndex(thdp);
346-
if (thdp.valid) {
347-
if (scales) {
348-
for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
349-
if (i < rawnk_scale) {
350-
for (int j = 0; j < N; j++) {
351-
stor->template SPtr<float>()[i * stor->mNPad + j] = scales[j * rawnk_scale + i];
352-
}
353-
} else {
354-
std::memset(stor->template SPtr<float>() + i * stor->mNPad, 0, stor->mNPad * sizeof(float));
355-
}
356-
}
357-
}
358-
}
359-
});
360-
} else if (stor->SDtype() == BTLA_DTYPE::BF16) {
361-
threading->parallel_for([&](int tidx) {
362-
parallel::ThreadProblem2D thdp{tidx};
363-
_para.getIndex(thdp);
364-
if (thdp.valid) {
365-
if (scales) {
366-
for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
367-
if (i < rawnk_scale) {
368-
for (int j = 0; j < N; j++) {
369-
stor->template SPtr<utils::bf16>()[i * stor->mNPad + j] = utils::bf16(scales[j * rawnk_scale + i]);
370-
}
371-
} else {
372-
std::memset(stor->template SPtr<utils::bf16>() + i * stor->mNPad, 0, stor->mNPad * sizeof(utils::bf16));
373-
}
374-
}
375-
}
376-
}
377-
});
378-
} else if (stor->SDtype() == BTLA_DTYPE::F8_E8M0) {
379-
threading->parallel_for([&](int tidx) {
380-
parallel::ThreadProblem2D thdp{tidx};
381-
_para.getIndex(thdp);
382-
if (thdp.valid) {
383-
if (scales) {
384-
for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
385-
if (i < rawnk_scale) {
386-
for (int j = 0; j < N; j++) {
387-
stor->template SPtr<utils::f8>()[i * stor->mNPad + j] = static_cast<int>(scales[j * rawnk_scale + i]);
388-
}
389-
} else {
390-
std::memset(stor->template SPtr<utils::f8>() + i * stor->mNPad, 0, stor->mNPad * sizeof(utils::f8));
391-
}
392-
}
393-
}
394-
}
395-
});
396-
} else {
397-
assert(0);
339+
auto scales = scalesT ? utils::amalloc<float>(rawnk_scale * N) : nullptr;
340+
auto zero_points = zero_pointsT ? utils::amalloc<int8_t>(rawnk_scale * N) : nullptr;
341+
if (scales) {
342+
transposeWeight<float>(N, rawnk_scale, scalesT, rawnk_scale, scales, N, threading);
343+
}
344+
if (zero_points) {
345+
transposeWeight<int8_t>(N, rawnk_scale, zero_pointsT, rawnk_scale, zero_points, N, threading);
346+
}
347+
setQuantCorrection(N, K, zero_points, scales, stor, threading);
348+
if (scales) {
349+
utils::afree(scales);
350+
}
351+
if (zero_points) {
352+
utils::afree(zero_points);
398353
}
399-
if (stor->IsAsym() && zero_points)
400-
threading->parallel_for([&](int tidx) {
401-
parallel::ThreadProblem2D thdp{tidx};
402-
_para.getIndex(thdp);
403-
if (thdp.valid) {
404-
for (int i = thdp.loc[1]; i < thdp.loc[1] + thdp.size[1]; i++) {
405-
if (i < rawnk_scale) {
406-
for (int j = 0; j < N; j++) {
407-
stor->template ZPtr<int8_t>()[i * stor->mNPad + j] = zero_points[j * rawnk_scale + i];
408-
}
409-
} else {
410-
std::memset(stor->template ZPtr<int8_t>() + i * stor->mNPad, 0, stor->mNPad * sizeof(zero_points[0]));
411-
}
412-
}
413-
}
414-
});
415354
}
416355

417356
AUTOCALL void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales,
@@ -445,6 +384,7 @@ class WeightKBlockNInteger {
445384
auto blks_padding2 = utils::padto(blks, 2);
446385
auto tmpscales = tmp;
447386
auto tmpzeropoints = reinterpret_cast<int8_t*>(tmpscales + N * blks);
387+
assert(isasym == (zero_points != nullptr));
448388
if (scales) {
449389
for (size_t i = 0; i < N * blks; i += 1) {
450390
tmpscales[i] = scales[i];
@@ -640,6 +580,7 @@ class WeightKBlockNInteger {
640580
}
641581
});
642582
}
583+
643584
AUTOCALL void compressWeight(const int N, const int K, const int8_t* B, const int ldb, int8_t* dstptr,
644585
BTLA_DTYPE qtype, parallel::IThreading* threading) {
645586
if (qtype == BTLA_DTYPE::S7_CLIP) return compressBit7Weight(N, K, B, dstptr, qtype, threading);
@@ -726,6 +667,13 @@ class WeightKBlockNInteger {
726667
utils::updiv(k_size, wptr->mBlockSize), n_size, wptr->CStep() * 2, n_size * 4, false);
727668
*dststep = n_size;
728669
}
670+
if (wptr->SDtype() == BTLA_DTYPE::F16) {
671+
auto aptr = wptr->template SPtr<utils::fp16>();
672+
kernel::wrapper::Memcpy2DFp16CvtFp32::forward<ISA_T>(
673+
aptr + k_offset / wptr->mBlockSize * wptr->CStep() + n_offset, *dstptr,
674+
utils::updiv(k_size, wptr->mBlockSize), n_size, wptr->CStep() * 2, n_size * 4, false);
675+
*dststep = n_size;
676+
}
729677
if (wptr->SDtype() == BTLA_DTYPE::DQ8_BNB) {
730678
auto aptr = wptr->template SPtr<uint8_t>();
731679
auto internal_k_offset = k_offset / wptr->mBlockSize;

bestla/bestla/bestla_utils.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
#define BTLA_OPENMP 0
2121
#endif
2222

23+
#define FP32_BF16_FAST 0
24+
2325
#if BTLA_OPENMP
2426
#include <omp.h>
2527
#endif
@@ -83,8 +85,6 @@
8385
// runtime auto-dispatch ISA, not time critical functions
8486
#define AUTOCALL static
8587

86-
#include <immintrin.h>
87-
8888
namespace bestla {
8989
namespace utils {
9090

@@ -388,6 +388,8 @@ inline constexpr size_t bestla_dtype_bits(const BTLA_DTYPE t) {
388388
return bestla_dtype_get_mask_val(t, BTLA_DTYPE::EleBitsMask, BTLA_DTYPE::EleBitsShift);
389389
}
390390

391+
inline constexpr size_t bestla_dtype_bytes(const BTLA_DTYPE t) { return bestla_dtype_bits(t) >> 3; }
392+
391393
inline constexpr size_t bestla_dtype_type(const BTLA_DTYPE t) {
392394
return bestla_dtype_get_mask_val(t, BTLA_DTYPE::TypeMask, BTLA_DTYPE::TypeShift);
393395
}
@@ -464,9 +466,11 @@ class isa_base {
464466
static bool constexpr avx2 = ISA_T >= BTLA_ISA::AVX2;
465467
static bool constexpr avx512f = ISA_T >= BTLA_ISA::AVX512F;
466468
static bool constexpr avx512_vnni = ISA_T >= BTLA_ISA::AVX512_VNNI;
469+
static bool constexpr avx512_bf16 = ISA_T >= BTLA_ISA::AVX512_BF16;
467470
static bool constexpr avx512_fp16 = ISA_T >= BTLA_ISA::AVX512_FP16;
468471
static bool constexpr amx_bf16 = ISA_T >= BTLA_ISA::AMX_BF16;
469472
static bool constexpr amx_int8 = ISA_T >= BTLA_ISA::AMX_INT8;
473+
static bool constexpr amx_fp16 = ISA_T >= BTLA_ISA::AMX_FP16;
470474
};
471475

472476
static inline int padto_le(int src, int padding) { return src / padding * padding; }

bestla/bestla/bestla_wrapper.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,7 @@ class LauncherBase {
353353
_param.paramB.packedW->mDType == BTLA_DTYPE::S2_CLIP;
354354
if constexpr (support()) {
355355
impl &= _param.paramB.packedW->mCorrection.mScaT == BTLA_DTYPE::F32 ||
356+
_param.paramB.packedW->mCorrection.mScaT == BTLA_DTYPE::F16 ||
356357
_param.paramB.packedW->mCorrection.mScaT == BTLA_DTYPE::BF16;
357358
}
358359

@@ -451,6 +452,17 @@ class LauncherBase {
451452
if (m == 7) gemv_kblock<utils::bf16, 7>(_param, _config);
452453
if (m == 8) gemv_kblock<utils::bf16, 8>(_param, _config);
453454
}
455+
} else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F16) {
456+
if (m == 1) gemv_kblock<utils::fp16, 1>(_param, _config);
457+
if (m == 2) gemv_kblock<utils::fp16, 2>(_param, _config);
458+
if (m == 3) gemv_kblock<utils::fp16, 3>(_param, _config);
459+
if (m == 4) gemv_kblock<utils::fp16, 4>(_param, _config);
460+
if constexpr (Reg32) {
461+
if (m == 5) gemv_kblock<utils::fp16, 5>(_param, _config);
462+
if (m == 6) gemv_kblock<utils::fp16, 6>(_param, _config);
463+
if (m == 7) gemv_kblock<utils::fp16, 7>(_param, _config);
464+
if (m == 8) gemv_kblock<utils::fp16, 8>(_param, _config);
465+
}
454466
}
455467
}
456468
}
@@ -622,6 +634,7 @@ class LauncherIntKBlock {
622634
_param.paramB.packedW->mDType == BTLA_DTYPE::S1_CLIP ||
623635
_param.paramB.packedW->mDType == BTLA_DTYPE::S2_CLIP;
624636
impl &= _param.paramB.packedW->mCorrection.mScaT == BTLA_DTYPE::F32 ||
637+
_param.paramB.packedW->mCorrection.mScaT == BTLA_DTYPE::F16 ||
625638
_param.paramB.packedW->mCorrection.mScaT == BTLA_DTYPE::BF16;
626639
impl &= _param.problem.dims[1] <= MaxGemvM;
627640
return impl;
@@ -699,6 +712,17 @@ class LauncherIntKBlock {
699712
if (m == 7) gemv_kblock<utils::bf16, 7>(_param, _config);
700713
if (m == 8) gemv_kblock<utils::bf16, 8>(_param, _config);
701714
}
715+
} else if (_param.paramB.packedW->SDtype() == BTLA_DTYPE::F16) {
716+
if (m == 1) gemv_kblock<utils::fp16, 1>(_param, _config);
717+
if (m == 2) gemv_kblock<utils::fp16, 2>(_param, _config);
718+
if (m == 3) gemv_kblock<utils::fp16, 3>(_param, _config);
719+
if (m == 4) gemv_kblock<utils::fp16, 4>(_param, _config);
720+
if constexpr (Reg32) {
721+
if (m == 5) gemv_kblock<utils::fp16, 5>(_param, _config);
722+
if (m == 6) gemv_kblock<utils::fp16, 6>(_param, _config);
723+
if (m == 7) gemv_kblock<utils::fp16, 7>(_param, _config);
724+
if (m == 8) gemv_kblock<utils::fp16, 8>(_param, _config);
725+
}
702726
}
703727
}
704728
}

0 commit comments

Comments (0)