From 32b896faa823e7afb4af7de2eaa1d8f0089306bb Mon Sep 17 00:00:00 2001
From: skykongkong8 <ss.kong@samsung.com>
Date: Wed, 14 Feb 2024 10:46:14 +0900
Subject: [PATCH] [ meson ] Add omp setting in meson.build

- get omp thread num option when build, and allocate thread accordingly.
- getting system core num will be added in the near future

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <ss.kong@samsung.com>
---
 meson.build                    |  4 ++++
 meson_options.txt              |  1 +
 nntrainer/tensor/blas_neon.cpp | 28 ++++++++++++++++++----------
 nntrainer/tensor/omp_setting.h | 14 ++++++++------
 tools/package_android.sh       |  4 ++--
 5 files changed, 33 insertions(+), 18 deletions(-)
diff --git a/meson.build b/meson.build
index 35be7c16f7..1c9e73a280 100644
--- a/meson.build
+++ b/meson.build
@@ -207,6 +207,10 @@ message('set nntrainer num threads=@0@'.format(get_option('nntr-num-threads')))
 openmp_dep = dummy_dep
 if get_option('enable-openmp')
   openmp_dep = dependency('openmp')
+  if get_option('omp-num-threads') > 0
+    extra_defines += '-DOMP_NUM_THREADS=@0@'.format(get_option('omp-num-threads'))
+    message('set nntrainer omp threads=@0@'.format(get_option('omp-num-threads')))
+  endif
 endif
 
 if get_option('enable-profile')
diff --git a/meson_options.txt b/meson_options.txt
index 5674160eba..1e661b9a82 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -27,6 +27,7 @@ option('tizen-version-minor', type: 'integer', min : 0, max : 9999, value: 0)
 option('openblas-num-threads', type: 'integer', min : 0, max : 9999, value: 0)
 #This is for the multi-threading in nntrainer
 option('nntr-num-threads', type: 'integer', min : 0, max : 9999, value: 1)
+option('omp-num-threads', type: 'integer', min : 0, max : 9999, value: 1)
 
 # test related option
 option('reduce-tolerance', type: 'boolean', value: true)
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index c2867b325c..fa683a84c1 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -13,8 +13,10 @@
  */
 
 #include <blas_neon.h>
+#include <iostream>
 #include <nntrainer_error.h>
 #include <omp_setting.h>
+
 namespace nntrainer::neon {
 
 void sgemv_neon(const float *A, const float *X, float *Y, uint32_t rows,
@@ -702,9 +704,13 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
 void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
                                uint32_t M, uint32_t K, float alpha,
                                float beta) {
+#ifdef OMP_NUM_THREADS
+  set_gemv_num_threads(OMP_NUM_THREADS);
+#endif
+  size_t GEMV_NUM_THREADS = get_gemv_num_threads();
+
   float Y32[K];
   unsigned int idx = 0;
-  size_t NEON_NUM_THREADS = get_gemv_num_threads();
 
   for (; K - idx >= 8; idx += 8) {
     float32x4_t y0_3_32 = vcvt_f32_f16(vld1_f16(&Y[idx]));
@@ -728,7 +734,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
   for (; M - i >= 8; i += 8) {
     __fp16 x[8];
     vst1q_f16(&x[0], vmulq_n_f16(vld1q_f16(&A[i]), alpha));
-#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
+#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS)
     for (unsigned int idx = 0; idx < K - 8; idx += 8) {
       float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x[0]);
       wvec0_7_f16 =
@@ -846,7 +852,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
   for (; M - i >= 4; i += 4) {
     __fp16 x[4];
     vst1_f16(&x[0], vmul_n_f16(vld1_f16(&A[i]), alpha));
-#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
+#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS)
     for (unsigned int idx = 0; idx < K - 8; idx += 8) {
       float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x[0]);
       wvec0_7_f16 =
@@ -930,7 +936,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
   }
   for (; i < M; ++i) {
     __fp16 x = alpha * (X[i]);
-#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
+#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS)
     for (unsigned int idx = 0; idx < K - 8; idx += 8) {
       float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x);
       float32x4_t y0_3 = vaddq_f32(vld1q_f32(&Y32[idx]),
@@ -1400,10 +1406,12 @@ unsigned int isamax_neon_fp16(const unsigned int N, const __fp16 *X) {
 
   return retIdx;
 }
-
 void sgemm_neon_fp16(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M,
                      uint32_t N, uint32_t K, float alpha, float beta,
                      bool TransA, bool TransB) {
+#ifdef OMP_NUM_THREADS
+  set_gemm_num_threads(OMP_NUM_THREADS);
+#endif
 
   // dynamic creation to avoid reaching stack limit(causes segmentation fault)
   float *C32 = (float *)malloc(M * N * sizeof(float));
@@ -1445,14 +1453,14 @@ void sgemm_neon_fp16(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M,
 void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
                              uint32_t M, uint32_t N, uint32_t K, float alpha,
                              float beta) {
-  size_t NEON_NUM_THREADS = omp_get_num_threads();
+  size_t GEMM_NUM_THREADS = get_gemm_num_threads();
   unsigned int k = 0;
   __fp16 a[16];
   for (; (K - k) >= 16; k += 16) {
     for (unsigned int m = 0; m < M; m++) {
       vst1q_f16(&a[0], vmulq_n_f16(vld1q_f16(&A[m * K + k]), alpha));
       vst1q_f16(&a[8], vmulq_n_f16(vld1q_f16(&A[m * K + k + 8]), alpha));
-#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
+#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
       for (unsigned int n = 0; n < N - 8; n += 8) {
         float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]);
         b0_7_0 = vfmaq_n_f16(b0_7_0, vld1q_f16(&B[(k + 1) * N + n]), a[1]);
@@ -1583,7 +1591,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
     for (unsigned int m = 0; m < M; m++) {
       vst1q_f16(a, vmulq_n_f16(vld1q_f16(&A[m * K + k]), alpha));
 
-#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
+#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
       for (unsigned int n = 0; n < N - 8; n += 8) {
         float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]);
         b0_7_0 = vfmaq_n_f16(b0_7_0, vld1q_f16(&B[(k + 1) * N + n]), a[1]);
@@ -1674,7 +1682,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
     for (unsigned int m = 0; m < M; m++) {
       vst1_f16(a, vmul_n_f16(vld1_f16(&A[m * K + k]), alpha));
 
-#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
+#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
       for (unsigned int n = 0; n < N - 8; n += 8) {
 
         float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]);
@@ -1752,7 +1760,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
     for (unsigned int m = 0; m < M; m++) {
       __fp16 a0 = alpha * A[m * K + k];
 
-#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
+#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
       for (unsigned int n = 0; n < N - 8; n += 8) {
         float16x8_t b0_7 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a0);
 
diff --git a/nntrainer/tensor/omp_setting.h b/nntrainer/tensor/omp_setting.h
index 9d02852cc1..00538eb79f 100644
--- a/nntrainer/tensor/omp_setting.h
+++ b/nntrainer/tensor/omp_setting.h
@@ -25,17 +25,18 @@ const int OMP_THRESHOLD = 20000;
  * @return size_t& num_threads
  */
 inline size_t &GEMM_NUM_THREADS() {
-  static size_t num_threads = omp_get_num_threads();
+  static size_t num_threads = 1;
+  // static size_t num_threads = omp_get_num_threads();
   return num_threads;
 }
 /**
- * @brief Set the gemm num threads object
+ * @brief Set the gemm num threads
  *
  * @param n num_threads to set
  */
 inline void set_gemm_num_threads(size_t n) { GEMM_NUM_THREADS() = n; }
 /**
- * @brief Get the gemm num threads object
+ * @brief Get the gemm num threads
  *
  * @return size_t num_threads
  */
@@ -46,17 +47,18 @@ inline size_t get_gemm_num_threads() { return GEMM_NUM_THREADS(); }
  * @return size_t& num_threads
  */
 inline size_t &GEMV_NUM_THREADS() {
-  static size_t num_threads = omp_get_num_threads();
+  static size_t num_threads = 1;
+  // static size_t num_threads = omp_get_num_threads();
   return num_threads;
 }
 /**
- * @brief Set the gemv num threads object
+ * @brief Set the gemv num threads
  *
  * @param n num_threads to set
  */
 inline void set_gemv_num_threads(size_t n) { GEMV_NUM_THREADS() = n; }
 /**
- * @brief Get the gemv num threads object
+ * @brief Get the gemv num threads
  *
  * @return size_t num_threads
  */
diff --git a/tools/package_android.sh b/tools/package_android.sh
index 3ef7c5d1ec..98f3af8e31 100755
--- a/tools/package_android.sh
+++ b/tools/package_android.sh
@@ -16,13 +16,13 @@ pushd $TARGET
 if [ ! -d builddir ]; then
     #default value of openblas num threads is 1 for android
     #enable-tflite-interpreter=false is just temporally until ci system is stabel
-  meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true
+  meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Domp-num-threads=1
 else
   echo "warning: $TARGET/builddir has already been taken, this script tries to reconfigure and try building"
   pushd builddir
     #default value of openblas num threads is 1 for android
     #enable-tflite-interpreter=false is just temporally until ci system is stabel  
-    meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true
+    meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Domp-num-threads=1
     meson --wipe
   popd
 fi