From 32b896faa823e7afb4af7de2eaa1d8f0089306bb Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Wed, 14 Feb 2024 10:46:14 +0900 Subject: [PATCH] [ meson ] Add omp setting in meson.build - get omp thread num option when build, and allocate thread accordingly. - getting system core num will be added in the near future **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- meson.build | 4 ++++ meson_options.txt | 1 + nntrainer/tensor/blas_neon.cpp | 28 ++++++++++++++++++---------- nntrainer/tensor/omp_setting.h | 14 ++++++++------ tools/package_android.sh | 4 ++-- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/meson.build b/meson.build index 35be7c16f7..1c9e73a280 100644 --- a/meson.build +++ b/meson.build @@ -207,6 +207,10 @@ message('set nntrainer num threads=@0@'.format(get_option('nntr-num-threads'))) openmp_dep = dummy_dep if get_option('enable-openmp') openmp_dep = dependency('openmp') + if get_option('omp-num-threads') > 0 + extra_defines += '-DOMP_NUM_THREADS=@0@'.format(get_option('omp-num-threads')) + message('set nntrainer omp threads=@0@'.format(get_option('omp-num-threads'))) + endif endif if get_option('enable-profile') diff --git a/meson_options.txt b/meson_options.txt index 5674160eba..1e661b9a82 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -27,6 +27,7 @@ option('tizen-version-minor', type: 'integer', min : 0, max : 9999, value: 0) option('openblas-num-threads', type: 'integer', min : 0, max : 9999, value: 0) #This is for the multi-threading in nntrainer option('nntr-num-threads', type: 'integer', min : 0, max : 9999, value: 1) +option('omp-num-threads', type: 'integer', min : 0, max : 9999, value: 1) # test related option option('reduce-tolerance', type: 'boolean', value: true) diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index c2867b325c..fa683a84c1 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -13,8 +13,10 @@ */ #include +#include #include #include + namespace nntrainer::neon { void sgemv_neon(const float *A, const float *X, float *Y, uint32_t rows, @@ -702,9 +704,13 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows, void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t K, float alpha, float beta) { +#ifdef OMP_NUM_THREADS + set_gemv_num_threads(OMP_NUM_THREADS); +#endif + size_t GEMV_NUM_THREADS = get_gemv_num_threads(); + float Y32[K]; unsigned int idx = 0; - size_t NEON_NUM_THREADS = get_gemv_num_threads(); for (; K - idx >= 8; idx += 8) { float32x4_t y0_3_32 = vcvt_f32_f16(vld1_f16(&Y[idx])); @@ -728,7 +734,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, for (; M - i >= 8; i += 8) { __fp16 x[8]; vst1q_f16(&x[0], vmulq_n_f16(vld1q_f16(&A[i]), alpha)); -#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS) +#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS) for (unsigned int idx = 0; idx < K - 8; idx += 8) { float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x[0]); wvec0_7_f16 = @@ -846,7 +852,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, for (; M - i >= 4; i += 4) { __fp16 x[4]; vst1_f16(&x[0], vmul_n_f16(vld1_f16(&A[i]), alpha)); -#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS) +#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS) for (unsigned int idx = 0; idx < K - 8; idx += 8) { float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x[0]); wvec0_7_f16 = @@ -930,7 +936,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, } for (; i < M; ++i) { __fp16 x = alpha * (X[i]); -#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS) +#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS) for (unsigned int idx = 0; idx < K - 8; idx += 8) { float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x); float32x4_t y0_3 = vaddq_f32(vld1q_f32(&Y32[idx]), @@ -1400,10 +1406,12 @@ unsigned int isamax_neon_fp16(const unsigned int N, const __fp16 *X) { return retIdx; } - void sgemm_neon_fp16(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, uint32_t N, uint32_t K, float alpha, float beta, bool TransA, bool TransB) { +#ifdef OMP_NUM_THREADS + set_gemm_num_threads(OMP_NUM_THREADS); +#endif // dynamic creation to avoid reaching stack limit(causes segmentation fault) float *C32 = (float *)malloc(M * N * sizeof(float)); @@ -1445,14 +1453,14 @@ void sgemm_neon_fp16(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M, void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, uint32_t N, uint32_t K, float alpha, float beta) { - size_t NEON_NUM_THREADS = omp_get_num_threads(); + size_t GEMM_NUM_THREADS = get_gemm_num_threads(); unsigned int k = 0; __fp16 a[16]; for (; (K - k) >= 16; k += 16) { for (unsigned int m = 0; m < M; m++) { vst1q_f16(&a[0], vmulq_n_f16(vld1q_f16(&A[m * K + k]), alpha)); vst1q_f16(&a[8], vmulq_n_f16(vld1q_f16(&A[m * K + k + 8]), alpha)); -#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS) +#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS) for (unsigned int n = 0; n < N - 8; n += 8) { float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]); b0_7_0 = vfmaq_n_f16(b0_7_0, vld1q_f16(&B[(k + 1) * N + n]), a[1]); @@ -1583,7 +1591,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C, for (unsigned int m = 0; m < M; m++) { vst1q_f16(a, vmulq_n_f16(vld1q_f16(&A[m * K + k]), alpha)); -#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS) +#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS) for (unsigned int n = 0; n < N - 8; n += 8) { float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]); b0_7_0 = vfmaq_n_f16(b0_7_0, vld1q_f16(&B[(k + 1) * N + n]), a[1]); @@ -1674,7 +1682,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C, for (unsigned int m = 0; m < M; m++) { vst1_f16(a, vmul_n_f16(vld1_f16(&A[m * K + k]), alpha)); -#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS) +#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS) for (unsigned int n = 0; n < N - 8; n += 8) { float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]); @@ -1752,7 +1760,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C, for (unsigned int m = 0; m < M; m++) { __fp16 a0 = alpha * A[m * K + k]; -#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS) +#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS) for (unsigned int n = 0; n < N - 8; n += 8) { float16x8_t b0_7 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a0); diff --git a/nntrainer/tensor/omp_setting.h b/nntrainer/tensor/omp_setting.h index 9d02852cc1..00538eb79f 100644 --- a/nntrainer/tensor/omp_setting.h +++ b/nntrainer/tensor/omp_setting.h @@ -25,17 +25,18 @@ const int OMP_THRESHOLD = 20000; * @return size_t& num_threads */ inline size_t &GEMM_NUM_THREADS() { - static size_t num_threads = omp_get_num_threads(); + static size_t num_threads = 1; + // static size_t num_threads = omp_get_num_threads(); return num_threads; } /** - * @brief Set the gemm num threads object + * @brief Set the gemm num threads * * @param n num_threads to set */ inline void set_gemm_num_threads(size_t n) { GEMM_NUM_THREADS() = n; } /** - * @brief Get the gemm num threads object + * @brief Get the gemm num threads * * @return size_t num_threads */ @@ -46,17 +47,18 @@ inline size_t get_gemm_num_threads() { return GEMM_NUM_THREADS(); } * @return size_t& num_threads */ inline size_t &GEMV_NUM_THREADS() { - static size_t num_threads = omp_get_num_threads(); + static size_t num_threads = 1; + // static size_t num_threads = omp_get_num_threads(); return num_threads; } /** - * @brief Set the gemv num threads object + * @brief Set the gemv num threads * * @param n num_threads to set */ inline void set_gemv_num_threads(size_t n) { GEMV_NUM_THREADS() = n; } /** - * @brief Get the gemv num threads object + * @brief Get the gemv num threads * * @return size_t num_threads */ diff --git a/tools/package_android.sh b/tools/package_android.sh index 3ef7c5d1ec..98f3af8e31 100755 --- a/tools/package_android.sh +++ b/tools/package_android.sh @@ -16,13 +16,13 @@ pushd $TARGET if [ ! -d builddir ]; then #default value of openblas num threads is 1 for android #enable-tflite-interpreter=false is just temporally until ci system is stabel - meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true + meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Domp-num-threads=1 else echo "warning: $TARGET/builddir has already been taken, this script tries to reconfigure and try building" pushd builddir #default value of openblas num threads is 1 for android #enable-tflite-interpreter=false is just temporally until ci system is stabel - meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true + meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Domp-num-threads=1 meson --wipe popd fi