Skip to content

Commit

Permalink
[ meson ] Add omp setting in meson.build
Browse files Browse the repository at this point in the history
- Get the OMP thread-count option at build time, and allocate threads accordingly.
- Getting the system core count will be added in the near future.

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
  • Loading branch information
skykongkong8 committed Feb 14, 2024
1 parent b2c9e74 commit 32b896f
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 18 deletions.
4 changes: 4 additions & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@ message('set nntrainer num threads=@0@'.format(get_option('nntr-num-threads')))
openmp_dep = dummy_dep
if get_option('enable-openmp')
openmp_dep = dependency('openmp')
# Propagate a positive build-time 'omp-num-threads' option as a compile-time
# -DOMP_NUM_THREADS define so kernels can size their OpenMP thread pools.
if get_option('omp-num-threads') > 0
extra_defines += '-DOMP_NUM_THREADS=@0@'.format(get_option('omp-num-threads'))
message('set nntrainer omp threads=@0@'.format(get_option('omp-num-threads')))
endif
endif

if get_option('enable-profile')
Expand Down
1 change: 1 addition & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ option('tizen-version-minor', type: 'integer', min : 0, max : 9999, value: 0)
option('openblas-num-threads', type: 'integer', min : 0, max : 9999, value: 0)
#This is for the multi-threading in nntrainer
option('nntr-num-threads', type: 'integer', min : 0, max : 9999, value: 1)
# OpenMP thread count baked in as -DOMP_NUM_THREADS; meson.build only emits
# the define when this value is greater than 0.
option('omp-num-threads', type: 'integer', min : 0, max : 9999, value: 1)

# test related option
option('reduce-tolerance', type: 'boolean', value: true)
Expand Down
28 changes: 18 additions & 10 deletions nntrainer/tensor/blas_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@
*/

#include <blas_neon.h>
#include <iostream>
#include <nntrainer_error.h>
#include <omp_setting.h>

namespace nntrainer::neon {

void sgemv_neon(const float *A, const float *X, float *Y, uint32_t rows,
Expand Down Expand Up @@ -702,9 +704,13 @@ void sgemv_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t rows,
void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
uint32_t M, uint32_t K, float alpha,
float beta) {
#ifdef OMP_NUM_THREADS
set_gemv_num_threads(OMP_NUM_THREADS);
#endif
size_t GEMV_NUM_THREADS = get_gemv_num_threads();

float Y32[K];
unsigned int idx = 0;
size_t NEON_NUM_THREADS = get_gemv_num_threads();

for (; K - idx >= 8; idx += 8) {
float32x4_t y0_3_32 = vcvt_f32_f16(vld1_f16(&Y[idx]));
Expand All @@ -728,7 +734,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
for (; M - i >= 8; i += 8) {
__fp16 x[8];
vst1q_f16(&x[0], vmulq_n_f16(vld1q_f16(&A[i]), alpha));
#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS)
for (unsigned int idx = 0; idx < K - 8; idx += 8) {
float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x[0]);
wvec0_7_f16 =
Expand Down Expand Up @@ -846,7 +852,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
for (; M - i >= 4; i += 4) {
__fp16 x[4];
vst1_f16(&x[0], vmul_n_f16(vld1_f16(&A[i]), alpha));
#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS)
for (unsigned int idx = 0; idx < K - 8; idx += 8) {
float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x[0]);
wvec0_7_f16 =
Expand Down Expand Up @@ -930,7 +936,7 @@ void sgemv_transpose_neon_fp16(const __fp16 *A, const __fp16 *X, __fp16 *Y,
}
for (; i < M; ++i) {
__fp16 x = alpha * (X[i]);
#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
#pragma omp parallel for schedule(guided) num_threads(GEMV_NUM_THREADS)
for (unsigned int idx = 0; idx < K - 8; idx += 8) {
float16x8_t wvec0_7_f16 = vmulq_n_f16(vld1q_f16(&A[i * K + idx]), x);
float32x4_t y0_3 = vaddq_f32(vld1q_f32(&Y32[idx]),
Expand Down Expand Up @@ -1400,10 +1406,12 @@ unsigned int isamax_neon_fp16(const unsigned int N, const __fp16 *X) {

return retIdx;
}

void sgemm_neon_fp16(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M,
uint32_t N, uint32_t K, float alpha, float beta,
bool TransA, bool TransB) {
#ifdef OMP_NUM_THREADS
set_gemm_num_threads(OMP_NUM_THREADS);
#endif

// dynamic creation to avoid reaching stack limit(causes segmentation fault)
float *C32 = (float *)malloc(M * N * sizeof(float));
Expand Down Expand Up @@ -1445,14 +1453,14 @@ void sgemm_neon_fp16(const __fp16 *A, const __fp16 *B, __fp16 *C, uint32_t M,
void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
uint32_t M, uint32_t N, uint32_t K, float alpha,
float beta) {
size_t NEON_NUM_THREADS = omp_get_num_threads();
size_t GEMM_NUM_THREADS = get_gemm_num_threads();
unsigned int k = 0;
__fp16 a[16];
for (; (K - k) >= 16; k += 16) {
for (unsigned int m = 0; m < M; m++) {
vst1q_f16(&a[0], vmulq_n_f16(vld1q_f16(&A[m * K + k]), alpha));
vst1q_f16(&a[8], vmulq_n_f16(vld1q_f16(&A[m * K + k + 8]), alpha));
#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
for (unsigned int n = 0; n < N - 8; n += 8) {
float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]);
b0_7_0 = vfmaq_n_f16(b0_7_0, vld1q_f16(&B[(k + 1) * N + n]), a[1]);
Expand Down Expand Up @@ -1583,7 +1591,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
for (unsigned int m = 0; m < M; m++) {
vst1q_f16(a, vmulq_n_f16(vld1q_f16(&A[m * K + k]), alpha));

#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
for (unsigned int n = 0; n < N - 8; n += 8) {
float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]);
b0_7_0 = vfmaq_n_f16(b0_7_0, vld1q_f16(&B[(k + 1) * N + n]), a[1]);
Expand Down Expand Up @@ -1674,7 +1682,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
for (unsigned int m = 0; m < M; m++) {
vst1_f16(a, vmul_n_f16(vld1_f16(&A[m * K + k]), alpha));

#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
for (unsigned int n = 0; n < N - 8; n += 8) {

float16x8_t b0_7_0 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a[0]);
Expand Down Expand Up @@ -1752,7 +1760,7 @@ void sgemm_neon_fp16_noTrans(const __fp16 *A, const __fp16 *B, float *C,
for (unsigned int m = 0; m < M; m++) {
__fp16 a0 = alpha * A[m * K + k];

#pragma omp parallel for schedule(guided) num_threads(NEON_NUM_THREADS)
#pragma omp parallel for schedule(guided) num_threads(GEMM_NUM_THREADS)
for (unsigned int n = 0; n < N - 8; n += 8) {
float16x8_t b0_7 = vmulq_n_f16(vld1q_f16(&B[k * N + n]), a0);

Expand Down
14 changes: 8 additions & 6 deletions nntrainer/tensor/omp_setting.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,18 @@ const int OMP_THRESHOLD = 20000;
* @return size_t& num_threads
*/
inline size_t &GEMM_NUM_THREADS() {
  // Function-local singleton holding the GEMM thread count. Defaults to 1;
  // callers override it via set_gemm_num_threads() (e.g. from the build-time
  // OMP_NUM_THREADS define). The scraped diff retained both the old
  // omp_get_num_threads() initializer and the new one, which would redefine
  // num_threads; keep only the post-commit initialization.
  static size_t num_threads = 1;
  // static size_t num_threads = omp_get_num_threads();
  return num_threads;
}
/**
 * @brief Set the gemm num threads
*
* @param n num_threads to set
*/
inline void set_gemm_num_threads(size_t n) { GEMM_NUM_THREADS() = n; }
/**
 * @brief Get the gemm num threads
*
* @return size_t num_threads
*/
Expand All @@ -46,17 +47,18 @@ inline size_t get_gemm_num_threads() { return GEMM_NUM_THREADS(); }
* @return size_t& num_threads
*/
inline size_t &GEMV_NUM_THREADS() {
  // Function-local singleton holding the GEMV thread count. Defaults to 1;
  // callers override it via set_gemv_num_threads() (e.g. from the build-time
  // OMP_NUM_THREADS define). The scraped diff retained both the old
  // omp_get_num_threads() initializer and the new one, which would redefine
  // num_threads; keep only the post-commit initialization.
  static size_t num_threads = 1;
  // static size_t num_threads = omp_get_num_threads();
  return num_threads;
}
/**
 * @brief Set the gemv num threads
*
* @param n num_threads to set
*/
inline void set_gemv_num_threads(size_t n) { GEMV_NUM_THREADS() = n; }
/**
 * @brief Get the gemv num threads
*
* @return size_t num_threads
*/
Expand Down
4 changes: 2 additions & 2 deletions tools/package_android.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ pushd $TARGET
if [ ! -d builddir ]; then
#default value of openblas num threads is 1 for android
#enable-tflite-interpreter=false is just temporary until the CI system is stable
meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true
meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Domp-num-threads=1
else
echo "warning: $TARGET/builddir has already been taken, this script tries to reconfigure and try building"
pushd builddir
#default value of openblas num threads is 1 for android
#enable-tflite-interpreter=false is just temporary until the CI system is stable
meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true
meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Domp-num-threads=1
meson --wipe
popd
fi
Expand Down

0 comments on commit 32b896f

Please sign in to comment.