Skip to content

Commit 2ca807d

Browse files
committed
Default to Accelerate for SIMD math on macOS
1 parent 125e65d commit 2ca807d

8 files changed

Lines changed: 463 additions & 111 deletions

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,12 @@ function(miniexpr_setup_target target_name)
5858
else()
5959
target_compile_definitions(${target_name} PRIVATE ME_USE_SLEEF=0)
6060
endif()
61+
if(APPLE AND MINIEXPR_USE_ACCELERATE)
62+
target_compile_definitions(${target_name} PRIVATE ME_USE_ACCELERATE=1)
63+
target_link_libraries(${target_name} PRIVATE "-framework Accelerate")
64+
else()
65+
target_compile_definitions(${target_name} PRIVATE ME_USE_ACCELERATE=0)
66+
endif()
6167
if(MINIEXPR_ENABLE_TCC_JIT)
6268
target_compile_definitions(${target_name} PRIVATE ME_USE_LIBTCC_FALLBACK=1)
6369
if(DEFINED MINIEXPR_TINYCC_STAGED_SHARED_PATH)

README_DEVELOPERS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ The main [README.md](README.md) keeps the simplest supported build path. This se
8080
- `-DMINIEXPR_BUILD_EXAMPLES=ON|OFF`
8181
- `-DMINIEXPR_BUILD_BENCH=ON|OFF`
8282
- `-DMINIEXPR_USE_SLEEF=ON|OFF`
83+
- `-DMINIEXPR_USE_ACCELERATE=ON|OFF` (macOS only)
8384
- `-DMINIEXPR_ENABLE_TCC_JIT=ON|OFF`
8485
- `-DMINIEXPR_BUILD_BUNDLED_LIBTCC=ON|OFF` (build bundled libtcc from minicc when TCC JIT is enabled)
8586
- `-DMINIEXPR_DSL_TRACE_DEFAULT=ON|OFF` (emit DSL trace logs by default when `ME_DSL_TRACE` is unset)
@@ -91,6 +92,8 @@ The main [README.md](README.md) keeps the simplest supported build path. This se
9192
- On Emscripten, setting `MINIEXPR_ENABLE_TCC_JIT=ON` enables wasm32 JIT support automatically.
9293
- Setting `MINIEXPR_ENABLE_TCC_JIT=OFF` disables TCC-based JIT backends; on Linux/macOS, the separate `# me:compiler=cc` runtime path may still be available.
9394
- `MINIEXPR_USE_SLEEF=ON` fetches SLEEF and enables SIMD math acceleration; set it to `OFF` to build without SLEEF.
95+
- `MINIEXPR_USE_ACCELERATE=ON` enables the macOS Accelerate/vForce backend; the option has no effect on other platforms. In `auto` mode on macOS, Accelerate is the preferred backend by default, and functions without Accelerate coverage still fall back to scalar kernels.
96+
- When the Accelerate backend is active (selected by `auto` on macOS, or forced with `ME_SIMD_MATH_BACKEND=accelerate`), the `ME_SIMD_ULP_1` / `ME_SIMD_ULP_3_5` distinction does not select different kernels. Those accuracy modes remain meaningful for the SLEEF backend.
9497

9598
### Alternative Build Invocations
9699

@@ -118,6 +121,8 @@ The public/runtime-stable DSL JIT controls remain documented in [README.md](READ
118121

119122
### Internal/Test-Only Environment Variables
120123

124+
- `ME_SIMD_MATH_BACKEND=auto|sleef|accelerate|scalar`: Force the SIMD math backend used by `src/functions-simd.c`, for benchmarking and debugging. Default: `auto`, which selects `accelerate` on macOS when the library was built with `MINIEXPR_USE_ACCELERATE=ON` and otherwise keeps the existing platform backend selection.
125+
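- The SIMD math benchmarks choose their report columns from the `ME_SIMD_MATH_BACKEND` value: `ME_U10`/`ME_U35`/`ME_SCAL`/`C` when it is unset, empty, `auto`, or `sleef`; `ME_ACCEL`/`ME_SCAL`/`C` for `accelerate`; and `ME_SCAL`/`C` for any other value. Because the choice is made from the variable alone, an unset or `auto` run still prints `ME_U10`/`ME_U35` columns even when `auto` resolves to Accelerate; in that case, and whenever the `accelerate` or `scalar` backend is in use, do not interpret the `ME_SIMD_ULP_1` / `ME_SIMD_ULP_3_5` labels as distinct math implementations.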
121126
- `ME_DSL_WHILE_MAX_ITERS=<n>`: Override the runtime safety cap for DSL `while` loops.
122127
- `ME_DSL_JIT_MATH_BRIDGE=0|1`: Enable or disable runtime math-bridge lowering globally. Default: `1`.
123128
- `ME_DSL_JIT_SCALAR_MATH_BRIDGE=0|1`: Enable scalar math-bridge lowering for the `cc` backend. Default: `0`.

RELEASE_NOTES.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ Release notes for miniexpr
44
Changes from 0.2.0 to 0.2.1
55
===========================
66

7-
** add blurb here **
7+
* macOS SIMD math now prefers Accelerate/vForce by default when enabled at build time.
8+
- New CMake option: `MINIEXPR_USE_ACCELERATE=ON|OFF` (enabled by default on macOS, off elsewhere).
9+
- In runtime `auto` mode, macOS uses Accelerate first and falls back to scalar kernels for functions without Accelerate coverage.
10+
- Added `ME_SIMD_MATH_BACKEND=auto|sleef|accelerate|scalar` for backend forcing during debugging and benchmarking.
11+
- Updated SIMD math benchmarks to report backend-appropriate columns instead of implying fake `U10/U35` distinctions for Accelerate/scalar runs.
812

913
Changes from 0.1.0 to 0.2.0
1014
===========================

bench/benchmark_sincos.c

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <stdint.h>
88
#include <stdbool.h>
99
#include <math.h>
10+
#include <string.h>
1011
#include <sys/time.h>
1112
#include "miniexpr.h"
1213

@@ -22,6 +23,22 @@ typedef struct {
2223
size_t elem_size;
2324
} dtype_info_t;
2425

26+
typedef enum {
27+
BENCH_REPORT_U35 = 0,
28+
BENCH_REPORT_ACCELERATE = 1,
29+
BENCH_REPORT_SCALAR = 2
30+
} bench_report_mode_t;
31+
32+
static bench_report_mode_t get_report_mode(const char *backend) {
33+
if (!backend || backend[0] == '\0' || strcmp(backend, "auto") == 0 || strcmp(backend, "sleef") == 0) {
34+
return BENCH_REPORT_U35;
35+
}
36+
if (strcmp(backend, "accelerate") == 0) {
37+
return BENCH_REPORT_ACCELERATE;
38+
}
39+
return BENCH_REPORT_SCALAR;
40+
}
41+
2542
static void fill_data(void *data, const dtype_info_t *info, int nitems) {
2643
if (info->dtype == ME_FLOAT32) {
2744
float *f = (float *)data;
@@ -89,7 +106,7 @@ static double run_c(const void *data, void *out, int nitems,
89106
return (get_time() - start) / iterations;
90107
}
91108

92-
static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nblocks) {
109+
static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nblocks, bench_report_mode_t report_mode) {
93110
int max_block = 0;
94111
for (int i = 0; i < nblocks; i++) {
95112
if (blocks[i] > max_block) {
@@ -125,7 +142,13 @@ static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nbl
125142
printf("\n========================================\n");
126143
printf("sin**2 + cos**2 (%s)\n", info->name);
127144
printf("========================================\n");
128-
printf("BlockKiB ME_U10 ME_U35 ME_SCAL C\n");
145+
if (report_mode == BENCH_REPORT_U35) {
146+
printf("BlockKiB ME_U10 ME_U35 ME_SCAL C\n");
147+
} else if (report_mode == BENCH_REPORT_ACCELERATE) {
148+
printf("BlockKiB ME_ACCEL ME_SCAL C\n");
149+
} else {
150+
printf("BlockKiB ME_SCAL C\n");
151+
}
129152

130153
me_eval_params params_u10 = ME_EVAL_PARAMS_DEFAULTS;
131154
params_u10.simd_ulp_mode = ME_SIMD_ULP_1;
@@ -148,8 +171,16 @@ static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nbl
148171
double c_gbps = data_gb / c_time;
149172

150173
int kib = (int)((nitems * info->elem_size) / 1024);
151-
printf("%6d %7.2f %7.2f %7.2f %7.2f\n",
152-
kib, me_gbps_u10, me_gbps_u35, me_scalar_gbps, c_gbps);
174+
if (report_mode == BENCH_REPORT_U35) {
175+
printf("%6d %7.2f %7.2f %7.2f %7.2f\n",
176+
kib, me_gbps_u10, me_gbps_u35, me_scalar_gbps, c_gbps);
177+
} else if (report_mode == BENCH_REPORT_ACCELERATE) {
178+
printf("%6d %8.2f %7.2f %7.2f\n",
179+
kib, me_gbps_u10, me_scalar_gbps, c_gbps);
180+
} else {
181+
printf("%6d %7.2f %7.2f\n",
182+
kib, me_scalar_gbps, c_gbps);
183+
}
153184
}
154185

155186
me_free(expr);
@@ -158,6 +189,8 @@ static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nbl
158189
}
159190

160191
int main(void) {
192+
const char *backend = getenv("ME_SIMD_MATH_BACKEND");
193+
const bench_report_mode_t report_mode = get_report_mode(backend);
161194
const dtype_info_t infos[] = {
162195
{"float32", ME_FLOAT32, sizeof(float)},
163196
{"float64", ME_FLOAT64, sizeof(double)}
@@ -168,10 +201,11 @@ int main(void) {
168201
printf("========================================\n");
169202
printf("MiniExpr sin/cos Benchmark (Block Sizes)\n");
170203
printf("========================================\n");
204+
printf("Backend: %s\n", backend ? backend : "auto");
171205
printf("Expression: sin(a)**2 + cos(a)**2\n");
172206

173207
for (size_t i = 0; i < sizeof(infos) / sizeof(infos[0]); i++) {
174-
benchmark_dtype(&infos[i], blocks, nblocks);
208+
benchmark_dtype(&infos[i], blocks, nblocks, report_mode);
175209
}
176210

177211
printf("\n========================================\n");

bench/benchmark_transcendentals.c

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <stdint.h>
88
#include <stdio.h>
99
#include <stdlib.h>
10+
#include <string.h>
1011
#include <sys/time.h>
1112
#include "miniexpr.h"
1213

@@ -22,6 +23,22 @@ typedef struct {
2223
size_t elem_size;
2324
} dtype_info_t;
2425

26+
typedef enum {
27+
BENCH_REPORT_U35 = 0,
28+
BENCH_REPORT_ACCELERATE = 1,
29+
BENCH_REPORT_SCALAR = 2
30+
} bench_report_mode_t;
31+
32+
static bench_report_mode_t get_report_mode(const char *backend) {
33+
if (!backend || backend[0] == '\0' || strcmp(backend, "auto") == 0 || strcmp(backend, "sleef") == 0) {
34+
return BENCH_REPORT_U35;
35+
}
36+
if (strcmp(backend, "accelerate") == 0) {
37+
return BENCH_REPORT_ACCELERATE;
38+
}
39+
return BENCH_REPORT_SCALAR;
40+
}
41+
2542
static void fill_data(void *data, const dtype_info_t *info, int nitems) {
2643
const double min = -5.0;
2744
const double max = 5.0;
@@ -94,7 +111,7 @@ static double run_c(const void *data, void *out, int nitems,
94111
return (get_time() - start) / iterations;
95112
}
96113

97-
static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nblocks) {
114+
static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nblocks, bench_report_mode_t report_mode) {
98115
const int max_block = blocks[nblocks - 1];
99116
void *data = malloc((size_t)max_block * info->elem_size);
100117
void *out = malloc((size_t)max_block * info->elem_size);
@@ -124,7 +141,13 @@ static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nbl
124141
printf("\n========================================\n");
125142
printf("Transcendentals chain (%s)\n", info->name);
126143
printf("========================================\n");
127-
printf("BlockKiB ME_U10 ME_U35 ME_SCAL C\n");
144+
if (report_mode == BENCH_REPORT_U35) {
145+
printf("BlockKiB ME_U10 ME_U35 ME_SCAL C\n");
146+
} else if (report_mode == BENCH_REPORT_ACCELERATE) {
147+
printf("BlockKiB ME_ACCEL ME_SCAL C\n");
148+
} else {
149+
printf("BlockKiB ME_SCAL C\n");
150+
}
128151

129152
me_eval_params params_u10 = ME_EVAL_PARAMS_DEFAULTS;
130153
params_u10.simd_ulp_mode = ME_SIMD_ULP_1;
@@ -147,8 +170,16 @@ static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nbl
147170
double c_gbps = data_gb / c_time;
148171

149172
int kib = (int)((nitems * info->elem_size) / 1024);
150-
printf("%6d %7.2f %7.2f %7.2f %7.2f\n",
151-
kib, me_gbps_u10, me_gbps_u35, me_scalar_gbps, c_gbps);
173+
if (report_mode == BENCH_REPORT_U35) {
174+
printf("%6d %7.2f %7.2f %7.2f %7.2f\n",
175+
kib, me_gbps_u10, me_gbps_u35, me_scalar_gbps, c_gbps);
176+
} else if (report_mode == BENCH_REPORT_ACCELERATE) {
177+
printf("%6d %8.2f %7.2f %7.2f\n",
178+
kib, me_gbps_u10, me_scalar_gbps, c_gbps);
179+
} else {
180+
printf("%6d %7.2f %7.2f\n",
181+
kib, me_scalar_gbps, c_gbps);
182+
}
152183
}
153184

154185
me_free(expr);
@@ -157,6 +188,8 @@ static void benchmark_dtype(const dtype_info_t *info, const int *blocks, int nbl
157188
}
158189

159190
int main(void) {
191+
const char *backend = getenv("ME_SIMD_MATH_BACKEND");
192+
const bench_report_mode_t report_mode = get_report_mode(backend);
160193
const dtype_info_t infos[] = {
161194
{"float32", ME_FLOAT32, sizeof(float)},
162195
{"float64", ME_FLOAT64, sizeof(double)}
@@ -167,10 +200,11 @@ int main(void) {
167200
printf("========================================\n");
168201
printf("MiniExpr Transcendentals Benchmark (Block Sizes)\n");
169202
printf("========================================\n");
203+
printf("Backend: %s\n", backend ? backend : "auto");
170204
printf("Expression: log(exp(x) + tanh(x) + log1p(abs(x)) + sqrt(abs(x)) + expm1(x))\n");
171205

172206
for (size_t i = 0; i < sizeof(infos) / sizeof(infos[0]); i++) {
173-
benchmark_dtype(&infos[i], blocks, nblocks);
207+
benchmark_dtype(&infos[i], blocks, nblocks, report_mode);
174208
}
175209

176210
printf("\n========================================\n");

cmake/MiniexprOptions.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ option(MINIEXPR_BUILD_TESTS "Build tests" ON)
44
option(MINIEXPR_BUILD_EXAMPLES "Build examples" ON)
55
option(MINIEXPR_BUILD_BENCH "Build benchmarks" ON)
66
option(MINIEXPR_USE_SLEEF "Enable SLEEF SIMD acceleration" ON)
7+
if(APPLE)
8+
option(MINIEXPR_USE_ACCELERATE "Enable macOS Accelerate/vForce math acceleration" ON)
9+
else()
10+
option(MINIEXPR_USE_ACCELERATE "Enable macOS Accelerate/vForce math acceleration" OFF)
11+
endif()
712
option(MINIEXPR_ENABLE_TCC_JIT "Enable TCC-based JIT backends (libtcc/wasm32)" ON)
813
option(MINIEXPR_BUILD_BUNDLED_LIBTCC "Build bundled libtcc from minicc sources when TCC JIT is enabled" ON)
914
option(MINIEXPR_DSL_TRACE_DEFAULT "Enable DSL trace logs by default when ME_DSL_TRACE is unset" OFF)

0 commit comments

Comments
 (0)