Skip to content

Commit 64a489b

Browse files
committed
Added ARM NEON
1 parent 21ce1a4 commit 64a489b

7 files changed

+128
-7
lines changed

CMakeLists.txt

+10-1
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,22 @@ add_library(
3232

3333
include(CheckCXXCompilerFlag)
3434
check_cxx_compiler_flag("-mavx512f -mavx512dq -mavx512vl -mavx512bf16" HAS_AVX512)
35+
# Probe for the AArch64 NEON intrinsics the NEON backend actually uses
# (double-precision lanes and the across-vector add), not just <arm_neon.h>
# being present: a 32-bit ARM toolchain has float32x4_t but lacks these.
# check_cxx_source_compiles() is provided by the CheckCXXSourceCompiles module,
# so include it explicitly rather than relying on another module pulling it in.
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <arm_neon.h>
int main() {
    float64x2_t vec = vdupq_n_f64(0.0);
    return vaddvq_u64(vcgtq_f64(vec, vec)) == 0 ? 0 : 1;
}" HAS_NEON)
3541

3642
# Pick exactly one implementation file for fractal-generator_lib:
# AVX-512, ARM NEON, or the portable fallback.
# NOTE(review): the compiler-flag check earlier in this file stores its result
# in HAS_AVX512, but this condition tests HAS_ALL_AVX512. Unless HAS_ALL_AVX512
# is set somewhere outside this hunk, the AVX-512 branch is never taken.
# Confirm which variable name is intended.
if (HAS_ALL_AVX512)
3743
message(STATUS "AVX-512 is supported by the compiler.")
3844
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mavx512dq -mavx512vl -mavx512bf16")
3945
target_sources(fractal-generator_lib PRIVATE source/mandelbrot/equations_simd.cpp)
46+
# NEON is only considered when the AVX-512 branch above was not selected.
elseif(HAS_NEON)
47+
message(STATUS "ARM NEON is supported by the compiler.")
48+
target_sources(fractal-generator_lib PRIVATE source/mandelbrot/equations_neon.cpp)
4049
# Neither SIMD probe succeeded: build the compatibility implementation.
else()
41-
message(STATUS "AVX-512 is not fully supported by the compiler. AVX-512 will not be enabled.")
50+
message(STATUS "SIMD is not fully supported by the compiler. SIMD will not be enabled.")
4251
target_sources(fractal-generator_lib PRIVATE source/mandelbrot/equations_compat.cpp)
4352
endif()
4453

Taskfile.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ tasks:
6060
dir: .
6161
cmds:
6262
- task: build
63-
- taskset -c 0 ./build/benchmark/fractal_benchmarks --benchmark_repetitions=5 --benchmark_min_warmup_time=0.5 --benchmark_report_aggregates_only=true
63+
- ./build/benchmark/fractal_benchmarks --benchmark_repetitions=5 --benchmark_min_warmup_time=0.5 --benchmark_report_aggregates_only=true
6464

6565
fmt:
6666
dir: .

source/graphics/display_to_complex.hpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ class DisplayToComplexCoordinates {
8989
+ (raw_complex_coord.real() + i) * real_scaling_factor_;
9090
complex.imaginary[i] =
9191
complex_domain_start_.imag()
92-
+ raw_complex_coord.imag() * imaginary_scaling_factor_;
92+
+ (raw_complex_coord.imag()) * imaginary_scaling_factor_;
9393
}
9494
return complex;
9595
}

source/mandelbrot/equations_neon.cpp

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
#include "config.hpp"
2+
#include "equations.hpp"
3+
#include "units/units.hpp"
4+
#include "units/units_avx.hpp"
5+
6+
#include <arm_neon.h>
7+
8+
namespace fractal {
9+
std::array<iteration_count, 2> compute_iterations_neon(
10+
const neon_complex& z_0, const neon_complex& constant, iteration_count max_iters
11+
)
12+
{
13+
static const auto SQUARED_DIVERGENCE =
14+
MANDELBROT_DIVERGENCE_NORM * MANDELBROT_DIVERGENCE_NORM;
15+
16+
alignas(16) std::array<double, 2> reals = z_0.real;
17+
alignas(16) std::array<double, 2> imags = z_0.imaginary;
18+
alignas(16) std::array<double, 2> const_reals = constant.real;
19+
alignas(16) std::array<double, 2> const_imags = constant.imaginary;
20+
21+
float64x2_t input_vec_real = vld1q_f64(reals.data());
22+
float64x2_t input_vec_imag = vld1q_f64(imags.data());
23+
float64x2_t input_vec_constant_reals = vld1q_f64(const_reals.data());
24+
float64x2_t input_vec_constant_imags = vld1q_f64(const_imags.data());
25+
26+
uint64x2_t solved_its_vec = vdupq_n_u64(0);
27+
float64x2_t squared_divergence_vec = vdupq_n_f64(SQUARED_DIVERGENCE);
28+
uint64x2_t active_mask = vdupq_n_u64(~0ULL); // all bits set
29+
30+
for (uint64_t iterations = 0; iterations < max_iters; iterations++) {
31+
// load current values
32+
float64x2_t x = input_vec_real;
33+
float64x2_t y = input_vec_imag;
34+
35+
// compute squares and product
36+
float64x2_t x_squared = vmulq_f64(x, x);
37+
float64x2_t y_squared = vmulq_f64(y, y);
38+
float64x2_t xy = vmulq_f64(x, y);
39+
40+
// Update real part: input_vec_real = x_squared - y_squared + constant_reals
41+
float64x2_t temp_real = vsubq_f64(x_squared, y_squared);
42+
input_vec_real = vaddq_f64(temp_real, input_vec_constant_reals);
43+
44+
// update imaginary part: input_vec_imag = 2 * xy + constant_imags
45+
input_vec_imag = vmlaq_f64(input_vec_constant_imags, xy, vdupq_n_f64(2.0));
46+
47+
// compute squared norms
48+
float64x2_t squared_norms_vec = vaddq_f64(x_squared, y_squared);
49+
50+
// determine which elements have diverged
51+
uint64x2_t solved_mask = vcgtq_f64(squared_norms_vec, squared_divergence_vec);
52+
53+
// update iteration counts for elements that have just diverged
54+
uint64x2_t iteration_vec = vdupq_n_u64(iterations);
55+
solved_its_vec = vbslq_u64(solved_mask, iteration_vec, solved_its_vec);
56+
57+
uint64x2_t not_solved_mask =
58+
veorq_u64(solved_mask, vdupq_n_u64(~0ULL)); // Compute bitwise NOT
59+
active_mask = vandq_u64(active_mask, not_solved_mask);
60+
61+
// Reduce active_mask to check if all lanes are zero
62+
if (vaddvq_u64(active_mask) == 0) [[unlikely]]
63+
break;
64+
}
65+
66+
// // set iteration counts to max_iters where they haven't diverged
67+
uint64x2_t zero_vec = vdupq_n_u64(0);
68+
uint64x2_t zero_mask = vceqq_u64(solved_its_vec, zero_vec);
69+
int64x2_t max_iters_vec = vdupq_n_u64(static_cast<uint64_t>(max_iters));
70+
solved_its_vec = vbslq_u64(zero_mask, max_iters_vec, solved_its_vec);
71+
72+
// store the iteration counts
73+
alignas(16) std::array<uint64_t, 2> ret{};
74+
vst1q_u64(ret.data(), solved_its_vec);
75+
std::array ret2{static_cast<uint16_t>(ret[0]), static_cast<uint16_t>(ret[1])};
76+
77+
return ret2;
78+
}
79+
80+
std::array<iteration_count, 8> compute_iterations(
81+
const avx512_complex& z_0, const avx512_complex& constant, iteration_count max_iters
82+
)
83+
{
84+
std::array<iteration_count, 8> ret{};
85+
auto neons_z0 = to_neon_complex(z_0);
86+
auto neons_const = to_neon_complex(constant);
87+
for (uint8_t i = 0; i < 4; ++i) {
88+
auto [it_count_1, it_count_2] =
89+
compute_iterations_neon(neons_z0[i], neons_const[i], max_iters);
90+
ret[i * 2] = it_count_1;
91+
ret[i * 2 + 1] = it_count_2;
92+
}
93+
return ret;
94+
}
95+
} // namespace fractal

source/mandelbrot/mandelbrot_window.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@ MandelbrotWindow::arr MandelbrotWindow::calculate_(
4949
auto process_chunk = [&](DisplayDomain::DisplayCoordinateIterator start,
5050
DisplayDomain::DisplayCoordinateIterator end) {
5151
for (auto it = start; it != end; it += 8) {
52-
auto pos = *it;
52+
display_coordinate pos = *it;
5353
std::array<float, 8> t = process_coordinates(pos);
54-
for (size_t i = 0; i < 8; i++) {
54+
for (size_t i = 0; i < 8; ++i) {
5555
ret[pos.x++][pos.y] = Percentage{t[i]};
5656
}
5757
}

source/units/coordinates.cpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@ decay_2d_coordinate(const display_coordinate& coordinate, uint32_t display_width
1111
// Builds a display_coordinate from a flat index `coordinate` and the display
// width: the first component is a remainder, the second a quotient.
// NOTE(review): this hunk shows both sides of the change. The removed lines
// (marked "-") used display_width for both the modulus and the divisor. The
// added lines (marked "+") take the remainder modulo (display_width + 1) but
// still divide by display_width, so the two components no longer assume the
// same row length. Confirm whether the divisor should also be
// (display_width + 1) or the modulus should stay display_width.
display_coordinate generate_1d_coordinate(uint32_t coordinate, uint32_t display_width)
1212
{
1313
return {
14-
coordinate % display_width,
15-
(coordinate - coordinate % display_width) / display_width
14+
coordinate % (display_width + 1),
15+
(coordinate - (coordinate % (display_width + 1))) / display_width
1616
};
1717
}
1818
} // namespace fractal

source/units/units_avx.hpp

+17
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,21 @@ struct avx512_complex {
1010
std::array<complex_underlying, 8> imaginary;
1111
};
1212

13+
struct neon_complex {
14+
std::array<complex_underlying, 2> real;
15+
std::array<complex_underlying, 2> imaginary;
16+
};
17+
18+
inline std::array<neon_complex, 4> to_neon_complex(const avx512_complex& complex)
19+
{
20+
std::array<neon_complex, 4> ret{};
21+
for (size_t i = 0; i < 4; ++i) {
22+
ret[i].real[0] = complex.real[i * 2];
23+
ret[i].imaginary[0] = complex.imaginary[i * 2];
24+
ret[i].real[1] = complex.real[(i * 2) + 1];
25+
ret[i].imaginary[1] = complex.imaginary[(i * 2) + 1];
26+
}
27+
return ret;
28+
}
29+
1330
} // namespace fractal

0 commit comments

Comments
 (0)