|
| 1 | +#include "config.hpp" |
| 2 | +#include "equations.hpp" |
| 3 | +#include "units/units.hpp" |
| 4 | +#include "units/units_avx.hpp" |
| 5 | + |
| 6 | +#include <arm_neon.h> |
| 7 | + |
| 8 | +namespace fractal { |
/// Iterate z <- z^2 + c for two complex points at once using NEON
/// double-precision lanes.
///
/// @param z_0       starting values for the two lanes
/// @param constant  the additive constant `c` for each lane
/// @param max_iters iteration budget per lane
/// @return per-lane count of iterations until |z|^2 exceeded the divergence
///         bound, or `max_iters` for lanes that never diverged
std::array<iteration_count, 2> compute_iterations_neon(
    const neon_complex& z_0, const neon_complex& constant, iteration_count max_iters
)
{
    static const auto SQUARED_DIVERGENCE =
        MANDELBROT_DIVERGENCE_NORM * MANDELBROT_DIVERGENCE_NORM;

    alignas(16) std::array<double, 2> reals = z_0.real;
    alignas(16) std::array<double, 2> imags = z_0.imaginary;
    alignas(16) std::array<double, 2> const_reals = constant.real;
    alignas(16) std::array<double, 2> const_imags = constant.imaginary;

    float64x2_t input_vec_real = vld1q_f64(reals.data());
    float64x2_t input_vec_imag = vld1q_f64(imags.data());
    float64x2_t input_vec_constant_reals = vld1q_f64(const_reals.data());
    float64x2_t input_vec_constant_imags = vld1q_f64(const_imags.data());

    uint64x2_t solved_its_vec = vdupq_n_u64(0);
    float64x2_t squared_divergence_vec = vdupq_n_f64(SQUARED_DIVERGENCE);
    uint64x2_t active_mask = vdupq_n_u64(~0ULL); // all lanes still iterating

    for (uint64_t iterations = 0; iterations < max_iters; iterations++) {
        // load current values
        float64x2_t x = input_vec_real;
        float64x2_t y = input_vec_imag;

        // compute squares and product
        float64x2_t x_squared = vmulq_f64(x, x);
        float64x2_t y_squared = vmulq_f64(y, y);
        float64x2_t xy = vmulq_f64(x, y);

        // update real part: input_vec_real = x_squared - y_squared + constant_reals
        float64x2_t temp_real = vsubq_f64(x_squared, y_squared);
        input_vec_real = vaddq_f64(temp_real, input_vec_constant_reals);

        // update imaginary part: input_vec_imag = 2 * xy + constant_imags
        input_vec_imag = vmlaq_f64(input_vec_constant_imags, xy, vdupq_n_f64(2.0));

        // compute squared norms of the pre-update values
        float64x2_t squared_norms_vec = vaddq_f64(x_squared, y_squared);

        // determine which elements are above the divergence bound
        uint64x2_t solved_mask = vcgtq_f64(squared_norms_vec, squared_divergence_vec);

        // Record the iteration count ONLY for lanes diverging for the first
        // time (still active AND now above the bound). Without the
        // active_mask, an already-diverged lane keeps having its count
        // overwritten with later iteration values until its coordinates
        // degrade to NaN (NaN compares false), yielding wrong counts.
        uint64x2_t newly_solved = vandq_u64(solved_mask, active_mask);
        uint64x2_t iteration_vec = vdupq_n_u64(iterations);
        solved_its_vec = vbslq_u64(newly_solved, iteration_vec, solved_its_vec);

        // deactivate diverged lanes: active_mask &= ~solved_mask
        active_mask = vbicq_u64(active_mask, solved_mask);

        // stop early once both lanes have diverged (mask reduces to zero)
        if (vaddvq_u64(active_mask) == 0) [[unlikely]]
            break;
    }

    // Lanes still active never diverged: give them max_iters. Keying on
    // active_mask (rather than a solved_its_vec == 0 test) keeps a lane that
    // legitimately diverged at iteration 0 at a count of 0.
    uint64x2_t max_iters_vec = vdupq_n_u64(static_cast<uint64_t>(max_iters));
    solved_its_vec = vbslq_u64(active_mask, max_iters_vec, solved_its_vec);

    // store the iteration counts and narrow to iteration_count
    alignas(16) std::array<uint64_t, 2> ret{};
    vst1q_u64(ret.data(), solved_its_vec);
    std::array ret2{static_cast<uint16_t>(ret[0]), static_cast<uint16_t>(ret[1])};

    return ret2;
}
| 79 | + |
| 80 | +std::array<iteration_count, 8> compute_iterations( |
| 81 | + const avx512_complex& z_0, const avx512_complex& constant, iteration_count max_iters |
| 82 | +) |
| 83 | +{ |
| 84 | + std::array<iteration_count, 8> ret{}; |
| 85 | + auto neons_z0 = to_neon_complex(z_0); |
| 86 | + auto neons_const = to_neon_complex(constant); |
| 87 | + for (uint8_t i = 0; i < 4; ++i) { |
| 88 | + auto [it_count_1, it_count_2] = |
| 89 | + compute_iterations_neon(neons_z0[i], neons_const[i], max_iters); |
| 90 | + ret[i * 2] = it_count_1; |
| 91 | + ret[i * 2 + 1] = it_count_2; |
| 92 | + } |
| 93 | + return ret; |
| 94 | +} |
| 95 | +} // namespace fractal |
0 commit comments