|
| 1 | +#include "config.hpp" |
| 2 | +#include "equations.hpp" |
| 3 | +#include "units/units.hpp" |
| 4 | +#include "units/units_avx.hpp" |
| 5 | + |
| 6 | +#include <arm_neon.h> |
| 7 | + |
| 8 | +namespace fractal { |
/// Iterate z <- z^2 + c for two complex points at once using NEON
/// double-precision lanes.
///
/// @param z_0       starting values for the two lanes
/// @param constant  the additive constant `c` for each lane
/// @param max_iters iteration budget per lane
/// @return per-lane count of iterations until |z|^2 exceeded the divergence
///         bound, or `max_iters` for lanes that never diverged
std::array<iteration_count, 2> compute_iterations_neon(
    const neon_complex& z_0, const neon_complex& constant, iteration_count max_iters
)
{
    static const auto SQUARED_DIVERGENCE =
        MANDELBROT_DIVERGENCE_NORM * MANDELBROT_DIVERGENCE_NORM;

    alignas(16) std::array<double, 2> reals = z_0.real;
    alignas(16) std::array<double, 2> imags = z_0.imaginary;
    alignas(16) std::array<double, 2> const_reals = constant.real;
    alignas(16) std::array<double, 2> const_imags = constant.imaginary;

    float64x2_t input_vec_real = vld1q_f64(reals.data());
    float64x2_t input_vec_imag = vld1q_f64(imags.data());
    float64x2_t input_vec_constant_reals = vld1q_f64(const_reals.data());
    float64x2_t input_vec_constant_imags = vld1q_f64(const_imags.data());

    uint64x2_t solved_its_vec = vdupq_n_u64(0);
    float64x2_t squared_divergence_vec = vdupq_n_f64(SQUARED_DIVERGENCE);
    uint64x2_t active_mask = vdupq_n_u64(~0ULL); // all lanes still iterating

    for (uint64_t iterations = 0; iterations < max_iters; iterations++) {
        // load current values
        float64x2_t x = input_vec_real;
        float64x2_t y = input_vec_imag;

        // compute squares and product
        float64x2_t x_squared = vmulq_f64(x, x);
        float64x2_t y_squared = vmulq_f64(y, y);
        float64x2_t xy = vmulq_f64(x, y);

        // update real part: input_vec_real = x_squared - y_squared + constant_reals
        float64x2_t temp_real = vsubq_f64(x_squared, y_squared);
        input_vec_real = vaddq_f64(temp_real, input_vec_constant_reals);

        // update imaginary part: input_vec_imag = 2 * xy + constant_imags
        input_vec_imag = vmlaq_f64(input_vec_constant_imags, xy, vdupq_n_f64(2.0));

        // compute squared norms of the pre-update values
        float64x2_t squared_norms_vec = vaddq_f64(x_squared, y_squared);

        // determine which elements are above the divergence bound
        uint64x2_t solved_mask = vcgtq_f64(squared_norms_vec, squared_divergence_vec);

        // Record the iteration count ONLY for lanes diverging for the first
        // time (still active AND now above the bound). Without the
        // active_mask, an already-diverged lane keeps having its count
        // overwritten with later iteration values until its coordinates
        // degrade to NaN (NaN compares false), yielding wrong counts.
        uint64x2_t newly_solved = vandq_u64(solved_mask, active_mask);
        uint64x2_t iteration_vec = vdupq_n_u64(iterations);
        solved_its_vec = vbslq_u64(newly_solved, iteration_vec, solved_its_vec);

        // deactivate diverged lanes: active_mask &= ~solved_mask
        active_mask = vbicq_u64(active_mask, solved_mask);

        // stop early once both lanes have diverged (mask reduces to zero)
        if (vaddvq_u64(active_mask) == 0) [[unlikely]]
            break;
    }

    // Lanes still active never diverged: give them max_iters. Keying on
    // active_mask (rather than a solved_its_vec == 0 test) keeps a lane that
    // legitimately diverged at iteration 0 at a count of 0.
    uint64x2_t max_iters_vec = vdupq_n_u64(static_cast<uint64_t>(max_iters));
    solved_its_vec = vbslq_u64(active_mask, max_iters_vec, solved_its_vec);

    // store the iteration counts and narrow to iteration_count
    alignas(16) std::array<uint64_t, 2> ret{};
    vst1q_u64(ret.data(), solved_its_vec);
    std::array ret2{static_cast<uint16_t>(ret[0]), static_cast<uint16_t>(ret[1])};

    return ret2;
}
| 79 | + |
| 80 | +std::array<iteration_count, 8> compute_iterations( |
| 81 | + const avx512_complex& z_0, const avx512_complex& constant, iteration_count max_iters |
| 82 | +) |
| 83 | +{ |
| 84 | + std::array<iteration_count, 8> ret{}; |
| 85 | + auto neons_z0 = to_neon_complex(z_0); |
| 86 | + auto neons_const = to_neon_complex(constant); |
| 87 | + for (uint8_t i = 0; i < 4; ++i) { |
| 88 | + auto [it_count_1, it_count_2] = |
| 89 | + compute_iterations_neon(neons_z0[i], neons_const[i], max_iters); |
| 90 | + ret[i * 2] = it_count_1; |
| 91 | + ret[i * 2 + 1] = it_count_2; |
| 92 | + } |
| 93 | + return ret; |
| 94 | +} |
| 95 | +} // namespace fractal |
0 commit comments