@@ -57,6 +57,9 @@ D DemoteTo(T t, float maxColors);
57
57
template <typename T>
58
58
T CubicBSpline (T t);
59
59
60
+ template <typename T>
61
+ T LanczosWindow (T x, const T a);
62
+
60
63
#if __arm64__
61
64
#include < arm_neon.h>
62
65
@@ -71,6 +74,58 @@ static inline float32x4_t Cos(const float32x4_t d) {
71
74
return vmlaq_f32 (C0, x2, vmlaq_f32 (C1, x2, vmlaq_n_f32 (C2, x2, C3)));
72
75
}
73
76
77
+ /* * Sin polynomial coefficients */
78
+ constexpr float te_sin_coeff2 = 0 .166666666666f ; // 1/(2*3)
79
+ constexpr float te_sin_coeff3 = 0 .05f ; // 1/(4*5)
80
+ constexpr float te_sin_coeff4 = 0 .023809523810f ; // 1/(6*7)
81
+ constexpr float te_sin_coeff5 = 0 .013888888889f ; // 1/(8*9)
82
+
83
+ __attribute__ ((always_inline))
84
+ inline float32x4_t vsinq_f32(float32x4_t val)
85
+ {
86
+ const float32x4_t pi_v = vdupq_n_f32 (M_PI);
87
+ const float32x4_t pio2_v = vdupq_n_f32 (M_PI / 2 );
88
+ const float32x4_t ipi_v = vdupq_n_f32 (1 / M_PI);
89
+
90
+ // Find positive or negative
91
+ const int32x4_t c_v = vabsq_s32 (vcvtq_s32_f32 (vmulq_f32 (val, ipi_v)));
92
+ const uint32x4_t sign_v = vcleq_f32 (val, vdupq_n_f32 (0 ));
93
+ const uint32x4_t odd_v = vandq_u32 (vreinterpretq_u32_s32 (c_v), vdupq_n_u32 (1 ));
94
+
95
+ uint32x4_t neg_v = veorq_u32 (odd_v, sign_v);
96
+
97
+ // Modulus a - (n * int(a*(1/n)))
98
+ float32x4_t ma = vsubq_f32 (vabsq_f32 (val), vmulq_f32 (pi_v, vcvtq_f32_s32 (c_v)));
99
+ const uint32x4_t reb_v = vcgeq_f32 (ma, pio2_v);
100
+
101
+ // Rebase a between 0 and pi/2
102
+ ma = vbslq_f32 (reb_v, vsubq_f32 (pi_v, ma), ma);
103
+
104
+ // Taylor series
105
+ const float32x4_t ma2 = vmulq_f32 (ma, ma);
106
+
107
+ // 2nd elem: x^3 / 3!
108
+ float32x4_t elem = vmulq_f32 (vmulq_f32 (ma, ma2), vdupq_n_f32 (te_sin_coeff2));
109
+ float32x4_t res = vsubq_f32 (ma, elem);
110
+
111
+ // 3rd elem: x^5 / 5!
112
+ elem = vmulq_f32 (vmulq_f32 (elem, ma2), vdupq_n_f32 (te_sin_coeff3));
113
+ res = vaddq_f32 (res, elem);
114
+
115
+ // 4th elem: x^7 / 7!float32x2_t vsin_f32(float32x2_t val)
116
+ elem = vmulq_f32 (vmulq_f32 (elem, ma2), vdupq_n_f32 (te_sin_coeff4));
117
+ res = vsubq_f32 (res, elem);
118
+
119
+ // 5th elem: x^9 / 9!
120
+ elem = vmulq_f32 (vmulq_f32 (elem, ma2), vdupq_n_f32 (te_sin_coeff5));
121
+ res = vaddq_f32 (res, elem);
122
+
123
+ // Change of sign
124
+ neg_v = vshlq_n_u32 (neg_v, 31 );
125
+ res = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (res), neg_v));
126
+ return res;
127
+ }
128
+
74
129
__attribute__ ((always_inline))
75
130
static inline float32x4_t FastSin(const float32x4_t v) {
76
131
constexpr float A = 4 .0f /(M_PI*M_PI);
@@ -87,7 +142,7 @@ static inline float32x4_t FastSin(const float32x4_t v) {
87
142
__attribute__ ((always_inline))
88
143
static inline float32x4_t Sinc(const float32x4_t v) {
89
144
const float32x4_t zeros = vdupq_n_f32 (0 );
90
- const float32x4_t ones = vdupq_n_f32 (0 );
145
+ const float32x4_t ones = vdupq_n_f32 (1 );
91
146
uint32x4_t mask = vceqq_f32 (v, zeros);
92
147
// if < 0 then set to 1
93
148
float32x4_t x = vbslq_f32 (mask, ones, v);
@@ -104,20 +159,22 @@ static inline float32x4_t LanczosWindow(const float32x4_t v, const float a) {
104
159
const float32x4_t zeros = vdupq_n_f32 (0 );
105
160
uint32x4_t mask = vcltq_f32 (vabsq_f32 (v), fullLength);
106
161
float32x4_t rv = vmulq_n_f32 (v, M_PI);
107
- float32x4_t x = vmulq_f32 (Sinc (rv), Sinc (vmulq_f32 (v , invLength)));
162
+ float32x4_t x = vmulq_f32 (Sinc (rv), Sinc (vmulq_f32 (rv , invLength)));
108
163
x = vbslq_f32 (mask, x, zeros);
109
164
return x;
110
165
}
111
166
112
167
__attribute__ ((always_inline))
113
168
static inline float32x4_t HannWindow(const float32x4_t d, const float length) {
114
- const float32x4_t fullLength = vrecpeq_f32 (vdupq_n_f32 (length));
115
- const float32x4_t halfLength = vdupq_n_f32 (length / 2 );
169
+ const float doublePi = 2 * M_PI;
170
+ const float32x4_t wndSize = vdupq_n_f32 (length * 2 - 1 );
171
+ const float32x4_t rWndSize = vrecpeq_f32 (wndSize);
172
+ uint32x4_t mask = vcgtq_f32 (vabsq_f32 (d), wndSize);
173
+ const float32x4_t ones = vdupq_n_f32 (1 );
174
+ float32x4_t cx = vmulq_n_f32 (vsubq_f32 (ones, Cos (vmulq_f32 (vmulq_n_f32 (d, doublePi), rWndSize))), 0 .5f );
116
175
const float32x4_t zeros = vdupq_n_f32 (0 );
117
- uint32x4_t mask = vcltq_f32 (vabsq_f32 (d), halfLength);
118
- float32x4_t cx = Cos (vmulq_f32 (vmulq_n_f32 (d, M_PI), fullLength));
119
- cx = vmulq_f32 (vmulq_f32 (cx, cx), fullLength);
120
- return vbslq_f32 (mask, zeros, cx);
176
+ cx = vbslq_f32 (mask, zeros, cx);
177
+ return cx;
121
178
}
122
179
123
180
__attribute__ ((always_inline))
@@ -220,9 +277,6 @@ T MitchellNetravali(T d, T p0, T p1, T p2, T p3);
220
277
template <typename T>
221
278
T sinc (T x);
222
279
223
- template <typename T>
224
- T LanczosWindow (T x, const T a);
225
-
226
280
template <typename T>
227
281
T HannWindow (T x, const T length);
228
282
0 commit comments