|
| 1 | +use crate::shared::float::*; |
| 2 | +use crate::shared::int::*; |
| 3 | +use core::f32::consts::FRAC_PI_2; |
| 4 | +use core::ops::Range; |
| 5 | +use core::simd::*; |
| 6 | +use rand::rngs::ThreadRng; |
| 7 | +use rand::{thread_rng, Rng, RngCore}; |
| 8 | +// #[cfg(target_arch = "x86")] |
| 9 | +// #[allow(unused_imports)] |
| 10 | +// use core::arch::x86::*; |
| 11 | +// #[cfg(target_arch = "x86_64")] |
| 12 | +// #[allow(unused_imports)] |
| 13 | +// use core::arch::x86_64::*; |
| 14 | + |
| 15 | +const ITERS: usize = 1 << 20; |
| 16 | + |
/// Compares the scalar `sin_fast_approx` against `sin` on random inputs.
///
/// Draws `ITERS` samples from the half-open range `-FRAC_PI_2..FRAC_PI_2` and, for
/// each precision level 0 through 3, asserts that the absolute difference from the
/// exact sine does not exceed that level's tolerance.
#[inline(never)]
#[test]
pub fn scalar_error() {
    const RANGE: Range<f32> = -FRAC_PI_2..FRAC_PI_2;
    // Largest tolerated absolute error, indexed by precision level.
    const MAX_ERRORS: [f32; 4] = [2.9e-2_f32, 6.0e-4_f32, 6.9e-6_f32, 2.7e-7_f32];

    let mut rng = thread_rng();
    for _ in 0..ITERS {
        let x = rng.gen_range(RANGE);

        // NOTE(review): `sin_fast_approx` is an unsafe call whose contract is not
        // visible here; presumably it requires the input to lie within RANGE — confirm.
        let approximations = unsafe {
            [
                x.sin_fast_approx::<0>(),
                x.sin_fast_approx::<1>(),
                x.sin_fast_approx::<2>(),
                x.sin_fast_approx::<3>(),
            ]
        };
        let exact = x.sin();

        // Pair every approximation with the tolerance of its precision level.
        for (&approx, &max_error) in approximations.iter().zip(MAX_ERRORS.iter()) {
            assert!(
                (exact - approx).abs() <= max_error,
                "Error greater than set maximum: true: {exact}, approx: {approx}, x: {x}"
            );
        }
    }
}
| 54 | + |
/// Compares the SIMD `sin_fast_approx` against per-lane `sin` on random inputs.
///
/// For lane counts 2, 4, 8 and 16, runs `ITERS` rounds in which every lane is drawn
/// from the half-open range `-FRAC_PI_2..FRAC_PI_2`. For each precision level 0
/// through 3, every lane's absolute difference from the exact sine must not exceed
/// that level's tolerance.
#[inline(never)]
#[test]
pub fn simd_error() {
    const RANGE: Range<f32> = -FRAC_PI_2..FRAC_PI_2;
    const MAX_ERROR_0: f32 = 2.9e-2_f32;
    const MAX_ERROR_1: f32 = 6.0e-4_f32;
    const MAX_ERROR_2: f32 = 6.9e-6_f32;
    const MAX_ERROR_3: f32 = 2.7e-7_f32;

    let rng = &mut thread_rng();

    test::<2>(rng);
    test::<4>(rng);
    test::<8>(rng);
    test::<16>(rng);

    /// Runs the accuracy check for one lane count.
    #[inline(always)]
    fn test<const LANES: usize>(rng: &mut ThreadRng)
    where
        LaneCount<LANES>: SupportedLaneCount,
    {
        for _i in 0..ITERS {
            // Build the vector from a fully initialized array instead of writing
            // lanes through a pointer into uninitialized memory.
            let x: Simd<f32, LANES> =
                Simd::from_array(core::array::from_fn(|_| rng.gen_range(RANGE)));

            // NOTE(review): `sin_fast_approx` is an unsafe call whose contract is not
            // visible here; presumably it requires every lane to lie within RANGE — confirm.
            let approximations = unsafe {
                [
                    (x.sin_fast_approx::<0>(), MAX_ERROR_0),
                    (x.sin_fast_approx::<1>(), MAX_ERROR_1),
                    (x.sin_fast_approx::<2>(), MAX_ERROR_2),
                    (x.sin_fast_approx::<3>(), MAX_ERROR_3),
                ]
            };

            // Reference result: the scalar sine applied lane by lane.
            let exact: Simd<f32, LANES> = Simd::from_array(x.to_array().map(|lane| lane.sin()));

            for &(approx, max_error) in approximations.iter() {
                assert!(
                    (exact - approx)
                        .abs()
                        .simd_le(Simd::splat(max_error))
                        .all(),
                    "Error greater than set maximum: true: {:?}, approx: {:?}, x: {:?}",
                    exact,
                    approx,
                    x
                );
            }
        }
    }
}
| 149 | + |
/// Compares the SIMD `ilog_const_base_unchecked::<3>` against per-lane `u32::ilog(3)`.
///
/// For lane counts 2, 4, 8 and 16, runs `ITERS` rounds on random non-zero `u32`
/// lanes and requires the fast result to equal the exact one in every lane.
#[inline(never)]
#[test]
pub fn simd_ilog_error() {
    let rng = &mut thread_rng();

    test::<2>(rng);
    test::<4>(rng);
    test::<8>(rng);
    test::<16>(rng);

    /// Runs the exactness check for one lane count.
    #[inline(always)]
    fn test<const LANES: usize>(rng: &mut ThreadRng)
    where
        LaneCount<LANES>: SupportedLaneCount,
    {
        for _i in 0..ITERS {
            // `u32::ilog` panics on zero, so a raw `next_u32()` sample would make the
            // reference computation fail spuriously; clamp every lane to at least 1.
            let x: Simd<u32, LANES> =
                Simd::from_array(core::array::from_fn(|_| rng.next_u32().max(1)));

            // NOTE(review): assumes the unchecked variant only requires non-zero
            // lanes — confirm against `ilog_const_base_unchecked`'s contract.
            let fast = unsafe { x.ilog_const_base_unchecked::<3>() };

            // Reference result: the standard integer logarithm applied lane by lane.
            let exact: Simd<u32, LANES> =
                Simd::from_array(x.to_array().map(|lane| lane.ilog(3)));

            assert!(
                exact.simd_eq(fast).all(),
                "Fast ilog differs from exact result: true: {:?}, approx: {:?}, x: {:?}",
                exact,
                fast,
                x
            );
        }
    }
}
| 202 | + |
| 203 | +// /// Options: |
| 204 | +// /// --cfg print_values |
| 205 | +// /// --cfg print_error |
| 206 | +// /// --cfg print_cycles |
| 207 | +// #[allow(dead_code)] |
| 208 | +// pub fn main() { |
| 209 | +// const STEPS: usize = 1000; //1 << 24; |
| 210 | +// const WARMUP_ITRS: usize = 1 << 24; |
| 211 | +// const START: f32 = 0.0; |
| 212 | +// const END: f32 = FRAC_PI_2; |
| 213 | +// |
| 214 | +// const ITRS: usize = STEPS / LANES; |
| 215 | +// const SLICE: f32 = (END - START) / (STEPS as f32); |
| 216 | +// const INCR: Simd<f32, LANES> = Simd::from_array([SLICE * LANES as f32; LANES]); |
| 217 | +// |
| 218 | +// println!("Count: {STEPS}"); |
| 219 | +// |
| 220 | +// #[allow(unused_mut)] |
| 221 | +// let mut vec = Simd::<f32, LANES>::splat(SLICE).mul_add( |
| 222 | +// Simd::from_slice(&(0..LANES).collect::<Box<[usize]>>()).cast::<f32>(), |
| 223 | +// Simd::splat(START), |
| 224 | +// ); |
| 225 | +// |
| 226 | +// if cfg!(print_cycles) { |
| 227 | +// if cfg!(any(target_arch = "x86", target_arch = "x86_64")) { |
| 228 | +// for _i in 0..WARMUP_ITRS { |
| 229 | +// unsafe { |
| 230 | +// black_box(wrap_auto_vectorize!( |
| 231 | +// sin_fast_approx::<PRECISION, COS>, |
| 232 | +// LANES, |
| 233 | +// black_box(vec) |
| 234 | +// )); |
| 235 | +// } |
| 236 | +// } |
| 237 | +// } else { |
| 238 | +// panic!("CPU cycle timings are not supported on this platform"); |
| 239 | +// } |
| 240 | +// } |
| 241 | +// |
| 242 | +// #[allow(unused_variables)] |
| 243 | +// let mut total_error = 0.0_f64; |
| 244 | +// let mut max_error = 0.0_f64; |
| 245 | +// #[allow(unused_variables)] |
| 246 | +// let mut built_string: String; |
| 247 | +// #[cfg(print_values)] |
| 248 | +// { |
| 249 | +// built_string = String::with_capacity(STEPS * 16); |
| 250 | +// } |
| 251 | +// #[allow(unused_variables, unused_mut)] |
| 252 | +// let mut cycles_1: u64; |
| 253 | +// #[cfg(all(print_cycles, any(target_arch = "x86", target_arch = "x86_64")))] |
| 254 | +// unsafe { |
| 255 | +// let mut _unused = 0_u32; |
| 256 | +// cycles_1 = __rdtscp(&mut _unused); |
| 257 | +// } |
| 258 | +// |
| 259 | +// for _i in 0..ITRS { |
| 260 | +// let result = unsafe { |
| 261 | +// black_box(wrap_auto_vectorize!( |
| 262 | +// sin_fast_approx::<PRECISION, COS>, |
| 263 | +// LANES, |
| 264 | +// black_box(vec) |
| 265 | +// )) |
| 266 | +// }; |
| 267 | +// |
| 268 | +// if cfg!(print_error) { |
| 269 | +// let mut array: [f32; LANES] = [0.0; LANES]; |
| 270 | +// |
| 271 | +// for i in 0..LANES { |
| 272 | +// array[i] = if COS { vec[i].cos() } else { vec[i].sin() }; |
| 273 | +// } |
| 274 | +// |
| 275 | +// let true_result = Simd::from_array(array); |
| 276 | +// |
| 277 | +// // the range of sin and cos are between -1 and 1 |
| 278 | +// let distance = (result.cast::<f64>() - true_result.cast::<f64>()).abs(); |
| 279 | +// let distance_epsilons = distance / Simd::splat(f32::EPSILON as f64); |
| 280 | +// total_error += distance_epsilons.reduce_sum(); |
| 281 | +// max_error = max_error.max(distance_epsilons.reduce_max()); |
| 282 | +// |
| 283 | +// #[cfg(print_values)] |
| 284 | +// { |
| 285 | +// for i in 0..LANES { |
| 286 | +// built_string.push_str(&format!( |
| 287 | +// "{:?} {:?} {:?} {:.3}\n", |
| 288 | +// vec[i], result[i], true_result[i], distance_epsilons[i] |
| 289 | +// )); |
| 290 | +// } |
| 291 | +// } |
| 292 | +// } else if cfg!(print_values) { |
| 293 | +// #[cfg(print_values)] |
| 294 | +// { |
| 295 | +// for i in 0..LANES { |
| 296 | +// built_string.push_str(&format!("{:?} {:?}\n", vec[i], result[i])); |
| 297 | +// } |
| 298 | +// } |
| 299 | +// } |
| 300 | +// |
| 301 | +// #[cfg(any(print_values, print_error))] |
| 302 | +// { |
| 303 | +// vec += INCR; |
| 304 | +// } |
| 305 | +// } |
| 306 | +// #[cfg(all(print_cycles, any(target_arch = "x86", target_arch = "x86_64")))] |
| 307 | +// unsafe { |
| 308 | +// let mut _unused = 0_u32; |
| 309 | +// let cycles_2 = __rdtscp(&mut _unused); |
| 310 | +// |
| 311 | +// let cycles_total = cycles_2 - cycles_1; |
| 312 | +// let per_iter_cycles = cycles_total as f64 / (ITRS as f64); |
| 313 | +// let per_op_cycles = cycles_total as f64 / (STEPS as f64); |
| 314 | +// println!("Avg Cycles Per Iter: {per_iter_cycles}\nAvg Cycles Per Op: {per_op_cycles}"); |
| 315 | +// } |
| 316 | +// |
| 317 | +// #[cfg(print_error)] |
| 318 | +// { |
| 319 | +// let per_op_error = total_error / (STEPS as f64); |
| 320 | +// println!("Avg Error Per Op (epsilons): {per_op_error}\nMax Error (epsilons): {max_error}") |
| 321 | +// } |
| 322 | +// |
| 323 | +// #[cfg(print_values)] |
| 324 | +// { |
| 325 | +// println!("Values:\n{built_string}"); |
| 326 | +// } |
| 327 | +// } |
0 commit comments