Skip to content

Commit 0f96c71

Browse files
committed
Improve the floating point parser in dec2flt.
* Remove all remaining traces of `unsafe`.
* Put `parse_8digits` inside a loop.
* Rework parsing of inf/NaN values.
1 parent 39bf777 commit 0f96c71

File tree

5 files changed

+188
-288
lines changed

5 files changed

+188
-288
lines changed

library/core/src/num/dec2flt/common.rs

+30-148
Original file line numberDiff line numberDiff line change
@@ -1,165 +1,60 @@
11
//! Common utilities, for internal use only.
22
3-
use crate::ptr;
4-
53
/// Helper methods to process immutable bytes.
6-
pub(crate) trait ByteSlice: AsRef<[u8]> {
7-
unsafe fn first_unchecked(&self) -> u8 {
8-
debug_assert!(!self.is_empty());
9-
// SAFETY: safe as long as self is not empty
10-
unsafe { *self.as_ref().get_unchecked(0) }
11-
}
12-
13-
/// Check if the slice contains no elements.
14-
fn is_empty(&self) -> bool {
15-
self.as_ref().is_empty()
16-
}
17-
18-
/// Check if the slice is at least `n` bytes long.
19-
fn check_len(&self, n: usize) -> bool {
20-
n <= self.as_ref().len()
21-
}
22-
23-
/// Check if the first character in the slice is equal to c.
24-
fn first_is(&self, c: u8) -> bool {
25-
self.as_ref().first() == Some(&c)
26-
}
27-
28-
/// Check if the first character in the slice is equal to c1 or c2.
29-
fn first_is2(&self, c1: u8, c2: u8) -> bool {
30-
if let Some(&c) = self.as_ref().first() { c == c1 || c == c2 } else { false }
31-
}
32-
33-
/// Bounds-checked test if the first character in the slice is a digit.
34-
fn first_isdigit(&self) -> bool {
35-
if let Some(&c) = self.as_ref().first() { c.is_ascii_digit() } else { false }
36-
}
37-
38-
/// Check if self starts with u with a case-insensitive comparison.
39-
fn starts_with_ignore_case(&self, u: &[u8]) -> bool {
40-
debug_assert!(self.as_ref().len() >= u.len());
41-
let iter = self.as_ref().iter().zip(u.iter());
42-
let d = iter.fold(0, |i, (&x, &y)| i | (x ^ y));
43-
d == 0 || d == 32
44-
}
45-
46-
/// Get the remaining slice after the first N elements.
47-
fn advance(&self, n: usize) -> &[u8] {
48-
&self.as_ref()[n..]
49-
}
50-
51-
/// Get the slice after skipping all leading characters equal c.
52-
fn skip_chars(&self, c: u8) -> &[u8] {
53-
let mut s = self.as_ref();
54-
while s.first_is(c) {
55-
s = s.advance(1);
56-
}
57-
s
58-
}
59-
60-
/// Get the slice after skipping all leading characters equal c1 or c2.
61-
fn skip_chars2(&self, c1: u8, c2: u8) -> &[u8] {
62-
let mut s = self.as_ref();
63-
while s.first_is2(c1, c2) {
64-
s = s.advance(1);
65-
}
66-
s
67-
}
68-
4+
pub(crate) trait ByteSlice {
695
/// Read 8 bytes as a 64-bit integer in little-endian order.
70-
unsafe fn read_u64_unchecked(&self) -> u64 {
71-
debug_assert!(self.check_len(8));
72-
let src = self.as_ref().as_ptr() as *const u64;
73-
// SAFETY: safe as long as self is at least 8 bytes
74-
u64::from_le(unsafe { ptr::read_unaligned(src) })
75-
}
6+
fn read_u64(&self) -> u64;
767

77-
/// Try to read the next 8 bytes from the slice.
78-
fn read_u64(&self) -> Option<u64> {
79-
if self.check_len(8) {
80-
// SAFETY: self must be at least 8 bytes.
81-
Some(unsafe { self.read_u64_unchecked() })
82-
} else {
83-
None
84-
}
85-
}
86-
87-
/// Calculate the offset of slice from another.
88-
fn offset_from(&self, other: &Self) -> isize {
89-
other.as_ref().len() as isize - self.as_ref().len() as isize
90-
}
91-
}
92-
93-
impl ByteSlice for [u8] {}
94-
95-
/// Helper methods to process mutable bytes.
96-
pub(crate) trait ByteSliceMut: AsMut<[u8]> {
978
/// Write a 64-bit integer as 8 bytes in little-endian order.
98-
unsafe fn write_u64_unchecked(&mut self, value: u64) {
99-
debug_assert!(self.as_mut().len() >= 8);
100-
let dst = self.as_mut().as_mut_ptr() as *mut u64;
101-
// NOTE: we must use `write_unaligned`, since dst is not
102-
// guaranteed to be properly aligned. Miri will warn us
103-
// if we use `write` instead of `write_unaligned`, as expected.
104-
// SAFETY: safe as long as self is at least 8 bytes
105-
unsafe {
106-
ptr::write_unaligned(dst, u64::to_le(value));
107-
}
108-
}
109-
}
9+
fn write_u64(&mut self, value: u64);
11010

111-
impl ByteSliceMut for [u8] {}
11+
/// Calculate the offset of a slice from another.
12+
fn offset_from(&self, other: &Self) -> isize;
11213

113-
/// Bytes wrapper with specialized methods for ASCII characters.
114-
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
115-
pub(crate) struct AsciiStr<'a> {
116-
slc: &'a [u8],
14+
/// Iteratively parse and consume digits from bytes.
15+
/// Returns the same bytes with consumed digits being
16+
/// elided.
17+
fn parse_digits(&self, func: impl FnMut(u8)) -> &Self;
11718
}
11819

119-
impl<'a> AsciiStr<'a> {
120-
pub fn new(slc: &'a [u8]) -> Self {
121-
Self { slc }
20+
impl ByteSlice for [u8] {
21+
#[inline(always)] // inlining this is crucial to remove bound checks
22+
fn read_u64(&self) -> u64 {
23+
let mut tmp = [0; 8];
24+
tmp.copy_from_slice(&self[..8]);
25+
u64::from_le_bytes(tmp)
12226
}
12327

124-
/// Advance the view by n, advancing it in-place to (n..).
125-
pub unsafe fn step_by(&mut self, n: usize) -> &mut Self {
126-
// SAFETY: safe as long n is less than the buffer length
127-
self.slc = unsafe { self.slc.get_unchecked(n..) };
128-
self
28+
#[inline(always)] // inlining this is crucial to remove bound checks
29+
fn write_u64(&mut self, value: u64) {
30+
self[..8].copy_from_slice(&value.to_le_bytes())
12931
}
13032

131-
/// Advance the view by 1, advancing it in-place to (1..).
132-
pub unsafe fn step(&mut self) -> &mut Self {
133-
// SAFETY: safe as long as self is not empty
134-
unsafe { self.step_by(1) }
33+
#[inline]
34+
fn offset_from(&self, other: &Self) -> isize {
35+
other.len() as isize - self.len() as isize
13536
}
13637

137-
/// Iteratively parse and consume digits from bytes.
138-
pub fn parse_digits(&mut self, mut func: impl FnMut(u8)) {
139-
while let Some(&c) = self.as_ref().first() {
38+
#[inline]
39+
fn parse_digits(&self, mut func: impl FnMut(u8)) -> &Self {
40+
let mut s = self;
41+
42+
// FIXME: Can't use s.split_first() here yet,
43+
// see https://github.com/rust-lang/rust/issues/109328
44+
while let [c, s_next @ ..] = s {
14045
let c = c.wrapping_sub(b'0');
14146
if c < 10 {
14247
func(c);
143-
// SAFETY: self cannot be empty
144-
unsafe {
145-
self.step();
146-
}
48+
s = s_next;
14749
} else {
14850
break;
14951
}
15052
}
151-
}
152-
}
15353

154-
impl<'a> AsRef<[u8]> for AsciiStr<'a> {
155-
#[inline]
156-
fn as_ref(&self) -> &[u8] {
157-
self.slc
54+
s
15855
}
15956
}
16057

161-
impl<'a> ByteSlice for AsciiStr<'a> {}
162-
16358
/// Determine if 8 bytes are all decimal digits.
16459
/// This does not care about the order in which the bytes were loaded.
16560
pub(crate) fn is_8digits(v: u64) -> bool {
@@ -168,19 +63,6 @@ pub(crate) fn is_8digits(v: u64) -> bool {
16863
(a | b) & 0x8080_8080_8080_8080 == 0
16964
}
17065

171-
/// Iteratively parse and consume digits from bytes.
172-
pub(crate) fn parse_digits(s: &mut &[u8], mut f: impl FnMut(u8)) {
173-
while let Some(&c) = s.get(0) {
174-
let c = c.wrapping_sub(b'0');
175-
if c < 10 {
176-
f(c);
177-
*s = s.advance(1);
178-
} else {
179-
break;
180-
}
181-
}
182-
}
183-
18466
/// A custom 64-bit floating point type, representing `f * 2^e`.
18567
/// `e` is biased, so it can be directly shifted into the exponent bits.
18668
#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]

library/core/src/num/dec2flt/decimal.rs

+36-29
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
//! algorithm can be found in "ParseNumberF64 by Simple Decimal Conversion",
1010
//! available online: <https://nigeltao.github.io/blog/2020/parse-number-f64-simple.html>.
1111
12-
use crate::num::dec2flt::common::{is_8digits, parse_digits, ByteSlice, ByteSliceMut};
12+
use crate::num::dec2flt::common::{is_8digits, ByteSlice};
1313

1414
#[derive(Clone)]
1515
pub struct Decimal {
@@ -205,29 +205,32 @@ impl Decimal {
205205
pub fn parse_decimal(mut s: &[u8]) -> Decimal {
206206
let mut d = Decimal::default();
207207
let start = s;
208-
s = s.skip_chars(b'0');
209-
parse_digits(&mut s, |digit| d.try_add_digit(digit));
210-
if s.first_is(b'.') {
211-
s = s.advance(1);
208+
209+
while let Some((&b'0', s_next)) = s.split_first() {
210+
s = s_next;
211+
}
212+
213+
s = s.parse_digits(|digit| d.try_add_digit(digit));
214+
215+
if let Some((b'.', s_next)) = s.split_first() {
216+
s = s_next;
212217
let first = s;
213218
// Skip leading zeros.
214219
if d.num_digits == 0 {
215-
s = s.skip_chars(b'0');
220+
while let Some((&b'0', s_next)) = s.split_first() {
221+
s = s_next;
222+
}
216223
}
217224
while s.len() >= 8 && d.num_digits + 8 < Decimal::MAX_DIGITS {
218-
// SAFETY: s is at least 8 bytes.
219-
let v = unsafe { s.read_u64_unchecked() };
225+
let v = s.read_u64();
220226
if !is_8digits(v) {
221227
break;
222228
}
223-
// SAFETY: d.num_digits + 8 is less than d.digits.len()
224-
unsafe {
225-
d.digits[d.num_digits..].write_u64_unchecked(v - 0x3030_3030_3030_3030);
226-
}
229+
d.digits[d.num_digits..].write_u64(v - 0x3030_3030_3030_3030);
227230
d.num_digits += 8;
228-
s = s.advance(8);
231+
s = &s[8..];
229232
}
230-
parse_digits(&mut s, |digit| d.try_add_digit(digit));
233+
s = s.parse_digits(|digit| d.try_add_digit(digit));
231234
d.decimal_point = s.len() as i32 - first.len() as i32;
232235
}
233236
if d.num_digits != 0 {
@@ -248,22 +251,26 @@ pub fn parse_decimal(mut s: &[u8]) -> Decimal {
248251
d.num_digits = Decimal::MAX_DIGITS;
249252
}
250253
}
251-
if s.first_is2(b'e', b'E') {
252-
s = s.advance(1);
253-
let mut neg_exp = false;
254-
if s.first_is(b'-') {
255-
neg_exp = true;
256-
s = s.advance(1);
257-
} else if s.first_is(b'+') {
258-
s = s.advance(1);
259-
}
260-
let mut exp_num = 0_i32;
261-
parse_digits(&mut s, |digit| {
262-
if exp_num < 0x10000 {
263-
exp_num = 10 * exp_num + digit as i32;
254+
if let Some((&ch, s_next)) = s.split_first() {
255+
if ch == b'e' || ch == b'E' {
256+
s = s_next;
257+
let mut neg_exp = false;
258+
if let Some((&ch, s_next)) = s.split_first() {
259+
neg_exp = ch == b'-';
260+
if ch == b'-' || ch == b'+' {
261+
s = s_next;
262+
}
264263
}
265-
});
266-
d.decimal_point += if neg_exp { -exp_num } else { exp_num };
264+
let mut exp_num = 0_i32;
265+
266+
s.parse_digits(|digit| {
267+
if exp_num < 0x10000 {
268+
exp_num = 10 * exp_num + digit as i32;
269+
}
270+
});
271+
272+
d.decimal_point += if neg_exp { -exp_num } else { exp_num };
273+
}
267274
}
268275
for i in d.num_digits..Decimal::MAX_DIGITS_WITHOUT_OVERFLOW {
269276
d.digits[i] = 0;

library/core/src/num/dec2flt/mod.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ use crate::error::Error;
7979
use crate::fmt;
8080
use crate::str::FromStr;
8181

82-
use self::common::{BiasedFp, ByteSlice};
82+
use self::common::BiasedFp;
8383
use self::float::RawFloat;
8484
use self::lemire::compute_float;
8585
use self::parse::{parse_inf_nan, parse_number};
@@ -238,17 +238,18 @@ pub fn dec2flt<F: RawFloat>(s: &str) -> Result<F, ParseFloatError> {
238238
};
239239
let negative = c == b'-';
240240
if c == b'-' || c == b'+' {
241-
s = s.advance(1);
241+
s = &s[1..];
242242
}
243243
if s.is_empty() {
244244
return Err(pfe_invalid());
245245
}
246246

247-
let num = match parse_number(s, negative) {
247+
let mut num = match parse_number(s) {
248248
Some(r) => r,
249249
None if let Some(value) = parse_inf_nan(s, negative) => return Ok(value),
250250
None => return Err(pfe_invalid()),
251251
};
252+
num.negative = negative;
252253
if let Some(value) = num.try_fast_path::<F>() {
253254
return Ok(value);
254255
}

0 commit comments

Comments
 (0)