|
| 1 | +//! Operations on ASCII `[u8]`. |
| 2 | +
|
| 3 | +use crate::mem; |
| 4 | + |
| 5 | +#[lang = "slice_u8"] |
| 6 | +#[cfg(not(test))] |
| 7 | +impl [u8] { |
| 8 | + /// Checks if all bytes in this slice are within the ASCII range. |
| 9 | + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
| 10 | + #[inline] |
| 11 | + pub fn is_ascii(&self) -> bool { |
| 12 | + is_ascii(self) |
| 13 | + } |
| 14 | + |
| 15 | + /// Checks that two slices are an ASCII case-insensitive match. |
| 16 | + /// |
| 17 | + /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`, |
| 18 | + /// but without allocating and copying temporaries. |
| 19 | + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
| 20 | + #[inline] |
| 21 | + pub fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool { |
| 22 | + self.len() == other.len() && self.iter().zip(other).all(|(a, b)| a.eq_ignore_ascii_case(b)) |
| 23 | + } |
| 24 | + |
| 25 | + /// Converts this slice to its ASCII upper case equivalent in-place. |
| 26 | + /// |
| 27 | + /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z', |
| 28 | + /// but non-ASCII letters are unchanged. |
| 29 | + /// |
| 30 | + /// To return a new uppercased value without modifying the existing one, use |
| 31 | + /// [`to_ascii_uppercase`]. |
| 32 | + /// |
| 33 | + /// [`to_ascii_uppercase`]: #method.to_ascii_uppercase |
| 34 | + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
| 35 | + #[inline] |
| 36 | + pub fn make_ascii_uppercase(&mut self) { |
| 37 | + for byte in self { |
| 38 | + byte.make_ascii_uppercase(); |
| 39 | + } |
| 40 | + } |
| 41 | + |
| 42 | + /// Converts this slice to its ASCII lower case equivalent in-place. |
| 43 | + /// |
| 44 | + /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z', |
| 45 | + /// but non-ASCII letters are unchanged. |
| 46 | + /// |
| 47 | + /// To return a new lowercased value without modifying the existing one, use |
| 48 | + /// [`to_ascii_lowercase`]. |
| 49 | + /// |
| 50 | + /// [`to_ascii_lowercase`]: #method.to_ascii_lowercase |
| 51 | + #[stable(feature = "ascii_methods_on_intrinsics", since = "1.23.0")] |
| 52 | + #[inline] |
| 53 | + pub fn make_ascii_lowercase(&mut self) { |
| 54 | + for byte in self { |
| 55 | + byte.make_ascii_lowercase(); |
| 56 | + } |
| 57 | + } |
| 58 | +} |
| 59 | + |
/// Returns `true` if any byte in the word `v` is non-ASCII (>= 128).
///
/// Borrowed from `../str/mod.rs`, which does something similar for UTF-8
/// validation.
///
/// A byte is non-ASCII exactly when its top bit is set, so the whole word is
/// tested at once against a mask that has `0x80` in every byte.
#[inline]
fn contains_nonascii(v: usize) -> bool {
    // `usize::MAX / 0xFF` is `0x01` repeated in every byte of a `usize`;
    // multiplying by `0x80` moves that bit to the top of each byte. Deriving
    // the mask this way keeps it correct for any pointer width, instead of
    // relying on a 64-bit literal narrowed with a truncating `as` cast.
    const NONASCII_MASK: usize = usize::MAX / 0xFF * 0x80;
    (NONASCII_MASK & v) != 0
}
| 67 | + |
| 68 | +/// Optimized ASCII test that will use usize-at-a-time operations instead of |
| 69 | +/// byte-at-a-time operations (when possible). |
| 70 | +/// |
| 71 | +/// The algorithm we use here is pretty simple. If `s` is too short, we just |
| 72 | +/// check each byte and be done with it. Otherwise: |
| 73 | +/// |
| 74 | +/// - Read the first word with an unaligned load. |
| 75 | +/// - Align the pointer, read subsequent words until end with aligned loads. |
| 76 | +/// - Read the last `usize` from `s` with an unaligned load. |
| 77 | +/// |
| 78 | +/// If any of these loads produces something for which `contains_nonascii` |
| 79 | +/// (above) returns true, then we know the answer is false. |
| 80 | +#[inline] |
| 81 | +fn is_ascii(s: &[u8]) -> bool { |
| 82 | + const USIZE_SIZE: usize = mem::size_of::<usize>(); |
| 83 | + |
| 84 | + let len = s.len(); |
| 85 | + let align_offset = s.as_ptr().align_offset(USIZE_SIZE); |
| 86 | + |
| 87 | + // If we wouldn't gain anything from the word-at-a-time implementation, fall |
| 88 | + // back to a scalar loop. |
| 89 | + // |
| 90 | + // We also do this for architectures where `size_of::<usize>()` isn't |
| 91 | + // sufficient alignment for `usize`, because it's a weird edge case. |
| 92 | + if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() { |
| 93 | + return s.iter().all(|b| b.is_ascii()); |
| 94 | + } |
| 95 | + |
| 96 | + // We always read the first word unaligned, which means `align_offset` is |
| 97 | + // 0, we'd read the same value again for the aligned read. |
| 98 | + let offset_to_aligned = if align_offset == 0 { USIZE_SIZE } else { align_offset }; |
| 99 | + |
| 100 | + let start = s.as_ptr(); |
| 101 | + // SAFETY: We verify `len < USIZE_SIZE` above. |
| 102 | + let first_word = unsafe { (start as *const usize).read_unaligned() }; |
| 103 | + |
| 104 | + if contains_nonascii(first_word) { |
| 105 | + return false; |
| 106 | + } |
| 107 | + // We checked this above, somewhat implicitly. Note that `offset_to_aligned` |
| 108 | + // is either `align_offset` or `USIZE_SIZE`, both of are explicitly checked |
| 109 | + // above. |
| 110 | + debug_assert!(offset_to_aligned <= len); |
| 111 | + |
| 112 | + // SAFETY: word_ptr is the (properly aligned) usize ptr we use to read the |
| 113 | + // middle chunk of the slice. |
| 114 | + let mut word_ptr = unsafe { start.add(offset_to_aligned) as *const usize }; |
| 115 | + |
| 116 | + // `byte_pos` is the byte index of `word_ptr`, used for loop end checks. |
| 117 | + let mut byte_pos = offset_to_aligned; |
| 118 | + |
| 119 | + // Paranoia check about alignment, since we're about to do a bunch of |
| 120 | + // unaligned loads. In practice this should be impossible barring a bug in |
| 121 | + // `align_offset` though. |
| 122 | + debug_assert_eq!((word_ptr as usize) % mem::align_of::<usize>(), 0); |
| 123 | + |
| 124 | + // Read subsequent words until the last aligned word, excluding the last |
| 125 | + // aligned word by itself to be done in tail check later, to ensure that |
| 126 | + // tail is always one `usize` at most to extra branch `byte_pos == len`. |
| 127 | + while byte_pos < len - USIZE_SIZE { |
| 128 | + debug_assert!( |
| 129 | + // Sanity check that the read is in bounds |
| 130 | + (word_ptr as usize + USIZE_SIZE) <= (start.wrapping_add(len) as usize) && |
| 131 | + // And that our assumptions about `byte_pos` hold. |
| 132 | + (word_ptr as usize) - (start as usize) == byte_pos |
| 133 | + ); |
| 134 | + |
| 135 | + // SAFETY: We know `word_ptr` is properly aligned (because of |
| 136 | + // `align_offset`), and we know that we have enough bytes between `word_ptr` and the end |
| 137 | + let word = unsafe { word_ptr.read() }; |
| 138 | + if contains_nonascii(word) { |
| 139 | + return false; |
| 140 | + } |
| 141 | + |
| 142 | + byte_pos += USIZE_SIZE; |
| 143 | + // SAFETY: We know that `byte_pos <= len - USIZE_SIZE`, which means that |
| 144 | + // after this `add`, `word_ptr` will be at most one-past-the-end. |
| 145 | + word_ptr = unsafe { word_ptr.add(1) }; |
| 146 | + } |
| 147 | + |
| 148 | + // Sanity check to ensure there really is only one `usize` left. This should |
| 149 | + // be guaranteed by our loop condition. |
| 150 | + debug_assert!(byte_pos <= len && len - byte_pos <= USIZE_SIZE); |
| 151 | + |
| 152 | + // SAFETY: This relies on `len >= USIZE_SIZE`, which we check at the start. |
| 153 | + let last_word = unsafe { (start.add(len - USIZE_SIZE) as *const usize).read_unaligned() }; |
| 154 | + |
| 155 | + !contains_nonascii(last_word) |
| 156 | +} |
0 commit comments