|
11 | 11 | //! UTF-8 and UTF-16 decoding iterators
|
12 | 12 |
|
13 | 13 | use fmt;
|
14 |
| -use iter::FusedIterator; |
15 | 14 | use super::from_u32_unchecked;
|
16 | 15 |
|
17 |
| -/// An iterator over an iterator of bytes of the characters the bytes represent |
18 |
| -/// as UTF-8 |
19 |
| -#[unstable(feature = "decode_utf8", issue = "33906")] |
20 |
| -#[rustc_deprecated(since = "1.27.0", reason = "Use str::from_utf8 instead: |
21 |
| - https://doc.rust-lang.org/nightly/std/str/struct.Utf8Error.html#examples")] |
22 |
| -#[derive(Clone, Debug)] |
23 |
| -#[allow(deprecated)] |
24 |
| -pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>); |
25 |
| - |
26 |
| -/// Decodes an `Iterator` of bytes as UTF-8. |
27 |
| -#[unstable(feature = "decode_utf8", issue = "33906")] |
28 |
| -#[rustc_deprecated(since = "1.27.0", reason = "Use str::from_utf8 instead: |
29 |
| - https://doc.rust-lang.org/nightly/std/str/struct.Utf8Error.html#examples")] |
30 |
| -#[allow(deprecated)] |
31 |
| -#[inline] |
32 |
| -pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> { |
33 |
| - DecodeUtf8(i.into_iter().peekable()) |
34 |
| -} |
35 |
| - |
36 |
| -/// `<DecodeUtf8 as Iterator>::next` returns this for an invalid input sequence. |
37 |
| -#[unstable(feature = "decode_utf8", issue = "33906")] |
38 |
| -#[rustc_deprecated(since = "1.27.0", reason = "Use str::from_utf8 instead: |
39 |
| - https://doc.rust-lang.org/nightly/std/str/struct.Utf8Error.html#examples")] |
40 |
| -#[derive(PartialEq, Eq, Debug)] |
41 |
| -#[allow(deprecated)] |
42 |
| -pub struct InvalidSequence(()); |
43 |
| - |
44 |
| -#[unstable(feature = "decode_utf8", issue = "33906")] |
45 |
| -#[allow(deprecated)] |
46 |
| -impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> { |
47 |
| - type Item = Result<char, InvalidSequence>; |
48 |
| - #[inline] |
49 |
| - |
50 |
| - fn next(&mut self) -> Option<Result<char, InvalidSequence>> { |
51 |
| - self.0.next().map(|first_byte| { |
52 |
| - // Emit InvalidSequence according to |
53 |
| - // Unicode §5.22 Best Practice for U+FFFD Substitution |
54 |
| - // http://www.unicode.org/versions/Unicode9.0.0/ch05.pdf#G40630 |
55 |
| - |
56 |
| - // Roughly: consume at least one byte, |
57 |
| - // then validate one byte at a time and stop before the first unexpected byte |
58 |
| - // (which might be the valid start of the next byte sequence). |
59 |
| - |
60 |
| - let mut code_point; |
61 |
| - macro_rules! first_byte { |
62 |
| - ($mask: expr) => { |
63 |
| - code_point = u32::from(first_byte & $mask) |
64 |
| - } |
65 |
| - } |
66 |
| - macro_rules! continuation_byte { |
67 |
| - () => { continuation_byte!(0x80..=0xBF) }; |
68 |
| - ($range: pat) => { |
69 |
| - match self.0.peek() { |
70 |
| - Some(&byte @ $range) => { |
71 |
| - code_point = (code_point << 6) | u32::from(byte & 0b0011_1111); |
72 |
| - self.0.next(); |
73 |
| - } |
74 |
| - _ => return Err(InvalidSequence(())) |
75 |
| - } |
76 |
| - } |
77 |
| - } |
78 |
| - |
79 |
| - match first_byte { |
80 |
| - 0x00..=0x7F => { |
81 |
| - first_byte!(0b1111_1111); |
82 |
| - } |
83 |
| - 0xC2..=0xDF => { |
84 |
| - first_byte!(0b0001_1111); |
85 |
| - continuation_byte!(); |
86 |
| - } |
87 |
| - 0xE0 => { |
88 |
| - first_byte!(0b0000_1111); |
89 |
| - continuation_byte!(0xA0..=0xBF); // 0x80..=0x9F here are overlong |
90 |
| - continuation_byte!(); |
91 |
| - } |
92 |
| - 0xE1..=0xEC | 0xEE..=0xEF => { |
93 |
| - first_byte!(0b0000_1111); |
94 |
| - continuation_byte!(); |
95 |
| - continuation_byte!(); |
96 |
| - } |
97 |
| - 0xED => { |
98 |
| - first_byte!(0b0000_1111); |
99 |
| - continuation_byte!(0x80..=0x9F); // 0xA0..0xBF here are surrogates |
100 |
| - continuation_byte!(); |
101 |
| - } |
102 |
| - 0xF0 => { |
103 |
| - first_byte!(0b0000_0111); |
104 |
| - continuation_byte!(0x90..=0xBF); // 0x80..0x8F here are overlong |
105 |
| - continuation_byte!(); |
106 |
| - continuation_byte!(); |
107 |
| - } |
108 |
| - 0xF1..=0xF3 => { |
109 |
| - first_byte!(0b0000_0111); |
110 |
| - continuation_byte!(); |
111 |
| - continuation_byte!(); |
112 |
| - continuation_byte!(); |
113 |
| - } |
114 |
| - 0xF4 => { |
115 |
| - first_byte!(0b0000_0111); |
116 |
| - continuation_byte!(0x80..=0x8F); // 0x90..0xBF here are beyond char::MAX |
117 |
| - continuation_byte!(); |
118 |
| - continuation_byte!(); |
119 |
| - } |
120 |
| - _ => return Err(InvalidSequence(())) // Illegal first byte, overlong, or beyond MAX |
121 |
| - } |
122 |
| - unsafe { |
123 |
| - Ok(from_u32_unchecked(code_point)) |
124 |
| - } |
125 |
| - }) |
126 |
| - } |
127 |
| - |
128 |
| - #[inline] |
129 |
| - fn size_hint(&self) -> (usize, Option<usize>) { |
130 |
| - let (lower, upper) = self.0.size_hint(); |
131 |
| - |
132 |
| - // A code point is at most 4 bytes long. |
133 |
| - let min_code_points = lower / 4; |
134 |
| - |
135 |
| - (min_code_points, upper) |
136 |
| - } |
137 |
| -} |
138 |
| - |
139 |
| -#[unstable(feature = "decode_utf8", issue = "33906")] |
140 |
| -#[allow(deprecated)] |
141 |
| -impl<I: FusedIterator<Item = u8>> FusedIterator for DecodeUtf8<I> {} |
142 |
| - |
143 | 16 | /// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
|
144 | 17 | #[stable(feature = "decode_utf16", since = "1.9.0")]
|
145 | 18 | #[derive(Clone, Debug)]
|
|
0 commit comments