diff --git a/Changelog.md b/Changelog.md index 7232379b..1c364f60 100644 --- a/Changelog.md +++ b/Changelog.md @@ -17,8 +17,13 @@ ### Bug Fixes +- [#781]: Fix conditions to start CDATA section. Only uppercase `(&self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> { - for i in memchr::memchr_iter(b'>', chunk) { - match self { - // Need to read at least 6 symbols (`!---->`) for properly finished comment - // - XML comment - // 012345 - i - Self::Comment if buf.len() + i > 4 => { - if chunk[..i].ends_with(b"--") { - // We cannot strip last `--` from the buffer because we need it in case of - // check_comments enabled option. XML standard requires that comment - // will not end with `--->` sequence because this is a special case of - // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `-|->` was splitted at | - // buf --/ \-- chunk - if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { - return Some((&chunk[..i], i + 1)); // +1 for `>` - } - // End sequence `--|>` was splitted at | - // buf --/ \-- chunk - if i == 0 && buf.ends_with(b"--") { - return Some((&[], i + 1)); // +1 for `>` + match self { + Self::Comment => { + for i in memchr::memchr_iter(b'>', chunk) { + // Need to read at least 6 symbols (`!---->`) for properly finished comment + // - XML comment + // 012345 - i + if buf.len() + i > 4 { + if chunk[..i].ends_with(b"--") { + // We cannot strip last `--` from the buffer because we need it in case of + // check_comments enabled option. XML standard requires that comment + // will not end with `--->` sequence because this is a special case of + // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments) + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `-|->` was splitted at | + // buf --/ \-- chunk + if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' { + return Some((&chunk[..i], i + 1)); // +1 for `>` + } + // End sequence `--|>` was splitted at | + // buf --/ \-- chunk + if i == 0 && buf.ends_with(b"--") { + return Some((&[], i + 1)); // +1 for `>` + } } } - Self::Comment => {} - Self::CData => { + } + Self::CData => { + for i in memchr::memchr_iter(b'>', chunk) { if chunk[..i].ends_with(b"]]") { return Some((&chunk[..i], i + 1)); // +1 for `>` } @@ -1014,7 +1017,9 @@ impl BangType { return Some((&[], i + 1)); // +1 for `>` } } - Self::DocType => { + } + Self::DocType => { + for i in memchr::memchr_iter(b'>', chunk) { let content = &chunk[..i]; let balance = memchr::memchr2_iter(b'<', b'>', content) .map(|p| if content[p] == b'<' { 1i32 } else { -1 }) diff --git a/src/reader/state.rs b/src/reader/state.rs index 2f3e8512..0119022d 100644 --- a/src/reader/state.rs +++ b/src/reader/state.rs @@ -128,7 +128,11 @@ impl ReaderState { self.decoder(), ))) } - BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { + // XML requires uppercase only: + // https://www.w3.org/TR/xml11/#sec-cdata-sect + // Even HTML5 required uppercase only: + // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state + BangType::CData if buf.starts_with(b"![CDATA[") => { debug_assert!(buf.ends_with(b"]]")); Ok(Event::CData(BytesCData::wrap( // Cut of `![CDATA[` and `]]` from start and end @@ -136,6 +140,10 @@ impl ReaderState { self.decoder(), ))) } + // XML requires uppercase only, but we will check that on validation stage: + // https://www.w3.org/TR/xml11/#sec-prolog-dtd + // HTML5 allows mixed case for doctype declarations: + // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { match buf[8..].iter().position(|&b| !is_whitespace(b)) { Some(start) => Ok(Event::DocType(BytesText::wrap( diff --git a/tests/fuzzing.rs b/tests/fuzzing.rs index deca382a..2740763c 100644 --- a/tests/fuzzing.rs +++ b/tests/fuzzing.rs @@ -51,7 +51,7 @@ fn fuzz_101() { #[test] fn fuzz_empty_doctype() { - let data: &[u8] = b""; + let data: &[u8] = b""; let mut reader = Reader::from_reader(data); let mut buf = Vec::new(); assert!(matches!( diff --git a/tests/reader-config.rs b/tests/reader-config.rs index bd2465e3..781bca8c 100644 --- a/tests/reader-config.rs +++ b/tests/reader-config.rs @@ -471,7 +471,7 @@ mod trim_markup_names_in_closing_tags { } const XML: &str = " \t\r\n\ - \t\r\n\ + \t\r\n\ \t\r\n\ \t\r\n\ text \t\r\n\ diff --git a/tests/reader-errors.rs b/tests/reader-errors.rs index 18c08a23..34880c15 100644 --- a/tests/reader-errors.rs +++ b/tests/reader-errors.rs @@ -343,6 +343,8 @@ mod syntax { err!(unclosed24(" SyntaxError::UnclosedCData); err!(unclosed25("") => SyntaxError::UnclosedCData); + err!(lowercase("") => SyntaxError::UnclosedCData); + ok!(normal1("") => 12: Event::CData(BytesCData::new(""))); ok!(normal2("rest") => 12: Event::CData(BytesCData::new(""))); }