diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml index f6f3576a..b0a3e711 100644 --- a/html5ever/Cargo.toml +++ b/html5ever/Cargo.toml @@ -13,13 +13,16 @@ readme = "../README.md" rust-version.workspace = true [features] +default = ["encoding"] trace_tokenizer = [] +encoding = ["dep:encoding_rs", "markup5ever/encoding"] [dependencies] log = "0.4" mac = "0.1" markup5ever = { version = "0.16", path = "../markup5ever" } match_token = { workspace = true } +encoding_rs = { version = "0.8", optional = true } [dev-dependencies] criterion = "0.5" diff --git a/html5ever/examples/noop-tokenize.rs b/html5ever/examples/noop-tokenize.rs index a95404df..773b30e2 100644 --- a/html5ever/examples/noop-tokenize.rs +++ b/html5ever/examples/noop-tokenize.rs @@ -15,7 +15,8 @@ use std::cell::RefCell; use std::io; use html5ever::tendril::*; -use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer}; +use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer}; +use markup5ever::buffer_queue::BufferQueue; /// In our case, our sink only contains a tokens vector struct Sink(RefCell>); diff --git a/html5ever/examples/tokenize.rs b/html5ever/examples/tokenize.rs index ba984d8f..f1368604 100644 --- a/html5ever/examples/tokenize.rs +++ b/html5ever/examples/tokenize.rs @@ -13,11 +13,11 @@ use std::cell::Cell; use std::io; use html5ever::tendril::*; -use html5ever::tokenizer::BufferQueue; use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken}; use html5ever::tokenizer::{ ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts, }; +use markup5ever::buffer_queue::BufferQueue; #[derive(Clone)] struct TokenPrinter { diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs index c97780cc..e15bcd2d 100644 --- a/html5ever/src/tokenizer/char_ref/mod.rs +++ b/html5ever/src/tokenizer/char_ref/mod.rs @@ -8,12 +8,12 @@ // except according to those terms. 
use super::{TokenSink, Tokenizer}; -use crate::buffer_queue::BufferQueue; use crate::data; use crate::tendril::StrTendril; use log::debug; use mac::format_if; +use markup5ever::buffer_queue::BufferQueue; use std::borrow::Cow::Borrowed; use std::char::from_u32; diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs index edc6afb9..91b33634 100644 --- a/html5ever/src/tokenizer/interface.rs +++ b/html5ever/src/tokenizer/interface.rs @@ -77,6 +77,8 @@ pub enum TokenSinkResult { Script(Handle), Plaintext, RawData(states::RawKind), + #[cfg(feature = "encoding")] + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), } /// Types which can receive tokens from the tokenizer. diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs index 80b0f6d1..09bc50fd 100644 --- a/html5ever/src/tokenizer/mod.rs +++ b/html5ever/src/tokenizer/mod.rs @@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped}; use self::char_ref::{CharRef, CharRefTokenizer}; use crate::util::str::lower_ascii_letter; - use log::{debug, trace}; use mac::format_if; -use markup5ever::{ns, small_char_set, TokenizerResult}; +use markup5ever::{ + buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult, + TokenizerResult, +}; use std::borrow::Cow::{self, Borrowed}; use std::cell::{Cell, RefCell, RefMut}; use std::collections::BTreeMap; -use std::mem; +use std::{iter, mem}; -pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult}; use crate::tendril::StrTendril; use crate::{Attribute, LocalName, QualName, SmallCharSet}; @@ -43,6 +45,8 @@ pub enum ProcessResult { Continue, Suspend, Script(Handle), + #[cfg(feature = "encoding")] + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), } fn option_push(opt_str: &mut Option, c: char) { @@ -357,6 +361,10 @@ impl Tokenizer { ProcessResult::Continue => (), 
ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), + #[cfg(feature = "encoding")] + ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => { + return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) + }, } } } else { @@ -365,6 +373,10 @@ impl Tokenizer { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(node) => return TokenizerResult::Script(node), + #[cfg(feature = "encoding")] + ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => { + return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) + }, } } } @@ -456,6 +468,10 @@ impl Tokenizer { self.state.set(states::RawData(kind)); ProcessResult::Continue }, + #[cfg(feature = "encoding")] + TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => { + ProcessResult::MaybeChangeEncodingAndStartOver(encoding) + }, } } @@ -1680,6 +1696,8 @@ impl Tokenizer { ProcessResult::Continue => (), ProcessResult::Suspend => break, ProcessResult::Script(_) => unreachable!(), + #[cfg(feature = "encoding")] + ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(), } } @@ -1841,13 +1859,27 @@ impl Tokenizer { } } +impl InputSink for Tokenizer +where + Sink: TokenSink, +{ + type Handle = Sink::Handle; + + fn feed<'a>( + &'a self, + input: &'a BufferQueue, + ) -> impl Iterator> + 'a { + iter::from_fn(|| self.feed(input).into()) + } +} + #[cfg(test)] #[allow(non_snake_case)] mod test { use super::option_push; // private items - use crate::tendril::{SliceExt, StrTendril}; - use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; + use crate::tendril::{SliceExt, StrTendril}; + use crate::LocalName; use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; use super::interface::{EndTag, StartTag, Tag, TagKind}; @@ -1856,8 +1888,6 @@ mod test { use markup5ever::buffer_queue::BufferQueue; use std::cell::RefCell; - use crate::LocalName; - // LinesMatch implements the 
TokenSink trait. It is used for testing to see // if current_line is being updated when process_token is called. The lines // vector is a collection of the line numbers that each token is on. diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs index 52a265a4..3180f854 100644 --- a/html5ever/src/tree_builder/mod.rs +++ b/html5ever/src/tree_builder/mod.rs @@ -396,6 +396,10 @@ where assert!(more_tokens.is_empty()); return tokenizer::TokenSinkResult::RawData(k); }, + #[cfg(feature = "encoding")] + ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => { + return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding); + }, } } } diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs index 3d5e125b..99cc1e65 100644 --- a/html5ever/src/tree_builder/rules.rs +++ b/html5ever/src/tree_builder/rules.rs @@ -10,21 +10,24 @@ // The tree builder rules, as a single, enormous nested match expression. use crate::interface::Quirks; -use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData}; +use crate::tokenizer::states::{Rawtext, Rcdata}; use crate::tokenizer::TagKind::{EndTag, StartTag}; use crate::tree_builder::tag_sets::*; use crate::tree_builder::types::*; -use crate::tree_builder::{ - create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder, - TreeSink, -}; -use crate::QualName; -use markup5ever::{expanded_name, local_name, ns}; +use crate::tree_builder::RawKind::ScriptData; +use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink}; + +use markup5ever::interface::create_element; +use markup5ever::interface::NodeOrText::AppendNode; +use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName}; use std::borrow::Cow::Borrowed; use crate::tendril::SliceExt; use match_token::match_token; +#[cfg(feature = "encoding")] +use encoding_rs::Encoding; + fn any_not_whitespace(x: &StrTendril) -> bool { // FIXME: this might be much 
faster as a byte scan x.chars().any(|c| !c.is_ascii_whitespace()) @@ -113,8 +116,21 @@ where => self.step(InsertionMode::InBody, token), - tag @ => { - // FIXME: handle and + tag @ => { + // FIXME: handle + #[cfg(feature = "encoding")] + if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) { + if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) { + self.insert_and_pop_element_for(tag); + return ProcessResult::MaybeChangeEncodingAndStartOver(encoding); + } + } + + self.insert_and_pop_element_for(tag); + ProcessResult::DoneAckSelfClosing + }, + + tag @ => { self.insert_and_pop_element_for(tag); ProcessResult::DoneAckSelfClosing } diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs index 684d5b0b..aeb19a7d 100644 --- a/html5ever/src/tree_builder/types.rs +++ b/html5ever/src/tree_builder/types.rs @@ -70,6 +70,8 @@ pub(crate) enum ProcessResult { Script(Handle), ToPlaintext, ToRawData(RawKind), + #[cfg(feature = "encoding")] + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), } pub(crate) enum FormatEntry { diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml index 4ed47a5e..b2c6c18a 100644 --- a/markup5ever/Cargo.toml +++ b/markup5ever/Cargo.toml @@ -13,7 +13,15 @@ rust-version.workspace = true [lib] path = "lib.rs" +[features] +encoding = ["dep:encoding_rs"] + [dependencies] web_atoms = { version = "0.1", path = "../web_atoms" } tendril = "0.4" -log = "0.4" \ No newline at end of file +log = "0.4" +encoding_rs = { version = "0.8", optional = true } + +[build-dependencies] +string_cache_codegen = "0.5.4" +phf_codegen = "0.11" diff --git a/markup5ever/encoding.rs b/markup5ever/encoding.rs new file mode 100644 index 00000000..e8ad8d1b --- /dev/null +++ b/markup5ever/encoding.rs @@ -0,0 +1,133 @@ +// Copyright 2014-2025 The html5ever Project Developers. See the +// COPYRIGHT file at the top-level directory of this distribution. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED}; +use tendril::{fmt::Bytes, Tendril}; + +use crate::buffer_queue::BufferQueue; + +/// +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Confidence { + Tentative, + Certain, + Irrelevant, +} + +pub struct Decoder { + inner: encoding_rs::Decoder, + confidence: Confidence, +} + +impl Decoder { + pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self { + Self { + inner: encoding.new_decoder(), + confidence, + } + } + + pub fn confidence(&self) -> Confidence { + self.confidence + } + + /// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current encoding + /// should be changed to `encoding` + pub fn change_the_encoding_to( + &mut self, + mut new_encoding: &'static Encoding, + ) -> Option<&'static Encoding> { + let current_encoding = self.inner.encoding(); + // Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE, + // then set the confidence to certain and return. The new encoding is ignored; if it was anything + // but the same encoding, then it would be clearly incorrect. + if current_encoding == UTF_16BE || current_encoding == UTF_16LE { + self.confidence = Confidence::Certain; + return None; + } + + // Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8. + if new_encoding == UTF_16BE || new_encoding == UTF_16LE { + new_encoding = UTF_8; + } + + // Step 3. If the new encoding is x-user-defined, then change it to windows-1252. + if new_encoding == X_USER_DEFINED { + new_encoding = WINDOWS_1252; + } + + // Step 4. 
If the new encoding is identical or equivalent to the encoding that is already being used to interpret + // the input stream, then set the confidence to certain and return. This happens when the encoding information found + // in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass + // through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section + // failed to find the right encoding. + if current_encoding == new_encoding { + self.confidence = Confidence::Certain; + return None; + } + + // Step 5. If all the bytes up to the last byte converted by the current decoder have the same + // Unicode interpretations in both the current encoding and the new encoding, and if the user agent + // supports changing the converter on the fly, then the user agent may change to the new converter + // for the encoding on the fly. Set the document's character encoding and the encoding used to convert + // the input stream to the new encoding, set the confidence to certain, and return. + // NOTE: We don't support changing the converter on the fly + + // Step 6. Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and + // other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just + // set the encoding to the new encoding and the confidence to certain. Whenever possible, this should + // be done without actually contacting the network layer (the bytes should be re-parsed from memory), + // even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting + // the network layer would involve repeating a request that uses a method other than `GET`, then instead + // set the confidence to certain and ignore the new encoding. The resource will be misinterpreted. + // User agents may notify the user of the situation, to aid in application development. 
+ Some(new_encoding) + } + + /// Decode the given chunk with the current encoding. The result will be pushed to the end + /// of the input stream. + pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) { + let mut remaining = chunk; + loop { + let mut out: Tendril = Tendril::new(); + let max_len = self + .inner + .max_utf8_buffer_length_without_replacement(remaining.len()) + .unwrap_or(8192) + .min(8192); + + // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize + // part of the buffer. We are only going to access the initialized segment. + unsafe { + out.push_uninitialized(max_len as u32); + } + + let (result, bytes_read, bytes_written) = self + .inner + .decode_to_utf8_without_replacement(&remaining, &mut out, last); + + if bytes_written > 0 { + let bytes_chunk = out.subtendril(0, bytes_written as u32); + + // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8 + let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() }; + output.push_back(utf8_chunk); + } + + if matches!(result, DecoderResult::Malformed(_, _)) { + output.push_back("\u{FFFD}".into()); + } + + remaining = &remaining[bytes_read..]; + if remaining.is_empty() { + return; + } + } + } +} diff --git a/markup5ever/input_stream.rs b/markup5ever/input_stream.rs new file mode 100644 index 00000000..97f0dfe4 --- /dev/null +++ b/markup5ever/input_stream.rs @@ -0,0 +1,167 @@ +use std::cell::RefCell; + +use encoding_rs::Encoding; +use tendril::StrTendril; + +use crate::buffer_queue::BufferQueue; +use crate::encoding::{Confidence, Decoder}; + +/// +pub struct InputStream { + input: BufferQueue, + decoder: RefCell, +} + +impl InputStream { + fn new(encoding: &'static Encoding) -> Self { + Self { + input: Default::default(), + decoder: RefCell::new(Decoder::new(encoding, Confidence::Tentative)), + } + } + + pub fn append(&self, data: StrTendril) { + self.input.push_back(data); + } + + pub fn append_bytes(&self, 
data: &[u8]) { + self.decoder.borrow_mut().decode(data, false, &self.input); + } + + pub fn code_points(&self) -> &BufferQueue { + &self.input + } + + /// Attempt to switch to another encoding. + /// + /// If the encoding was switched then the new encoding is returned. Note that the new encoding may be + /// different from the one that this function was called with. + pub fn maybe_switch_encoding(&self, encoding: &'static Encoding) -> Option<&'static Encoding> { + if self.decoder.borrow().confidence() == Confidence::Tentative { + if let Some(new_encoding) = self.decoder.borrow_mut().change_the_encoding_to(encoding) { + return Some(new_encoding); + } + } + None + } + + /// Move any input that is left in the decoding stage to the end of the input stream + pub fn finish_decoding_input(&self) { + self.decoder.borrow_mut().decode(&[], true, &self.input); + } + + /// Remove all input from the stream + pub fn clear(&self) { + self.input.clear(); + } +} + +pub struct DecodingParser { + /// Data received from `document.write` + script_input: BufferQueue, + input_stream: InputStream, + input_sink: Sink, +} + +impl DecodingParser +where + Sink: InputSink, +{ + pub fn new(sink: Sink, document_encoding: &'static Encoding) -> Self { + Self { + script_input: Default::default(), + input_stream: InputStream::new(document_encoding), + input_sink: sink, + } + } + + pub fn sink(&self) -> &Sink { + &self.input_sink + } + + pub fn input_stream(&self) -> &InputStream { + &self.input_stream + } + + /// Return an iterator that can be used to drive the parser + pub fn parse(&self) -> impl Iterator> + '_ { + self.input_sink + .feed(self.input_stream.code_points()) + .filter_map(|sink_result| match sink_result { + InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)), + InputSinkResult::MaybeStartOverWithEncoding(encoding) => self + .input_stream + .maybe_switch_encoding(encoding) + .map(ParserAction::StartOverWithEncoding), + }) + } + + /// Returns an iterator 
that can be used to drive the parser + pub fn document_write<'a>( + &'a self, + input: &'a BufferQueue, + ) -> impl Iterator> + use<'a, Sink> { + debug_assert!( + self.script_input.is_empty(), + "Should not parse input from document.write while the parser is suspended" + ); + + self.input_sink + .feed(&input) + .filter_map(move |sink_result| match sink_result { + InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)), + InputSinkResult::MaybeStartOverWithEncoding(encoding) => self + .input_stream + .maybe_switch_encoding(encoding) + .map(ParserAction::StartOverWithEncoding), + }) + } + + /// End a `document.write` transaction, appending any input that was not yet parsed to the + /// current insertion point, behind any input that was received reentrantly during this transaction. + pub fn push_script_input(&self, input: &BufferQueue) { + while let Some(chunk) = input.pop_front() { + self.script_input.push_back(chunk); + } + } + + /// Notifies the parser that it has been unblocked and parsing can resume + pub fn notify_parser_blocking_script_loaded(&self) { + // Move pending script input to the front of the input stream + self.script_input.swap_with(&self.input_stream.input); + while let Some(chunk) = self.script_input.pop_front() { + self.input_stream.input.push_back(chunk); + } + } +} + +pub enum ParserAction { + HandleScript(Handle), + StartOverWithEncoding(&'static Encoding), +} + +pub enum InputSinkResult { + HandleScript(Handle), + MaybeStartOverWithEncoding(&'static Encoding), +} + +pub trait InputSink { + type Handle; + + fn feed<'a>( + &'a self, + input: &'a BufferQueue, + ) -> impl Iterator> + 'a; +} + +impl ParserAction { + pub fn map_script(self, f: F) -> ParserAction + where + F: FnOnce(T) -> U, + { + match self { + Self::HandleScript(script) => ParserAction::HandleScript(f(script)), + Self::StartOverWithEncoding(encoding) => ParserAction::StartOverWithEncoding(encoding), + } + } +} diff --git a/markup5ever/interface/mod.rs 
b/markup5ever/interface/mod.rs index dbcb0663..c93114d2 100644 --- a/markup5ever/interface/mod.rs +++ b/markup5ever/interface/mod.rs @@ -13,6 +13,8 @@ use std::fmt; use tendril::StrTendril; use web_atoms::{LocalName, Namespace, Prefix}; +use crate::InputSinkResult; + pub use self::tree_builder::{create_element, AppendNode, AppendText, ElementFlags, NodeOrText}; pub use self::tree_builder::{ElemName, Tracer, TreeSink}; pub use self::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode}; @@ -65,6 +67,19 @@ impl fmt::Debug for ExpandedName<'_> { pub enum TokenizerResult { Done, Script(Handle), + MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding), +} + +impl From> for Option> { + fn from(value: TokenizerResult) -> Self { + match value { + TokenizerResult::Script(handle) => Some(InputSinkResult::HandleScript(handle)), + TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) => { + Some(InputSinkResult::MaybeStartOverWithEncoding(encoding)) + }, + TokenizerResult::Done => None, + } + } } /// Helper to quickly create an expanded name. diff --git a/markup5ever/lib.rs b/markup5ever/lib.rs index 24aaf411..7b7a2c85 100644 --- a/markup5ever/lib.rs +++ b/markup5ever/lib.rs @@ -57,3 +57,10 @@ mod util { pub use interface::{Attribute, ExpandedName, QualName, TokenizerResult}; pub use util::smallcharset::SmallCharSet; pub use util::*; + +#[cfg(feature = "encoding")] +pub mod encoding; + +mod input_stream; + +pub use input_stream::{DecodingParser, InputSink, InputSinkResult, InputStream, ParserAction}; diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs index 95a571e2..d3e7a713 100644 --- a/markup5ever/util/buffer_queue.rs +++ b/markup5ever/util/buffer_queue.rs @@ -18,9 +18,12 @@ //! //! 
[`BufferQueue`]: struct.BufferQueue.html -use std::{cell::RefCell, collections::VecDeque, mem}; +use std::{cell::RefCell, collections::VecDeque, fmt, mem}; -use tendril::StrTendril; +use tendril::{ + fmt::{Bytes, SliceFormat, UTF8}, + Atomicity, NonAtomic, StrTendril, Tendril, +}; pub use self::SetResult::{FromSet, NotFromSet}; use crate::util::smallcharset::SmallCharSet; @@ -38,18 +41,30 @@ pub enum SetResult { NotFromSet(StrTendril), } -/// A queue of owned string buffers, which supports incrementally consuming characters. +/// A queue of tendrils, which supports incrementally consuming characters. /// /// Internally it uses [`VecDeque`] and has the same complexity properties. /// /// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html #[derive(Debug)] -pub struct BufferQueue { +pub struct BufferQueue +where + F: SliceFormat + Default, + ::Slice: fmt::Debug, + A: Atomicity, +{ /// Buffers to process. - buffers: RefCell>, + buffers: RefCell>>, } -impl Default for BufferQueue { +pub type ByteBufferQueue = BufferQueue; + +impl Default for BufferQueue +where + F: SliceFormat + Default, + ::Slice: fmt::Debug, + A: Atomicity, +{ /// Create an empty BufferQueue. #[inline] fn default() -> Self { @@ -59,7 +74,20 @@ impl Default for BufferQueue { } } -impl BufferQueue { +impl BufferQueue +where + F: SliceFormat + Default, + ::Slice: fmt::Debug, + A: Atomicity, +{ + /// Swap the contents of the two buffers + pub fn swap(&self, other: &Self) { + mem::swap( + &mut self.buffers.borrow_mut(), + &mut other.buffers.borrow_mut(), + ); + } + /// Returns whether the queue is empty. #[inline] pub fn is_empty(&self) -> bool { @@ -68,14 +96,14 @@ impl BufferQueue { /// Get the buffer at the beginning of the queue. #[inline] - pub fn pop_front(&self) -> Option { + pub fn pop_front(&self) -> Option> { self.buffers.borrow_mut().pop_front() } /// Add a buffer to the beginning of the queue. /// /// If the buffer is empty, it will be skipped. 
- pub fn push_front(&self, buf: StrTendril) { + pub fn push_front(&self, buf: Tendril) { if buf.len32() == 0 { return; } @@ -85,13 +113,27 @@ impl BufferQueue { /// Add a buffer to the end of the queue. /// /// If the buffer is empty, it will be skipped. - pub fn push_back(&self, buf: StrTendril) { + pub fn push_back(&self, buf: Tendril) { if buf.len32() == 0 { return; } self.buffers.borrow_mut().push_back(buf); } + pub fn insert(&self, index: usize, buffer: Tendril) { + if buffer.len32() == 0 { + return; + } + + self.buffers.borrow_mut().insert(index, buffer); + } + + pub fn clear(&self) { + self.buffers.borrow_mut().clear(); + } +} + +impl BufferQueue { /// Look at the next available character without removing it, if the queue is not empty. pub fn peek(&self) -> Option { debug_assert!( @@ -236,11 +278,11 @@ impl BufferQueue { result } - pub fn replace_with(&self, other: BufferQueue) { + pub fn replace_with(&self, other: Self) { let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take()); } - pub fn swap_with(&self, other: &BufferQueue) { + pub fn swap_with(&self, other: &Self) { mem::swap( &mut *self.buffers.borrow_mut(), &mut *other.buffers.borrow_mut(), @@ -248,6 +290,20 @@ impl BufferQueue { } } +impl IntoIterator for BufferQueue +where + F: SliceFormat + Default, + ::Slice: fmt::Debug, + A: Atomicity, +{ + type Item = Tendril; + type IntoIter = > as IntoIterator>::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.buffers.into_inner().into_iter() + } +} + #[cfg(test)] #[allow(non_snake_case)] mod test { diff --git a/rcdom/tests/html-serializer.rs b/rcdom/tests/html-serializer.rs index 720fc6f1..952826a0 100644 --- a/rcdom/tests/html-serializer.rs +++ b/rcdom/tests/html-serializer.rs @@ -68,7 +68,7 @@ impl Serialize for Tokens { fn tokenize_and_serialize(input: StrTendril) -> StrTendril { let input = { - let q = ::html5ever::tokenizer::BufferQueue::default(); + let q = markup5ever::buffer_queue::BufferQueue::default(); 
q.push_front(input); q }; diff --git a/rcdom/tests/html-tokenizer.rs b/rcdom/tests/html-tokenizer.rs index 2102c98a..cbf56df6 100644 --- a/rcdom/tests/html-tokenizer.rs +++ b/rcdom/tests/html-tokenizer.rs @@ -14,12 +14,12 @@ use html5ever::tendril::*; use html5ever::tokenizer::states::{ CdataSection, Data, Plaintext, RawData, Rawtext, Rcdata, ScriptData, }; -use html5ever::tokenizer::BufferQueue; use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError}; use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token}; use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag}; use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts}; -use html5ever::{ns, Attribute, LocalName, QualName}; +use html5ever::{namespace_url, ns, Attribute, LocalName, QualName}; +use markup5ever::buffer_queue::BufferQueue; use serde_json::{Map, Value}; use std::cell::RefCell; use std::ffi::OsStr; diff --git a/xml5ever/src/tokenizer/mod.rs b/xml5ever/src/tokenizer/mod.rs index 4f7d1a48..3fb4eed0 100644 --- a/xml5ever/src/tokenizer/mod.rs +++ b/xml5ever/src/tokenizer/mod.rs @@ -23,13 +23,17 @@ use crate::tendril::StrTendril; use crate::{buffer_queue, Attribute, QualName, SmallCharSet}; use log::debug; use mac::{format_if, unwrap_or_return}; -use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult}; +use markup5ever::{ + buffer_queue::BufferQueue, local_name, namespace_prefix, namespace_url, ns, small_char_set, + InputSink, InputSinkResult, TokenizerResult, +}; use std::borrow::Cow::{self, Borrowed}; use std::cell::{Cell, RefCell, RefMut}; use std::collections::BTreeMap; +use std::iter; use std::mem::replace; -use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult}; +use self::buffer_queue::{FromSet, NotFromSet, SetResult}; use self::char_ref::{CharRef, CharRefTokenizer}; use self::qname::QualNameTokenizer; use self::states::XmlState; @@ -1297,6 +1301,20 @@ impl XmlTokenizer { } } +impl 
InputSink for XmlTokenizer +where + Sink: TokenSink, +{ + type Handle = Sink::Handle; + + fn feed<'a>( + &'a self, + input: &'a BufferQueue, + ) -> impl Iterator> + 'a { + iter::from_fn(|| self.feed(input).into()) + } +} + #[cfg(test)] mod test {