diff --git a/html5ever/Cargo.toml b/html5ever/Cargo.toml
index f6f3576a..b0a3e711 100644
--- a/html5ever/Cargo.toml
+++ b/html5ever/Cargo.toml
@@ -13,13 +13,16 @@ readme = "../README.md"
rust-version.workspace = true
[features]
+default = ["encoding"]
trace_tokenizer = []
+encoding = ["dep:encoding_rs", "markup5ever/encoding"]
[dependencies]
log = "0.4"
mac = "0.1"
markup5ever = { version = "0.16", path = "../markup5ever" }
match_token = { workspace = true }
+encoding_rs = { version = "0.8", optional = true }
[dev-dependencies]
criterion = "0.5"
diff --git a/html5ever/examples/noop-tokenize.rs b/html5ever/examples/noop-tokenize.rs
index a95404df..773b30e2 100644
--- a/html5ever/examples/noop-tokenize.rs
+++ b/html5ever/examples/noop-tokenize.rs
@@ -15,7 +15,8 @@ use std::cell::RefCell;
use std::io;
use html5ever::tendril::*;
-use html5ever::tokenizer::{BufferQueue, Token, TokenSink, TokenSinkResult, Tokenizer};
+use html5ever::tokenizer::{Token, TokenSink, TokenSinkResult, Tokenizer};
+use markup5ever::buffer_queue::BufferQueue;
/// In our case, our sink only contains a tokens vector
struct Sink(RefCell>);
diff --git a/html5ever/examples/tokenize.rs b/html5ever/examples/tokenize.rs
index ba984d8f..f1368604 100644
--- a/html5ever/examples/tokenize.rs
+++ b/html5ever/examples/tokenize.rs
@@ -13,11 +13,11 @@ use std::cell::Cell;
use std::io;
use html5ever::tendril::*;
-use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EndTag, NullCharacterToken, StartTag, TagToken};
use html5ever::tokenizer::{
ParseError, Token, TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts,
};
+use markup5ever::buffer_queue::BufferQueue;
#[derive(Clone)]
struct TokenPrinter {
diff --git a/html5ever/src/tokenizer/char_ref/mod.rs b/html5ever/src/tokenizer/char_ref/mod.rs
index c97780cc..e15bcd2d 100644
--- a/html5ever/src/tokenizer/char_ref/mod.rs
+++ b/html5ever/src/tokenizer/char_ref/mod.rs
@@ -8,12 +8,12 @@
// except according to those terms.
use super::{TokenSink, Tokenizer};
-use crate::buffer_queue::BufferQueue;
use crate::data;
use crate::tendril::StrTendril;
use log::debug;
use mac::format_if;
+use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;
diff --git a/html5ever/src/tokenizer/interface.rs b/html5ever/src/tokenizer/interface.rs
index edc6afb9..91b33634 100644
--- a/html5ever/src/tokenizer/interface.rs
+++ b/html5ever/src/tokenizer/interface.rs
@@ -77,6 +77,8 @@ pub enum TokenSinkResult {
Script(Handle),
Plaintext,
RawData(states::RawKind),
+ #[cfg(feature = "encoding")]
+ MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}
/// Types which can receive tokens from the tokenizer.
diff --git a/html5ever/src/tokenizer/mod.rs b/html5ever/src/tokenizer/mod.rs
index 80b0f6d1..09bc50fd 100644
--- a/html5ever/src/tokenizer/mod.rs
+++ b/html5ever/src/tokenizer/mod.rs
@@ -22,16 +22,18 @@ use self::states::{Rawtext, Rcdata, ScriptData, ScriptDataEscaped};
use self::char_ref::{CharRef, CharRefTokenizer};
use crate::util::str::lower_ascii_letter;
-
use log::{debug, trace};
use mac::format_if;
-use markup5ever::{ns, small_char_set, TokenizerResult};
+use markup5ever::{
+ buffer_queue::BufferQueue, namespace_url, ns, small_char_set, InputSink, InputSinkResult,
+ TokenizerResult,
+};
use std::borrow::Cow::{self, Borrowed};
use std::cell::{Cell, RefCell, RefMut};
use std::collections::BTreeMap;
-use std::mem;
+use std::{iter, mem};
-pub use crate::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
+pub use crate::buffer_queue::{FromSet, NotFromSet, SetResult};
use crate::tendril::StrTendril;
use crate::{Attribute, LocalName, QualName, SmallCharSet};
@@ -43,6 +45,8 @@ pub enum ProcessResult {
Continue,
Suspend,
Script(Handle),
+ #[cfg(feature = "encoding")]
+ MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}
fn option_push(opt_str: &mut Option, c: char) {
@@ -357,6 +361,10 @@ impl Tokenizer {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
+ #[cfg(feature = "encoding")]
+ ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
+ return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
+ },
}
}
} else {
@@ -365,6 +373,10 @@ impl Tokenizer {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(node) => return TokenizerResult::Script(node),
+ #[cfg(feature = "encoding")]
+ ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
+ return TokenizerResult::MaybeChangeEncodingAndStartOver(encoding)
+ },
}
}
}
@@ -456,6 +468,10 @@ impl Tokenizer {
self.state.set(states::RawData(kind));
ProcessResult::Continue
},
+ #[cfg(feature = "encoding")]
+ TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding) => {
+ ProcessResult::MaybeChangeEncodingAndStartOver(encoding)
+ },
}
}
@@ -1680,6 +1696,8 @@ impl Tokenizer {
ProcessResult::Continue => (),
ProcessResult::Suspend => break,
ProcessResult::Script(_) => unreachable!(),
+ #[cfg(feature = "encoding")]
+ ProcessResult::MaybeChangeEncodingAndStartOver(_) => unreachable!(),
}
}
@@ -1841,13 +1859,27 @@ impl Tokenizer {
}
}
+impl InputSink for Tokenizer
+where
+ Sink: TokenSink,
+{
+ type Handle = Sink::Handle;
+
+ fn feed<'a>(
+ &'a self,
+ input: &'a BufferQueue,
+ ) -> impl Iterator- > + 'a {
+ iter::from_fn(|| self.feed(input).into())
+ }
+}
+
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
use super::option_push; // private items
- use crate::tendril::{SliceExt, StrTendril};
-
use super::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
+ use crate::tendril::{SliceExt, StrTendril};
+ use crate::LocalName;
use super::interface::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use super::interface::{EndTag, StartTag, Tag, TagKind};
@@ -1856,8 +1888,6 @@ mod test {
use markup5ever::buffer_queue::BufferQueue;
use std::cell::RefCell;
- use crate::LocalName;
-
// LinesMatch implements the TokenSink trait. It is used for testing to see
// if current_line is being updated when process_token is called. The lines
// vector is a collection of the line numbers that each token is on.
diff --git a/html5ever/src/tree_builder/mod.rs b/html5ever/src/tree_builder/mod.rs
index 52a265a4..3180f854 100644
--- a/html5ever/src/tree_builder/mod.rs
+++ b/html5ever/src/tree_builder/mod.rs
@@ -396,6 +396,10 @@ where
assert!(more_tokens.is_empty());
return tokenizer::TokenSinkResult::RawData(k);
},
+ #[cfg(feature = "encoding")]
+ ProcessResult::MaybeChangeEncodingAndStartOver(encoding) => {
+ return tokenizer::TokenSinkResult::MaybeChangeEncodingAndStartOver(encoding);
+ },
}
}
}
diff --git a/html5ever/src/tree_builder/rules.rs b/html5ever/src/tree_builder/rules.rs
index 3d5e125b..99cc1e65 100644
--- a/html5ever/src/tree_builder/rules.rs
+++ b/html5ever/src/tree_builder/rules.rs
@@ -10,21 +10,24 @@
// The tree builder rules, as a single, enormous nested match expression.
use crate::interface::Quirks;
-use crate::tokenizer::states::{Rawtext, Rcdata, ScriptData};
+use crate::tokenizer::states::{Rawtext, Rcdata};
use crate::tokenizer::TagKind::{EndTag, StartTag};
use crate::tree_builder::tag_sets::*;
use crate::tree_builder::types::*;
-use crate::tree_builder::{
- create_element, html_elem, ElemName, NodeOrText::AppendNode, StrTendril, Tag, TreeBuilder,
- TreeSink,
-};
-use crate::QualName;
-use markup5ever::{expanded_name, local_name, ns};
+use crate::tree_builder::RawKind::ScriptData;
+use crate::tree_builder::{html_elem, ElemName, StrTendril, Tag, TreeBuilder, TreeSink};
+
+use markup5ever::interface::create_element;
+use markup5ever::interface::NodeOrText::AppendNode;
+use markup5ever::{expanded_name, local_name, namespace_url, ns, QualName};
use std::borrow::Cow::Borrowed;
use crate::tendril::SliceExt;
use match_token::match_token;
+#[cfg(feature = "encoding")]
+use encoding_rs::Encoding;
+
fn any_not_whitespace(x: &StrTendril) -> bool {
// FIXME: this might be much faster as a byte scan
x.chars().any(|c| !c.is_ascii_whitespace())
@@ -113,8 +116,21 @@ where
=> self.step(InsertionMode::InBody, token),
- tag @ => {
- // FIXME: handle and
+ tag @ => {
+ // FIXME: handle
+ #[cfg(feature = "encoding")]
+ if let Some(charset) = tag.attrs.iter().find(|a| a.name == QualName::new(None, ns!(html), local_name!("charset"))) {
+ if let Some(encoding) = Encoding::for_label(charset.value.as_bytes()) {
+ self.insert_and_pop_element_for(tag);
+ return ProcessResult::MaybeChangeEncodingAndStartOver(encoding);
+ }
+ }
+
+ self.insert_and_pop_element_for(tag);
+ ProcessResult::DoneAckSelfClosing
+ },
+
+ tag @ => {
self.insert_and_pop_element_for(tag);
ProcessResult::DoneAckSelfClosing
}
diff --git a/html5ever/src/tree_builder/types.rs b/html5ever/src/tree_builder/types.rs
index 684d5b0b..aeb19a7d 100644
--- a/html5ever/src/tree_builder/types.rs
+++ b/html5ever/src/tree_builder/types.rs
@@ -70,6 +70,8 @@ pub(crate) enum ProcessResult {
Script(Handle),
ToPlaintext,
ToRawData(RawKind),
+ #[cfg(feature = "encoding")]
+ MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
}
pub(crate) enum FormatEntry {
diff --git a/markup5ever/Cargo.toml b/markup5ever/Cargo.toml
index 4ed47a5e..b2c6c18a 100644
--- a/markup5ever/Cargo.toml
+++ b/markup5ever/Cargo.toml
@@ -13,7 +13,15 @@ rust-version.workspace = true
[lib]
path = "lib.rs"
+[features]
+encoding = ["dep:encoding_rs"]
+
[dependencies]
web_atoms = { version = "0.1", path = "../web_atoms" }
tendril = "0.4"
-log = "0.4"
\ No newline at end of file
+log = "0.4"
+encoding_rs = { version = "0.8", optional = true }
+
+[build-dependencies]
+string_cache_codegen = "0.5.4"
+phf_codegen = "0.11"
diff --git a/markup5ever/encoding.rs b/markup5ever/encoding.rs
new file mode 100644
index 00000000..e8ad8d1b
--- /dev/null
+++ b/markup5ever/encoding.rs
@@ -0,0 +1,133 @@
+// Copyright 2014-2025 The html5ever Project Developers. See the
+// COPYRIGHT file at the top-level directory of this distribution.
+//
+// Licensed under the Apache License, Version 2.0 or the MIT license
+// , at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+use encoding_rs::{DecoderResult, Encoding, UTF_16BE, UTF_16LE, UTF_8, WINDOWS_1252, X_USER_DEFINED};
+use tendril::{fmt::Bytes, Tendril};
+
+use crate::buffer_queue::BufferQueue;
+
+/// How confident the parser is that the current character encoding is correct.
+///
+/// See <https://html.spec.whatwg.org/multipage/parsing.html#concept-encoding-confidence>
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum Confidence {
+    /// The encoding was guessed (e.g. by sniffing) and may still be changed.
+    Tentative,
+    /// The encoding is known to be correct and will not be changed.
+    Certain,
+    /// The encoding is not relevant to this input (e.g. it is already decoded).
+    Irrelevant,
+}
+
+/// Decodes a byte stream into UTF-8 while tracking how confident the parser
+/// is that the chosen encoding is the document's real encoding.
+pub struct Decoder {
+    /// The underlying encoding_rs decoder performing the byte-to-UTF-8 conversion.
+    inner: encoding_rs::Decoder,
+    /// Confidence in `inner`'s encoding; only a `Tentative` encoding may be replaced.
+    confidence: Confidence,
+}
+
+impl Decoder {
+    /// Create a decoder for `encoding` with the given initial confidence.
+    pub fn new(encoding: &'static Encoding, confidence: Confidence) -> Self {
+        Self {
+            inner: encoding.new_decoder(),
+            confidence,
+        }
+    }
+
+    /// The parser's current confidence in the active encoding.
+    pub fn confidence(&self) -> Confidence {
+        self.confidence
+    }
+
+    /// Implements the spec's "changing the encoding while parsing" steps.
+    ///
+    /// Returns `None` if the encoding should not be changed and `Some(encoding)` if the current encoding
+    /// should be changed to `encoding`
+    pub fn change_the_encoding_to(
+        &mut self,
+        mut new_encoding: &'static Encoding,
+    ) -> Option<&'static Encoding> {
+        let current_encoding = self.inner.encoding();
+        // Step 1. If the encoding that is already being used to interpret the input stream is UTF-16BE/LE,
+        // then set the confidence to certain and return. The new encoding is ignored; if it was anything
+        // but the same encoding, then it would be clearly incorrect.
+        // FIX: the second comparison previously tested UTF_16BE twice, so UTF-16LE
+        // streams were never locked here as the spec requires.
+        if current_encoding == UTF_16BE || current_encoding == UTF_16LE {
+            self.confidence = Confidence::Certain;
+            return None;
+        }
+
+        // Step 2. If the new encoding is UTF-16BE/LE, then change it to UTF-8.
+        // FIX: likewise, this previously tested UTF_16BE twice and let a declared
+        // UTF-16LE encoding through instead of remapping it to UTF-8.
+        if new_encoding == UTF_16BE || new_encoding == UTF_16LE {
+            new_encoding = UTF_8;
+        }
+
+        // Step 3. If the new encoding is x-user-defined, then change it to windows-1252.
+        if new_encoding == X_USER_DEFINED {
+            new_encoding = WINDOWS_1252;
+        }
+
+        // Step 4. If the new encoding is identical or equivalent to the encoding that is already being used to interpret
+        // the input stream, then set the confidence to certain and return. This happens when the encoding information found
+        // in the file matches what the encoding sniffing algorithm determined to be the encoding, and in the second pass
+        // through the parser if the first pass found that the encoding sniffing algorithm described in the earlier section
+        // failed to find the right encoding.
+        if current_encoding == new_encoding {
+            self.confidence = Confidence::Certain;
+            return None;
+        }
+
+        // Step 5. If all the bytes up to the last byte converted by the current decoder have the same
+        // Unicode interpretations in both the current encoding and the new encoding, and if the user agent
+        // supports changing the converter on the fly, then the user agent may change to the new converter
+        // for the encoding on the fly. Set the document's character encoding and the encoding used to convert
+        // the input stream to the new encoding, set the confidence to certain, and return.
+        // NOTE: We don't support changing the converter on the fly
+
+        // Step 6. Otherwise, restart the navigate algorithm, with historyHandling set to "replace" and
+        // other inputs kept the same, but this time skip the encoding sniffing algorithm and instead just
+        // set the encoding to the new encoding and the confidence to certain. Whenever possible, this should
+        // be done without actually contacting the network layer (the bytes should be re-parsed from memory),
+        // even if, e.g., the document is marked as not being cacheable. If this is not possible and contacting
+        // the network layer would involve repeating a request that uses a method other than `GET`, then instead
+        // set the confidence to certain and ignore the new encoding. The resource will be misinterpreted.
+        // User agents may notify the user of the situation, to aid in application development.
+        Some(new_encoding)
+    }
+
+    /// Decode the given chunk with the current encoding. The result will be pushed to the end
+    /// of the input stream.
+    pub fn decode(&mut self, chunk: &[u8], last: bool, output: &BufferQueue) {
+        let mut remaining = chunk;
+        loop {
+            let mut out: Tendril = Tendril::new();
+            // Cap each output buffer at 8 KiB so a huge chunk does not force one
+            // giant allocation; the loop drains `remaining` across iterations.
+            let max_len = self
+                .inner
+                .max_utf8_buffer_length_without_replacement(remaining.len())
+                .unwrap_or(8192)
+                .min(8192);
+
+            // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement is going to initialize
+            // part of the buffer. We are only going to access the initialized segment.
+            unsafe {
+                out.push_uninitialized(max_len as u32);
+            }
+
+            let (result, bytes_read, bytes_written) = self
+                .inner
+                .decode_to_utf8_without_replacement(remaining, &mut out, last);
+
+            if bytes_written > 0 {
+                let bytes_chunk = out.subtendril(0, bytes_written as u32);
+
+                // SAFETY: encoding_rs::Decoder::decode_to_utf8_without_replacement writes valid utf8
+                let utf8_chunk = unsafe { bytes_chunk.reinterpret_without_validating() };
+                output.push_back(utf8_chunk);
+            }
+
+            // Emit U+FFFD for each malformed sequence instead of dropping it.
+            if matches!(result, DecoderResult::Malformed(_, _)) {
+                output.push_back("\u{FFFD}".into());
+            }
+
+            remaining = &remaining[bytes_read..];
+            if remaining.is_empty() {
+                return;
+            }
+        }
+    }
+}
diff --git a/markup5ever/input_stream.rs b/markup5ever/input_stream.rs
new file mode 100644
index 00000000..97f0dfe4
--- /dev/null
+++ b/markup5ever/input_stream.rs
@@ -0,0 +1,167 @@
+use std::cell::RefCell;
+
+use encoding_rs::Encoding;
+use tendril::StrTendril;
+
+use crate::buffer_queue::BufferQueue;
+use crate::encoding::{Confidence, Decoder};
+
+/// The parser's input byte stream: bytes go in, decoded code points come out.
+///
+/// See <https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream>
+pub struct InputStream {
+    /// Decoded code points waiting to be consumed by the sink (e.g. a tokenizer).
+    input: BufferQueue,
+    /// Decodes incoming bytes; `RefCell` allows mutation behind `&self`.
+    decoder: RefCell,
+}
+
+impl InputStream {
+    /// Create a stream whose bytes are decoded with `encoding`; the choice is
+    /// initially tentative and may be revised by `maybe_switch_encoding`.
+    fn new(encoding: &'static Encoding) -> Self {
+        Self {
+            input: Default::default(),
+            decoder: RefCell::new(Decoder::new(encoding, Confidence::Tentative)),
+        }
+    }
+
+    /// Append data that is already decoded to the end of the stream.
+    pub fn append(&self, data: StrTendril) {
+        self.input.push_back(data);
+    }
+
+    /// Decode `data` with the current decoder and append the result to the stream.
+    pub fn append_bytes(&self, data: &[u8]) {
+        self.decoder.borrow_mut().decode(data, false, &self.input);
+    }
+
+    /// The decoded code points that have not yet been consumed.
+    pub fn code_points(&self) -> &BufferQueue {
+        &self.input
+    }
+
+    /// Attempt to switch to another encoding.
+    ///
+    /// If the encoding was switched then the new encoding is returned. Note that the new encoding may be
+    /// different from the one that this function was called with.
+    pub fn maybe_switch_encoding(&self, encoding: &'static Encoding) -> Option<&'static Encoding> {
+        // Only a tentatively-chosen encoding may be replaced; once certain, it is fixed.
+        if self.decoder.borrow().confidence() == Confidence::Tentative {
+            if let Some(new_encoding) = self.decoder.borrow_mut().change_the_encoding_to(encoding) {
+                return Some(new_encoding);
+            }
+        }
+        None
+    }
+
+    /// Move any input that is left in the decoding stage to the end of the input stream
+    pub fn finish_decoding_input(&self) {
+        self.decoder.borrow_mut().decode(&[], true, &self.input);
+    }
+
+    /// Remove all input from the stream
+    pub fn clear(&self) {
+        self.input.clear();
+    }
+}
+
+/// Drives an [`InputSink`] (e.g. a tokenizer) from a decoding [`InputStream`].
+pub struct DecodingParser {
+    /// Data received from `document.write`
+    script_input: BufferQueue,
+    /// Decodes and buffers the bytes that feed the sink.
+    input_stream: InputStream,
+    /// Consumer of the decoded code points (typically a tokenizer).
+    input_sink: Sink,
+}
+
+impl DecodingParser
+where
+    Sink: InputSink,
+{
+    /// Create a parser that decodes bytes with `document_encoding` and feeds
+    /// the resulting code points to `sink`.
+    pub fn new(sink: Sink, document_encoding: &'static Encoding) -> Self {
+        Self {
+            script_input: Default::default(),
+            input_stream: InputStream::new(document_encoding),
+            input_sink: sink,
+        }
+    }
+
+    /// The sink that consumes the decoded code points.
+    pub fn sink(&self) -> &Sink {
+        &self.input_sink
+    }
+
+    /// The input stream that decodes and buffers incoming bytes.
+    pub fn input_stream(&self) -> &InputStream {
+        &self.input_stream
+    }
+
+    /// Return an iterator that can be used to drive the parser
+    pub fn parse(&self) -> impl Iterator
- > + '_ {
+        self.input_sink
+            .feed(self.input_stream.code_points())
+            .filter_map(|sink_result| match sink_result {
+                InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)),
+                // An encoding switch is only surfaced if the stream actually accepts it.
+                InputSinkResult::MaybeStartOverWithEncoding(encoding) => self
+                    .input_stream
+                    .maybe_switch_encoding(encoding)
+                    .map(ParserAction::StartOverWithEncoding),
+            })
+    }
+
+    /// Returns an iterator that can be used to drive the parser
+    pub fn document_write<'a>(
+        &'a self,
+        input: &'a BufferQueue,
+    ) -> impl Iterator
- > + use<'a, Sink> {
+        debug_assert!(
+            self.script_input.is_empty(),
+            "Should not parse input from document.write while the parser is suspended"
+        );
+
+        // FIX: `input` is already a `&BufferQueue`; `feed(&input)` passed a `&&BufferQueue`
+        // that only compiled via deref coercion (clippy: needless_borrow).
+        self.input_sink
+            .feed(input)
+            .filter_map(move |sink_result| match sink_result {
+                InputSinkResult::HandleScript(script) => Some(ParserAction::HandleScript(script)),
+                InputSinkResult::MaybeStartOverWithEncoding(encoding) => self
+                    .input_stream
+                    .maybe_switch_encoding(encoding)
+                    .map(ParserAction::StartOverWithEncoding),
+            })
+    }
+
+    /// End a `document.write` transaction, appending any input that was not yet parsed to the
+    /// current insertion point, behind any input that was received reentrantly during this transaction.
+    pub fn push_script_input(&self, input: &BufferQueue) {
+        while let Some(chunk) = input.pop_front() {
+            self.script_input.push_back(chunk);
+        }
+    }
+
+    /// Notifies the parser that it has been unblocked and parsing can resume
+    pub fn notify_parser_blocking_script_loaded(&self) {
+        // Move pending script input to the front of the input stream
+        self.script_input.swap_with(&self.input_stream.input);
+        while let Some(chunk) = self.script_input.pop_front() {
+            self.input_stream.input.push_back(chunk);
+        }
+    }
+}
+
+/// An action the embedder must take before parsing can continue.
+pub enum ParserAction {
+    /// A script must be executed before parsing resumes.
+    HandleScript(Handle),
+    /// Parsing must restart from the beginning with the given encoding.
+    StartOverWithEncoding(&'static Encoding),
+}
+
+/// A result produced by an [`InputSink`] while consuming input.
+pub enum InputSinkResult {
+    /// A script was encountered that must be handled by the embedder.
+    HandleScript(Handle),
+    /// An encoding declaration was found; the stream may need to switch to it.
+    MaybeStartOverWithEncoding(&'static Encoding),
+}
+
+/// A consumer of decoded code points, e.g. an HTML or XML tokenizer.
+pub trait InputSink {
+    type Handle;
+
+    /// Consume code points from `input`, yielding results as they are produced.
+    fn feed<'a>(
+        &'a self,
+        input: &'a BufferQueue,
+    ) -> impl Iterator
- > + 'a;
+}
+
+impl ParserAction {
+    /// Convert the script handle with `f`, leaving `StartOverWithEncoding` untouched.
+    /// Lets embedders adapt actions to their own handle type.
+    pub fn map_script(self, f: F) -> ParserAction
+    where
+        F: FnOnce(T) -> U,
+    {
+        match self {
+            Self::HandleScript(script) => ParserAction::HandleScript(f(script)),
+            Self::StartOverWithEncoding(encoding) => ParserAction::StartOverWithEncoding(encoding),
+        }
+    }
+}
diff --git a/markup5ever/interface/mod.rs b/markup5ever/interface/mod.rs
index dbcb0663..c93114d2 100644
--- a/markup5ever/interface/mod.rs
+++ b/markup5ever/interface/mod.rs
@@ -13,6 +13,8 @@ use std::fmt;
use tendril::StrTendril;
use web_atoms::{LocalName, Namespace, Prefix};
+use crate::InputSinkResult;
+
pub use self::tree_builder::{create_element, AppendNode, AppendText, ElementFlags, NodeOrText};
pub use self::tree_builder::{ElemName, Tracer, TreeSink};
pub use self::tree_builder::{LimitedQuirks, NoQuirks, Quirks, QuirksMode};
@@ -65,6 +67,19 @@ impl fmt::Debug for ExpandedName<'_> {
pub enum TokenizerResult {
Done,
Script(Handle),
+ MaybeChangeEncodingAndStartOver(&'static encoding_rs::Encoding),
+}
+
+impl From> for Option> {
+ fn from(value: TokenizerResult) -> Self {
+ match value {
+ TokenizerResult::Script(handle) => Some(InputSinkResult::HandleScript(handle)),
+ TokenizerResult::MaybeChangeEncodingAndStartOver(encoding) => {
+ Some(InputSinkResult::MaybeStartOverWithEncoding(encoding))
+ },
+ TokenizerResult::Done => None,
+ }
+ }
}
/// Helper to quickly create an expanded name.
diff --git a/markup5ever/lib.rs b/markup5ever/lib.rs
index 24aaf411..7b7a2c85 100644
--- a/markup5ever/lib.rs
+++ b/markup5ever/lib.rs
@@ -57,3 +57,10 @@ mod util {
pub use interface::{Attribute, ExpandedName, QualName, TokenizerResult};
pub use util::smallcharset::SmallCharSet;
pub use util::*;
+
+#[cfg(feature = "encoding")]
+pub mod encoding;
+
+mod input_stream;
+
+pub use input_stream::{DecodingParser, InputSink, InputSinkResult, InputStream, ParserAction};
diff --git a/markup5ever/util/buffer_queue.rs b/markup5ever/util/buffer_queue.rs
index 95a571e2..d3e7a713 100644
--- a/markup5ever/util/buffer_queue.rs
+++ b/markup5ever/util/buffer_queue.rs
@@ -18,9 +18,12 @@
//!
//! [`BufferQueue`]: struct.BufferQueue.html
-use std::{cell::RefCell, collections::VecDeque, mem};
+use std::{cell::RefCell, collections::VecDeque, fmt, mem};
-use tendril::StrTendril;
+use tendril::{
+ fmt::{Bytes, SliceFormat, UTF8},
+ Atomicity, NonAtomic, StrTendril, Tendril,
+};
pub use self::SetResult::{FromSet, NotFromSet};
use crate::util::smallcharset::SmallCharSet;
@@ -38,18 +41,30 @@ pub enum SetResult {
NotFromSet(StrTendril),
}
-/// A queue of owned string buffers, which supports incrementally consuming characters.
+/// A queue of tendrils, which supports incrementally consuming characters.
///
/// Internally it uses [`VecDeque`] and has the same complexity properties.
///
/// [`VecDeque`]: https://doc.rust-lang.org/std/collections/struct.VecDeque.html
#[derive(Debug)]
-pub struct BufferQueue {
+pub struct BufferQueue
+where
+ F: SliceFormat + Default,
+ ::Slice: fmt::Debug,
+ A: Atomicity,
+{
/// Buffers to process.
- buffers: RefCell>,
+ buffers: RefCell>>,
}
-impl Default for BufferQueue {
+pub type ByteBufferQueue = BufferQueue;
+
+impl Default for BufferQueue
+where
+ F: SliceFormat + Default,
+ ::Slice: fmt::Debug,
+ A: Atomicity,
+{
/// Create an empty BufferQueue.
#[inline]
fn default() -> Self {
@@ -59,7 +74,20 @@ impl Default for BufferQueue {
}
}
-impl BufferQueue {
+impl BufferQueue
+where
+ F: SliceFormat + Default,
+ ::Slice: fmt::Debug,
+ A: Atomicity,
+{
+ /// Swap the contents of the two buffers
+ pub fn swap(&self, other: &Self) {
+ mem::swap(
+ &mut self.buffers.borrow_mut(),
+ &mut other.buffers.borrow_mut(),
+ );
+ }
+
/// Returns whether the queue is empty.
#[inline]
pub fn is_empty(&self) -> bool {
@@ -68,14 +96,14 @@ impl BufferQueue {
/// Get the buffer at the beginning of the queue.
#[inline]
- pub fn pop_front(&self) -> Option {
+ pub fn pop_front(&self) -> Option> {
self.buffers.borrow_mut().pop_front()
}
/// Add a buffer to the beginning of the queue.
///
/// If the buffer is empty, it will be skipped.
- pub fn push_front(&self, buf: StrTendril) {
+ pub fn push_front(&self, buf: Tendril) {
if buf.len32() == 0 {
return;
}
@@ -85,13 +113,27 @@ impl BufferQueue {
/// Add a buffer to the end of the queue.
///
/// If the buffer is empty, it will be skipped.
- pub fn push_back(&self, buf: StrTendril) {
+ pub fn push_back(&self, buf: Tendril) {
if buf.len32() == 0 {
return;
}
self.buffers.borrow_mut().push_back(buf);
}
+ pub fn insert(&self, index: usize, buffer: Tendril) {
+ if buffer.len32() == 0 {
+ return;
+ }
+
+ self.buffers.borrow_mut().insert(index, buffer);
+ }
+
+ pub fn clear(&self) {
+ self.buffers.borrow_mut().clear();
+ }
+}
+
+impl BufferQueue {
/// Look at the next available character without removing it, if the queue is not empty.
pub fn peek(&self) -> Option {
debug_assert!(
@@ -236,11 +278,11 @@ impl BufferQueue {
result
}
- pub fn replace_with(&self, other: BufferQueue) {
+ pub fn replace_with(&self, other: Self) {
let _ = mem::replace(&mut *self.buffers.borrow_mut(), other.buffers.take());
}
- pub fn swap_with(&self, other: &BufferQueue) {
+ pub fn swap_with(&self, other: &Self) {
mem::swap(
&mut *self.buffers.borrow_mut(),
&mut *other.buffers.borrow_mut(),
@@ -248,6 +290,20 @@ impl BufferQueue {
}
}
+impl IntoIterator for BufferQueue
+where
+ F: SliceFormat + Default,
+ ::Slice: fmt::Debug,
+ A: Atomicity,
+{
+ type Item = Tendril;
+ type IntoIter = > as IntoIterator>::IntoIter;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.buffers.into_inner().into_iter()
+ }
+}
+
#[cfg(test)]
#[allow(non_snake_case)]
mod test {
diff --git a/rcdom/tests/html-serializer.rs b/rcdom/tests/html-serializer.rs
index 720fc6f1..952826a0 100644
--- a/rcdom/tests/html-serializer.rs
+++ b/rcdom/tests/html-serializer.rs
@@ -68,7 +68,7 @@ impl Serialize for Tokens {
fn tokenize_and_serialize(input: StrTendril) -> StrTendril {
let input = {
- let q = ::html5ever::tokenizer::BufferQueue::default();
+ let q = markup5ever::buffer_queue::BufferQueue::default();
q.push_front(input);
q
};
diff --git a/rcdom/tests/html-tokenizer.rs b/rcdom/tests/html-tokenizer.rs
index 2102c98a..cbf56df6 100644
--- a/rcdom/tests/html-tokenizer.rs
+++ b/rcdom/tests/html-tokenizer.rs
@@ -14,12 +14,12 @@ use html5ever::tendril::*;
use html5ever::tokenizer::states::{
CdataSection, Data, Plaintext, RawData, Rawtext, Rcdata, ScriptData,
};
-use html5ever::tokenizer::BufferQueue;
use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token};
use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag};
use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
-use html5ever::{ns, Attribute, LocalName, QualName};
+use html5ever::{namespace_url, ns, Attribute, LocalName, QualName};
+use markup5ever::buffer_queue::BufferQueue;
use serde_json::{Map, Value};
use std::cell::RefCell;
use std::ffi::OsStr;
diff --git a/xml5ever/src/tokenizer/mod.rs b/xml5ever/src/tokenizer/mod.rs
index 4f7d1a48..3fb4eed0 100644
--- a/xml5ever/src/tokenizer/mod.rs
+++ b/xml5ever/src/tokenizer/mod.rs
@@ -23,13 +23,17 @@ use crate::tendril::StrTendril;
use crate::{buffer_queue, Attribute, QualName, SmallCharSet};
use log::debug;
use mac::{format_if, unwrap_or_return};
-use markup5ever::{local_name, namespace_prefix, ns, small_char_set, TokenizerResult};
+use markup5ever::{
+ buffer_queue::BufferQueue, local_name, namespace_prefix, namespace_url, ns, small_char_set,
+ InputSink, InputSinkResult, TokenizerResult,
+};
use std::borrow::Cow::{self, Borrowed};
use std::cell::{Cell, RefCell, RefMut};
use std::collections::BTreeMap;
+use std::iter;
use std::mem::replace;
-use self::buffer_queue::{BufferQueue, FromSet, NotFromSet, SetResult};
+use self::buffer_queue::{FromSet, NotFromSet, SetResult};
use self::char_ref::{CharRef, CharRefTokenizer};
use self::qname::QualNameTokenizer;
use self::states::XmlState;
@@ -1297,6 +1301,20 @@ impl XmlTokenizer {
}
}
+impl InputSink for XmlTokenizer
+where
+ Sink: TokenSink,
+{
+ type Handle = Sink::Handle;
+
+ fn feed<'a>(
+ &'a self,
+ input: &'a BufferQueue,
+ ) -> impl Iterator
- > + 'a {
+ iter::from_fn(|| self.feed(input).into())
+ }
+}
+
#[cfg(test)]
mod test {