Skip to content

Update encoding detection #257

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 11, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ Released on XXX

* **Drop support of charade, now that chardet is supported once more.**

* **Replace the charset keyword argument on parse and related methods
with a set of keyword arguments: override_encoding, transport_encoding,
same_origin_parent_encoding, likely_encoding, and default_encoding.**


0.9999999/1.0b8
~~~~~~~~~~~~~~~
Expand Down
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pass into html5lib as follows:
import html5lib

with closing(urlopen("http://example.com/")) as f:
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))

When using ``urllib.request`` (Python 3), the charset from HTTP
should be passed into html5lib as follows:
Expand All @@ -62,7 +62,7 @@ should be pass into html5lib as follows:
import html5lib

with urlopen("http://example.com/") as f:
document = html5lib.parse(f, encoding=f.info().get_content_charset())
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())

To have more control over the parser, create a parser object explicitly.
For instance, to make the parser raise exceptions on parse errors, use:
Expand Down
39 changes: 11 additions & 28 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,17 @@
)


def parse(doc, treebuilder="etree", encoding=None,
namespaceHTMLElements=True, scripting=False):
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse a string or file-like object into a tree"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, encoding=encoding, scripting=scripting)
return p.parse(doc, **kwargs)


def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
namespaceHTMLElements=True, scripting=False):
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting)
return p.parseFragment(doc, container=container, **kwargs)


def method_decorator_metaclass(function):
Expand All @@ -59,18 +57,13 @@ class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""

def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
strict=False, namespaceHTMLElements=True, debug=False):
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
"""
strict - raise an exception when a parse error is encountered

tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)

tokenizer - a class that provides a stream of tokens to the treebuilder.
This may be replaced for e.g. a sanitizer which converts some tags to
text
"""

# Raise an exception on the first error encountered
Expand All @@ -79,22 +72,17 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
if tree is None:
tree = treebuilders.getTreeBuilder("etree")
self.tree = tree(namespaceHTMLElements)
self.tokenizer_class = tokenizer
self.errors = []

self.phases = dict([(name, cls(self, self.tree)) for name, cls in
getPhases(debug).items()])

def _parse(self, stream, innerHTML=False, container="div", encoding=None,
parseMeta=True, useChardet=True, scripting=False, **kwargs):
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):

self.innerHTMLMode = innerHTML
self.container = container
self.scripting = scripting
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
parseMeta=parseMeta,
useChardet=useChardet,
parser=self, **kwargs)
self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
self.reset()

try:
Expand Down Expand Up @@ -232,8 +220,7 @@ def normalizedTokens(self):
for token in self.tokenizer:
yield self.normalizeToken(token)

def parse(self, stream, encoding=None, parseMeta=True,
useChardet=True, scripting=False):
def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree

stream - a filelike object or string containing the HTML to be parsed
Expand All @@ -245,13 +232,10 @@ def parse(self, stream, encoding=None, parseMeta=True,

scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, innerHTML=False, encoding=encoding,
parseMeta=parseMeta, useChardet=useChardet, scripting=scripting)
self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()

def parseFragment(self, stream, container="div", encoding=None,
parseMeta=False, useChardet=True, scripting=False):
# pylint:disable=unused-argument
def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment

container - name of the element we're setting the innerHTML property
Expand All @@ -266,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,

scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, True, container=container,
encoding=encoding, scripting=scripting)
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()

def parseError(self, errorcode="XXX-undefined-error", datavars=None):
Expand Down
98 changes: 62 additions & 36 deletions html5lib/inputstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
return b"".join(rv)


def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
def HTMLInputStream(source, **kwargs):
# Work around Python bug #20007: read(0) closes the connection.
# http://bugs.python.org/issue20007
if (isinstance(source, http_client.HTTPResponse) or
Expand All @@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
isUnicode = isinstance(source, text_type)

if isUnicode:
if encoding is not None:
raise TypeError("Cannot explicitly set an encoding with a unicode string")
encodings = [x for x in kwargs if x.endswith("_encoding")]
if encodings:
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)

return HTMLUnicodeInputStream(source)
return HTMLUnicodeInputStream(source, **kwargs)
else:
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
return HTMLBinaryInputStream(source, **kwargs)


class HTMLUnicodeInputStream(object):
Expand All @@ -173,8 +174,6 @@ def __init__(self, source):
regardless of any BOM or later declaration (such as in a meta
element)

parseMeta - Look for a <meta> element containing encoding information

"""

if not utils.supports_lone_surrogates:
Expand Down Expand Up @@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):

"""

def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
def __init__(self, source, override_encoding=None, transport_encoding=None,
same_origin_parent_encoding=None, likely_encoding=None,
default_encoding="windows-1252", useChardet=True):
"""Initialises the HTMLInputStream.

HTMLInputStream(source, [encoding]) -> Normalized stream from source
Expand All @@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
regardless of any BOM or later declaration (such as in a meta
element)

parseMeta - Look for a <meta> element containing encoding information

"""
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)

HTMLUnicodeInputStream.__init__(self, self.rawStream)

self.charEncoding = (lookupEncoding(encoding), "certain")

# Encoding Information
# Number of bytes to use when looking for a meta element with
# encoding information
self.numBytesMeta = 1024
# Number of bytes to use when detecting encoding using chardet
self.numBytesChardet = 100
# Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# Things from args
self.override_encoding = override_encoding
self.transport_encoding = transport_encoding
self.same_origin_parent_encoding = same_origin_parent_encoding
self.likely_encoding = likely_encoding
self.default_encoding = default_encoding

# Detect encoding iff no explicit "transport level" encoding is supplied
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
assert self.charEncoding[0] is not None
# Determine encoding
self.charEncoding = self.determineEncoding(useChardet)
assert self.charEncoding[0] is not None

# Call superclass
self.reset()
Expand Down Expand Up @@ -454,21 +454,45 @@ def openStream(self, source):

return stream

def detectEncoding(self, parseMeta=True, chardet=True):
# First look for a BOM
def determineEncoding(self, chardet=True):
# BOMs take precedence over everything
# This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
# If there is no BOM need to look for meta elements with encoding
# information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
charEncoding = self.detectBOM(), "certain"
if charEncoding[0] is not None:
return charEncoding

# If we've been overridden, we've been overridden
charEncoding = lookupEncoding(self.override_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding

# Now check the transport layer
charEncoding = lookupEncoding(self.transport_encoding), "certain"
if charEncoding[0] is not None:
return charEncoding

# Look for meta elements with encoding information
charEncoding = self.detectEncodingMeta(), "tentative"
if charEncoding[0] is not None:
return charEncoding

# Parent document encoding
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
return charEncoding

# "likely" encoding
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding

# Guess with chardet, if available
if encoding is None and chardet:
confidence = "tentative"
if chardet:
try:
from chardet.universaldetector import UniversalDetector
except ImportError:
pass
else:
buffers = []
detector = UniversalDetector()
while not detector.done:
Expand All @@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
detector.close()
encoding = lookupEncoding(detector.result['encoding'])
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence = "tentative"
encoding = lookupEncoding(self.defaultEncoding)
if encoding is not None:
return encoding, "tentative"

# Try the default encoding
charEncoding = lookupEncoding(self.default_encoding), "tentative"
if charEncoding[0] is not None:
return charEncoding

return encoding, confidence
# Fallback to html5lib's default if even that hasn't worked
return lookupEncoding("windows-1252"), "tentative"

def changeEncoding(self, newEncoding):
assert self.charEncoding[1] != "certain"
Expand Down
54 changes: 51 additions & 3 deletions html5lib/tests/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import os

import pytest

from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
from html5lib import HTMLParser, inputstream

Expand All @@ -11,7 +13,7 @@ def test_basic_prescan_length():
pad = 1024 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 1024 # Sanity
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'utf-8' == stream.charEncoding[0].name


Expand All @@ -20,14 +22,59 @@ def test_parser_reparse():
pad = 10240 - len(data) + 1
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
assert len(data) == 10240 # Sanity
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
assert 'windows-1252' == stream.charEncoding[0].name
p = HTMLParser(namespaceHTMLElements=False)
doc = p.parse(data, useChardet=False)
assert 'utf-8' == p.documentEncoding
assert doc.find(".//title").text == "Caf\u00E9"


@pytest.mark.parametrize("expected,data,kwargs", [
("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
("windows-1252", b"", {}),
])
def test_parser_args(expected, data, kwargs):
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
assert expected == stream.charEncoding[0].name
p = HTMLParser()
p.parse(data, useChardet=False, **kwargs)
assert expected == p.documentEncoding


@pytest.mark.parametrize("kwargs", [
{"override_encoding": "iso-8859-2"},
{"override_encoding": None},
{"transport_encoding": "iso-8859-2"},
{"transport_encoding": None},
{"same_origin_parent_encoding": "iso-8859-2"},
{"same_origin_parent_encoding": None},
{"likely_encoding": "iso-8859-2"},
{"likely_encoding": None},
{"default_encoding": "iso-8859-2"},
{"default_encoding": None},
{"foo_encoding": "iso-8859-2"},
{"foo_encoding": None},
])
def test_parser_args_raises(kwargs):
with pytest.raises(TypeError) as exc_info:
p = HTMLParser()
p.parse("", useChardet=False, **kwargs)
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")


def runParserEncodingTest(data, encoding):
p = HTMLParser()
assert p.documentEncoding is None
Expand All @@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):


def runPreScanEncodingTest(data, encoding):
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
encoding = encoding.lower().decode("ascii")

# Very crude way to ignore irrelevant tests
Expand All @@ -55,6 +102,7 @@ def test_encoding():
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])


# pylint:disable=wrong-import-position
try:
import chardet # noqa
Expand Down
Loading