Skip to content

Commit 699276b

Browse files
authored
Merge pull request #257 from gsnedders/det_encoding
Update encoding detection; r=nobody!
2 parents dce9d62 + fc9f63b commit 699276b

File tree

7 files changed

+137
-83
lines changed

7 files changed

+137
-83
lines changed

Diff for: CHANGES.rst

+4
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Released on XXX
4646

4747
* **Drop support of charade, now that chardet is supported once more.**
4848

49+
* **Replace the encoding keyword argument on parse and related methods
50+
with a set of keyword arguments: override_encoding, transport_encoding,
51+
same_origin_parent_encoding, likely_encoding, and default_encoding.**
52+
4953

5054
0.9999999/1.0b8
5155
~~~~~~~~~~~~~~~

Diff for: README.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pass into html5lib as follows:
5151
import html5lib
5252
5353
with closing(urlopen("http://example.com/")) as f:
54-
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
54+
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
5555
5656
When using with ``urllib.request`` (Python 3), the charset from HTTP
5757
should be passed into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
6262
import html5lib
6363
6464
with urlopen("http://example.com/") as f:
65-
document = html5lib.parse(f, encoding=f.info().get_content_charset())
65+
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
6666
6767
To have more control over the parser, create a parser object explicitly.
6868
For instance, to make the parser raise exceptions on parse errors, use:

Diff for: html5lib/html5parser.py

+11-28
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,17 @@
2828
)
2929

3030

31-
def parse(doc, treebuilder="etree", encoding=None,
32-
namespaceHTMLElements=True, scripting=False):
31+
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
3332
"""Parse a string or file-like object into a tree"""
3433
tb = treebuilders.getTreeBuilder(treebuilder)
3534
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
36-
return p.parse(doc, encoding=encoding, scripting=scripting)
35+
return p.parse(doc, **kwargs)
3736

3837

39-
def parseFragment(doc, container="div", treebuilder="etree", encoding=None,
40-
namespaceHTMLElements=True, scripting=False):
38+
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
4139
tb = treebuilders.getTreeBuilder(treebuilder)
4240
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
43-
return p.parseFragment(doc, container=container, encoding=encoding, scripting=scripting)
41+
return p.parseFragment(doc, container=container, **kwargs)
4442

4543

4644
def method_decorator_metaclass(function):
@@ -59,18 +57,13 @@ class HTMLParser(object):
5957
"""HTML parser. Generates a tree structure from a stream of (possibly
6058
malformed) HTML"""
6159

62-
def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
63-
strict=False, namespaceHTMLElements=True, debug=False):
60+
def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
6461
"""
6562
strict - raise an exception when a parse error is encountered
6663
6764
tree - a treebuilder class controlling the type of tree that will be
6865
returned. Built in treebuilders can be accessed through
6966
html5lib.treebuilders.getTreeBuilder(treeType)
70-
71-
tokenizer - a class that provides a stream of tokens to the treebuilder.
72-
This may be replaced for e.g. a sanitizer which converts some tags to
73-
text
7467
"""
7568

7669
# Raise an exception on the first error encountered
@@ -79,22 +72,17 @@ def __init__(self, tree=None, tokenizer=tokenizer.HTMLTokenizer,
7972
if tree is None:
8073
tree = treebuilders.getTreeBuilder("etree")
8174
self.tree = tree(namespaceHTMLElements)
82-
self.tokenizer_class = tokenizer
8375
self.errors = []
8476

8577
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
8678
getPhases(debug).items()])
8779

88-
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
89-
parseMeta=True, useChardet=True, scripting=False, **kwargs):
80+
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
9081

9182
self.innerHTMLMode = innerHTML
9283
self.container = container
9384
self.scripting = scripting
94-
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
95-
parseMeta=parseMeta,
96-
useChardet=useChardet,
97-
parser=self, **kwargs)
85+
self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
9886
self.reset()
9987

10088
try:
@@ -232,8 +220,7 @@ def normalizedTokens(self):
232220
for token in self.tokenizer:
233221
yield self.normalizeToken(token)
234222

235-
def parse(self, stream, encoding=None, parseMeta=True,
236-
useChardet=True, scripting=False):
223+
def parse(self, stream, *args, **kwargs):
237224
"""Parse a HTML document into a well-formed tree
238225
239226
stream - a filelike object or string containing the HTML to be parsed
@@ -245,13 +232,10 @@ def parse(self, stream, encoding=None, parseMeta=True,
245232
246233
scripting - treat noscript elements as if javascript was turned on
247234
"""
248-
self._parse(stream, innerHTML=False, encoding=encoding,
249-
parseMeta=parseMeta, useChardet=useChardet, scripting=scripting)
235+
self._parse(stream, False, None, *args, **kwargs)
250236
return self.tree.getDocument()
251237

252-
def parseFragment(self, stream, container="div", encoding=None,
253-
parseMeta=False, useChardet=True, scripting=False):
254-
# pylint:disable=unused-argument
238+
def parseFragment(self, stream, *args, **kwargs):
255239
"""Parse a HTML fragment into a well-formed tree fragment
256240
257241
container - name of the element we're setting the innerHTML property
@@ -266,8 +250,7 @@ def parseFragment(self, stream, container="div", encoding=None,
266250
267251
scripting - treat noscript elements as if javascript was turned on
268252
"""
269-
self._parse(stream, True, container=container,
270-
encoding=encoding, scripting=scripting)
253+
self._parse(stream, True, *args, **kwargs)
271254
return self.tree.getFragment()
272255

273256
def parseError(self, errorcode="XXX-undefined-error", datavars=None):

Diff for: html5lib/inputstream.py

+62-36
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128
return b"".join(rv)
129129

130130

131-
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
131+
def HTMLInputStream(source, **kwargs):
132132
# Work around Python bug #20007: read(0) closes the connection.
133133
# http://bugs.python.org/issue20007
134134
if (isinstance(source, http_client.HTTPResponse) or
@@ -142,12 +142,13 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142
isUnicode = isinstance(source, text_type)
143143

144144
if isUnicode:
145-
if encoding is not None:
146-
raise TypeError("Cannot explicitly set an encoding with a unicode string")
145+
encodings = [x for x in kwargs if x.endswith("_encoding")]
146+
if encodings:
147+
raise TypeError("Cannot set an encoding with a unicode input, set %r" % encodings)
147148

148-
return HTMLUnicodeInputStream(source)
149+
return HTMLUnicodeInputStream(source, **kwargs)
149150
else:
150-
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
151+
return HTMLBinaryInputStream(source, **kwargs)
151152

152153

153154
class HTMLUnicodeInputStream(object):
@@ -173,8 +174,6 @@ def __init__(self, source):
173174
regardless of any BOM or later declaration (such as in a meta
174175
element)
175176
176-
parseMeta - Look for a <meta> element containing encoding information
177-
178177
"""
179178

180179
if not utils.supports_lone_surrogates:
@@ -390,7 +389,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390389
391390
"""
392391

393-
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
392+
def __init__(self, source, override_encoding=None, transport_encoding=None,
393+
same_origin_parent_encoding=None, likely_encoding=None,
394+
default_encoding="windows-1252", useChardet=True):
394395
"""Initialises the HTMLInputStream.
395396
396397
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +404,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403404
regardless of any BOM or later declaration (such as in a meta
404405
element)
405406
406-
parseMeta - Look for a <meta> element containing encoding information
407-
408407
"""
409408
# Raw Stream - for unicode objects this will encode to utf-8 and set
410409
# self.charEncoding as appropriate
411410
self.rawStream = self.openStream(source)
412411

413412
HTMLUnicodeInputStream.__init__(self, self.rawStream)
414413

415-
self.charEncoding = (lookupEncoding(encoding), "certain")
416-
417414
# Encoding Information
418415
# Number of bytes to use when looking for a meta element with
419416
# encoding information
420417
self.numBytesMeta = 1024
421418
# Number of bytes to use when using detecting encoding using chardet
422419
self.numBytesChardet = 100
423-
# Encoding to use if no other information can be found
424-
self.defaultEncoding = "windows-1252"
420+
# Things from args
421+
self.override_encoding = override_encoding
422+
self.transport_encoding = transport_encoding
423+
self.same_origin_parent_encoding = same_origin_parent_encoding
424+
self.likely_encoding = likely_encoding
425+
self.default_encoding = default_encoding
425426

426-
# Detect encoding iff no explicit "transport level" encoding is supplied
427-
if (self.charEncoding[0] is None):
428-
self.charEncoding = self.detectEncoding(parseMeta, chardet)
429-
assert self.charEncoding[0] is not None
427+
# Determine encoding
428+
self.charEncoding = self.determineEncoding(useChardet)
429+
assert self.charEncoding[0] is not None
430430

431431
# Call superclass
432432
self.reset()
@@ -454,21 +454,45 @@ def openStream(self, source):
454454

455455
return stream
456456

457-
def detectEncoding(self, parseMeta=True, chardet=True):
458-
# First look for a BOM
457+
def determineEncoding(self, chardet=True):
458+
# BOMs take precedence over everything
459459
# This will also read past the BOM if present
460-
encoding = self.detectBOM()
461-
confidence = "certain"
462-
# If there is no BOM need to look for meta elements with encoding
463-
# information
464-
if encoding is None and parseMeta:
465-
encoding = self.detectEncodingMeta()
466-
confidence = "tentative"
460+
charEncoding = self.detectBOM(), "certain"
461+
if charEncoding[0] is not None:
462+
return charEncoding
463+
464+
# If we've been overriden, we've been overriden
465+
charEncoding = lookupEncoding(self.override_encoding), "certain"
466+
if charEncoding[0] is not None:
467+
return charEncoding
468+
469+
# Now check the transport layer
470+
charEncoding = lookupEncoding(self.transport_encoding), "certain"
471+
if charEncoding[0] is not None:
472+
return charEncoding
473+
474+
# Look for meta elements with encoding information
475+
charEncoding = self.detectEncodingMeta(), "tentative"
476+
if charEncoding[0] is not None:
477+
return charEncoding
478+
479+
# Parent document encoding
480+
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
481+
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
482+
return charEncoding
483+
484+
# "likely" encoding
485+
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
486+
if charEncoding[0] is not None:
487+
return charEncoding
488+
467489
# Guess with chardet, if available
468-
if encoding is None and chardet:
469-
confidence = "tentative"
490+
if chardet:
470491
try:
471492
from chardet.universaldetector import UniversalDetector
493+
except ImportError:
494+
pass
495+
else:
472496
buffers = []
473497
detector = UniversalDetector()
474498
while not detector.done:
@@ -481,14 +505,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481505
detector.close()
482506
encoding = lookupEncoding(detector.result['encoding'])
483507
self.rawStream.seek(0)
484-
except ImportError:
485-
pass
486-
# If all else fails use the default encoding
487-
if encoding is None:
488-
confidence = "tentative"
489-
encoding = lookupEncoding(self.defaultEncoding)
508+
if encoding is not None:
509+
return encoding, "tentative"
510+
511+
# Try the default encoding
512+
charEncoding = lookupEncoding(self.default_encoding), "tentative"
513+
if charEncoding[0] is not None:
514+
return charEncoding
490515

491-
return encoding, confidence
516+
# Fallback to html5lib's default if even that hasn't worked
517+
return lookupEncoding("windows-1252"), "tentative"
492518

493519
def changeEncoding(self, newEncoding):
494520
assert self.charEncoding[1] != "certain"

Diff for: html5lib/tests/test_encoding.py

+51-3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
import os
44

5+
import pytest
6+
57
from .support import get_data_files, test_dir, errorMessage, TestData as _TestData
68
from html5lib import HTMLParser, inputstream
79

@@ -11,7 +13,7 @@ def test_basic_prescan_length():
1113
pad = 1024 - len(data) + 1
1214
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
1315
assert len(data) == 1024 # Sanity
14-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
16+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
1517
assert 'utf-8' == stream.charEncoding[0].name
1618

1719

@@ -20,14 +22,59 @@ def test_parser_reparse():
2022
pad = 10240 - len(data) + 1
2123
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
2224
assert len(data) == 10240 # Sanity
23-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
25+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
2426
assert 'windows-1252' == stream.charEncoding[0].name
2527
p = HTMLParser(namespaceHTMLElements=False)
2628
doc = p.parse(data, useChardet=False)
2729
assert 'utf-8' == p.documentEncoding
2830
assert doc.find(".//title").text == "Caf\u00E9"
2931

3032

33+
@pytest.mark.parametrize("expected,data,kwargs", [
34+
("utf-16le", b"\xFF\xFE", {"override_encoding": "iso-8859-2"}),
35+
("utf-16be", b"\xFE\xFF", {"override_encoding": "iso-8859-2"}),
36+
("utf-8", b"\xEF\xBB\xBF", {"override_encoding": "iso-8859-2"}),
37+
("iso-8859-2", b"", {"override_encoding": "iso-8859-2", "transport_encoding": "iso-8859-3"}),
38+
("iso-8859-2", b"<meta charset=iso-8859-3>", {"transport_encoding": "iso-8859-2"}),
39+
("iso-8859-2", b"<meta charset=iso-8859-2>", {"same_origin_parent_encoding": "iso-8859-3"}),
40+
("iso-8859-2", b"", {"same_origin_parent_encoding": "iso-8859-2", "likely_encoding": "iso-8859-3"}),
41+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16", "likely_encoding": "iso-8859-2"}),
42+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16be", "likely_encoding": "iso-8859-2"}),
43+
("iso-8859-2", b"", {"same_origin_parent_encoding": "utf-16le", "likely_encoding": "iso-8859-2"}),
44+
("iso-8859-2", b"", {"likely_encoding": "iso-8859-2", "default_encoding": "iso-8859-3"}),
45+
("iso-8859-2", b"", {"default_encoding": "iso-8859-2"}),
46+
("windows-1252", b"", {"default_encoding": "totally-bogus-string"}),
47+
("windows-1252", b"", {}),
48+
])
49+
def test_parser_args(expected, data, kwargs):
50+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False, **kwargs)
51+
assert expected == stream.charEncoding[0].name
52+
p = HTMLParser()
53+
p.parse(data, useChardet=False, **kwargs)
54+
assert expected == p.documentEncoding
55+
56+
57+
@pytest.mark.parametrize("kwargs", [
58+
{"override_encoding": "iso-8859-2"},
59+
{"override_encoding": None},
60+
{"transport_encoding": "iso-8859-2"},
61+
{"transport_encoding": None},
62+
{"same_origin_parent_encoding": "iso-8859-2"},
63+
{"same_origin_parent_encoding": None},
64+
{"likely_encoding": "iso-8859-2"},
65+
{"likely_encoding": None},
66+
{"default_encoding": "iso-8859-2"},
67+
{"default_encoding": None},
68+
{"foo_encoding": "iso-8859-2"},
69+
{"foo_encoding": None},
70+
])
71+
def test_parser_args_raises(kwargs):
72+
with pytest.raises(TypeError) as exc_info:
73+
p = HTMLParser()
74+
p.parse("", useChardet=False, **kwargs)
75+
assert exc_info.value.args[0].startswith("Cannot set an encoding with a unicode input")
76+
77+
3178
def runParserEncodingTest(data, encoding):
3279
p = HTMLParser()
3380
assert p.documentEncoding is None
@@ -38,7 +85,7 @@ def runParserEncodingTest(data, encoding):
3885

3986

4087
def runPreScanEncodingTest(data, encoding):
41-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
88+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
4289
encoding = encoding.lower().decode("ascii")
4390

4491
# Very crude way to ignore irrelevant tests
@@ -55,6 +102,7 @@ def test_encoding():
55102
yield (runParserEncodingTest, test[b'data'], test[b'encoding'])
56103
yield (runPreScanEncodingTest, test[b'data'], test[b'encoding'])
57104

105+
58106
# pylint:disable=wrong-import-position
59107
try:
60108
import chardet # noqa

0 commit comments

Comments
 (0)