Skip to content

Commit 64f3217

Browse files
committed
Fix html5lib#120: introduce keyword arguments for encodings by source
1 parent 244a6eb commit 64f3217

File tree

7 files changed

+79
-58
lines changed

7 files changed

+79
-58
lines changed

CHANGES.rst

+4
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ Released on XXX
4646

4747
* **Drop support of charade, now that chardet is supported once more.**
4848

49+
* **Replace the charset keyword argument on parse and related methods
50+
with a set of keyword arguments: override_encoding, transport_encoding,
51+
same_origin_parent_encoding, default_encoding.**
52+
4953

5054
0.9999999/1.0b8
5155
~~~~~~~~~~~~~~~

README.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ pass into html5lib as follows:
5151
import html5lib
5252
5353
with closing(urlopen("http://example.com/")) as f:
54-
document = html5lib.parse(f, encoding=f.info().getparam("charset"))
54+
document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
5555
5656
When using with ``urllib.request`` (Python 3), the charset from HTTP
5757
should be passed into html5lib as follows:
@@ -62,7 +62,7 @@ should be pass into html5lib as follows:
6262
import html5lib
6363
6464
with urlopen("http://example.com/") as f:
65-
document = html5lib.parse(f, encoding=f.info().get_content_charset())
65+
document = html5lib.parse(f, transport_encoding=f.info().get_content_charset())
6666
6767
To have more control over the parser, create a parser object explicitly.
6868
For instance, to make the parser raise exceptions on parse errors, use:

html5lib/html5parser.py

+6-14
Original file line numberDiff line numberDiff line change
@@ -79,15 +79,12 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa
7979
self.phases = dict([(name, cls(self, self.tree)) for name, cls in
8080
getPhases(debug).items()])
8181

82-
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
83-
useChardet=True, scripting=False, **kwargs):
82+
def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
8483

8584
self.innerHTMLMode = innerHTML
8685
self.container = container
8786
self.scripting = scripting
88-
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding=encoding,
89-
useChardet=useChardet,
90-
parser=self, **kwargs)
87+
self.tokenizer = tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
9188
self.reset()
9289

9390
try:
@@ -222,8 +219,7 @@ def normalizedTokens(self):
222219
for token in self.tokenizer:
223220
yield self.normalizeToken(token)
224221

225-
def parse(self, stream, encoding=None,
226-
useChardet=True, scripting=False):
222+
def parse(self, stream, *args, **kwargs):
227223
"""Parse a HTML document into a well-formed tree
228224
229225
stream - a filelike object or string containing the HTML to be parsed
@@ -235,13 +231,10 @@ def parse(self, stream, encoding=None,
235231
236232
scripting - treat noscript elements as if javascript was turned on
237233
"""
238-
self._parse(stream, innerHTML=False, encoding=encoding,
239-
useChardet=useChardet, scripting=scripting)
234+
self._parse(stream, False, None, *args, **kwargs)
240235
return self.tree.getDocument()
241236

242-
def parseFragment(self, stream, container="div", encoding=None,
243-
useChardet=True, scripting=False):
244-
# pylint:disable=unused-argument
237+
def parseFragment(self, stream, *args, **kwargs):
245238
"""Parse a HTML fragment into a well-formed tree fragment
246239
247240
container - name of the element we're setting the innerHTML property
@@ -256,8 +249,7 @@ def parseFragment(self, stream, container="div", encoding=None,
256249
257250
scripting - treat noscript elements as if javascript was turned on
258251
"""
259-
self._parse(stream, True, container=container,
260-
encoding=encoding, scripting=scripting)
252+
self._parse(stream, True, *args, **kwargs)
261253
return self.tree.getFragment()
262254

263255
def parseError(self, errorcode="XXX-undefined-error", datavars=None):

html5lib/inputstream.py

+60-35
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _readFromBuffer(self, bytes):
128128
return b"".join(rv)
129129

130130

131-
def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
131+
def HTMLInputStream(source, override_encoding=None, **kwargs):
132132
# Work around Python bug #20007: read(0) closes the connection.
133133
# http://bugs.python.org/issue20007
134134
if (isinstance(source, http_client.HTTPResponse) or
@@ -142,12 +142,12 @@ def HTMLInputStream(source, encoding=None, parseMeta=True, chardet=True):
142142
isUnicode = isinstance(source, text_type)
143143

144144
if isUnicode:
145-
if encoding is not None:
146-
raise TypeError("Cannot explicitly set an encoding with a unicode string")
145+
if override_encoding is not None:
146+
raise TypeError("Cannot set an override encoding with a unicode input")
147147

148148
return HTMLUnicodeInputStream(source)
149149
else:
150-
return HTMLBinaryInputStream(source, encoding, parseMeta, chardet)
150+
return HTMLBinaryInputStream(source, override_encoding=override_encoding, **kwargs)
151151

152152

153153
class HTMLUnicodeInputStream(object):
@@ -173,8 +173,6 @@ def __init__(self, source):
173173
regardless of any BOM or later declaration (such as in a meta
174174
element)
175175
176-
parseMeta - Look for a <meta> element containing encoding information
177-
178176
"""
179177

180178
if not utils.supports_lone_surrogates:
@@ -390,7 +388,9 @@ class HTMLBinaryInputStream(HTMLUnicodeInputStream):
390388
391389
"""
392390

393-
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
391+
def __init__(self, source, override_encoding=None, transport_encoding=None,
392+
same_origin_parent_encoding=None, likely_encoding=None,
393+
default_encoding="windows-1252", useChardet=True):
394394
"""Initialises the HTMLInputStream.
395395
396396
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -403,30 +403,29 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
403403
regardless of any BOM or later declaration (such as in a meta
404404
element)
405405
406-
parseMeta - Look for a <meta> element containing encoding information
407-
408406
"""
409407
# Raw Stream - for unicode objects this will encode to utf-8 and set
410408
# self.charEncoding as appropriate
411409
self.rawStream = self.openStream(source)
412410

413411
HTMLUnicodeInputStream.__init__(self, self.rawStream)
414412

415-
self.charEncoding = (lookupEncoding(encoding), "certain")
416-
417413
# Encoding Information
418414
# Number of bytes to use when looking for a meta element with
419415
# encoding information
420416
self.numBytesMeta = 1024
421417
# Number of bytes to use when detecting encoding using chardet
422418
self.numBytesChardet = 100
423-
# Encoding to use if no other information can be found
424-
self.defaultEncoding = "windows-1252"
419+
# Things from args
420+
self.override_encoding = override_encoding
421+
self.transport_encoding = transport_encoding
422+
self.same_origin_parent_encoding = same_origin_parent_encoding
423+
self.likely_encoding = likely_encoding
424+
self.default_encoding = default_encoding
425425

426-
# Detect encoding iff no explicit "transport level" encoding is supplied
427-
if (self.charEncoding[0] is None):
428-
self.charEncoding = self.detectEncoding(parseMeta, chardet)
429-
assert self.charEncoding[0] is not None
426+
# Determine encoding
427+
self.charEncoding = self.determineEncoding(useChardet)
428+
assert self.charEncoding[0] is not None
430429

431430
# Call superclass
432431
self.reset()
@@ -454,21 +453,45 @@ def openStream(self, source):
454453

455454
return stream
456455

457-
def detectEncoding(self, parseMeta=True, chardet=True):
458-
# First look for a BOM
456+
def determineEncoding(self, chardet=True):
457+
# BOMs take precedence over everything
459458
# This will also read past the BOM if present
460-
encoding = self.detectBOM()
461-
confidence = "certain"
462-
# If there is no BOM need to look for meta elements with encoding
463-
# information
464-
if encoding is None and parseMeta:
465-
encoding = self.detectEncodingMeta()
466-
confidence = "tentative"
459+
charEncoding = self.detectBOM(), "certain"
460+
if charEncoding[0] is not None:
461+
return charEncoding
462+
463+
# If we've been overridden, we've been overridden
464+
charEncoding = lookupEncoding(self.override_encoding), "certain"
465+
if charEncoding[0] is not None:
466+
return charEncoding
467+
468+
# Now check the transport layer
469+
charEncoding = lookupEncoding(self.transport_encoding), "certain"
470+
if charEncoding[0] is not None:
471+
return charEncoding
472+
473+
# Look for meta elements with encoding information
474+
charEncoding = self.detectEncodingMeta(), "tentative"
475+
if charEncoding[0] is not None:
476+
return charEncoding
477+
478+
# Parent document encoding
479+
charEncoding = lookupEncoding(self.same_origin_parent_encoding), "tentative"
480+
if charEncoding[0] is not None and not charEncoding[0].name.startswith("utf-16"):
481+
return charEncoding
482+
483+
# "likely" encoding
484+
charEncoding = lookupEncoding(self.likely_encoding), "tentative"
485+
if charEncoding[0] is not None:
486+
return charEncoding
487+
467488
# Guess with chardet, if available
468-
if encoding is None and chardet:
469-
confidence = "tentative"
489+
if chardet:
470490
try:
471491
from chardet.universaldetector import UniversalDetector
492+
except ImportError:
493+
pass
494+
else:
472495
buffers = []
473496
detector = UniversalDetector()
474497
while not detector.done:
@@ -481,14 +504,16 @@ def detectEncoding(self, parseMeta=True, chardet=True):
481504
detector.close()
482505
encoding = lookupEncoding(detector.result['encoding'])
483506
self.rawStream.seek(0)
484-
except ImportError:
485-
pass
486-
# If all else fails use the default encoding
487-
if encoding is None:
488-
confidence = "tentative"
489-
encoding = lookupEncoding(self.defaultEncoding)
507+
if encoding is not None:
508+
return encoding, "tentative"
509+
510+
# Try the default encoding
511+
charEncoding = lookupEncoding(self.default_encoding), "tentative"
512+
if charEncoding[0] is not None:
513+
return charEncoding
490514

491-
return encoding, confidence
515+
# Fallback to html5lib's default if even that hasn't worked
516+
return lookupEncoding("windows-1252"), "tentative"
492517

493518
def changeEncoding(self, newEncoding):
494519
assert self.charEncoding[1] != "certain"

html5lib/tests/test_encoding.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ def test_basic_prescan_length():
1111
pad = 1024 - len(data) + 1
1212
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
1313
assert len(data) == 1024 # Sanity
14-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
14+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
1515
assert 'utf-8' == stream.charEncoding[0].name
1616

1717

@@ -20,7 +20,7 @@ def test_parser_reparse():
2020
pad = 10240 - len(data) + 1
2121
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
2222
assert len(data) == 10240 # Sanity
23-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
23+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
2424
assert 'windows-1252' == stream.charEncoding[0].name
2525
p = HTMLParser(namespaceHTMLElements=False)
2626
doc = p.parse(data, useChardet=False)
@@ -38,7 +38,7 @@ def runParserEncodingTest(data, encoding):
3838

3939

4040
def runPreScanEncodingTest(data, encoding):
41-
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
41+
stream = inputstream.HTMLBinaryInputStream(data, useChardet=False)
4242
encoding = encoding.lower().decode("ascii")
4343

4444
# Very crude way to ignore irrelevant tests

html5lib/tests/test_stream.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,13 @@ class HTMLBinaryInputStreamShortChunk(HTMLBinaryInputStream):
9999

100100

101101
def test_char_ascii():
102-
stream = HTMLInputStream(b"'", encoding='ascii')
102+
stream = HTMLInputStream(b"'", override_encoding='ascii')
103103
assert stream.charEncoding[0].name == 'windows-1252'
104104
assert stream.char() == "'"
105105

106106

107107
def test_char_utf8():
108-
stream = HTMLInputStream('\u2018'.encode('utf-8'), encoding='utf-8')
108+
stream = HTMLInputStream('\u2018'.encode('utf-8'), override_encoding='utf-8')
109109
assert stream.charEncoding[0].name == 'utf-8'
110110
assert stream.char() == '\u2018'
111111

html5lib/tokenizer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ class HTMLTokenizer(object):
3131
Points to HTMLInputStream object.
3232
"""
3333

34-
def __init__(self, stream, encoding=None, useChardet=True, parser=None):
34+
def __init__(self, stream, parser=None, **kwargs):
3535

36-
self.stream = HTMLInputStream(stream, encoding, True, useChardet)
36+
self.stream = HTMLInputStream(stream, **kwargs)
3737
self.parser = parser
3838

3939
# Setup the initial tokenizer state

0 commit comments

Comments
 (0)