Skip to content

Commit 9aab922

Browse files
committed
Add a usedEncoding method to HTML5Parser, fix html5lib#121
1 parent e269a2f commit 9aab922

File tree

3 files changed

+40
-1
lines changed

3 files changed

+40
-1
lines changed

html5lib/html5parser.py

+10
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,16 @@ def reset(self):
129129

130130
self.framesetOK = True
131131

132+
def usedEncoding(self):
    """Return the name of the character encoding
    that was used to decode the input stream,
    or :obj:`None` if that is not determined yet.

    The value is obtained from ``self.tokenizer.stream.usedEncoding()``.

    :returns: whatever the stream's ``usedEncoding()`` reports, or
        :obj:`None` while this object has no ``tokenizer`` attribute.
    """
    # ``tokenizer`` may not have been assigned on this object yet; in that
    # case there is no stream to ask, so report "not determined".
    if not hasattr(self, 'tokenizer'):
        return None
    return self.tokenizer.stream.usedEncoding()
141+
132142
def isHTMLIntegrationPoint(self, element):
133143
if (element.name == "annotation-xml" and
134144
element.namespace == namespaces["mathml"]):

html5lib/inputstream.py

+6
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,9 @@ def __init__(self, source):
175175

176176
self.reset()
177177

178+
def usedEncoding(self):
    """Return :obj:`None`, since no encoding is involved for Unicode input.

    :returns: always :obj:`None`.
    """
    return None
180+
178181
def reset(self):
179182
self.chunk = ""
180183
self.chunkSize = 0
@@ -413,6 +416,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
413416
# Call superclass
414417
self.reset()
415418

419+
def usedEncoding(self):
    """Return the first item of ``self.charEncoding``.

    This is the same value that ``reset`` passes to ``codecs.getreader``
    when it builds the decoding reader over the raw stream.

    :returns: ``self.charEncoding[0]``.
    """
    return self.charEncoding[0]
421+
416422
def reset(self):
417423
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
418424
'replace')

html5lib/tests/test_encoding.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,35 @@ def test_codec_name_d(self):
2626
self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
2727

2828

29+
def test_unicode_input_encoding():
    """Check ``HTMLParser.usedEncoding()`` before and after ``parse``.

    Three scenarios are exercised, each on a fresh parser that must report
    :obj:`None` before anything has been parsed.
    """
    # Byte input: the encoding named by <meta charset> is reported.
    p = HTMLParser()
    assert p.usedEncoding() is None
    p.parse(b'<meta charset=latin2>', useChardet=False)
    assert p.usedEncoding() == 'iso8859-2'

    # Unicode input: the <meta charset> is not reflected; still None.
    p = HTMLParser()
    assert p.usedEncoding() is None
    p.parse('<meta charset=latin2>')
    assert p.usedEncoding() is None

    # Unicode input with an explicit encoding: parse must raise TypeError,
    # and the parser must still report None afterwards.
    p = HTMLParser()
    assert p.usedEncoding() is None
    try:
        p.parse('<meta charset=latin2>', encoding='latin3')
    except TypeError:
        pass
    else:
        assert 0, 'Expected TypeError'
    assert p.usedEncoding() is None
49+
50+
2951
def runParserEncodingTest(data, encoding):
3052
p = HTMLParser()
53+
assert p.usedEncoding() is None
3154
p.parse(data, useChardet=False)
3255
encoding = encoding.lower().decode("ascii")
3356

34-
assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
57+
assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding())
3558

3659

3760
def runPreScanEncodingTest(data, encoding):

0 commit comments

Comments
 (0)