Skip to content

Commit 911cf45

Browse files
committed
Merge pull request #241 from gsnedders/encoding_tests
Get encoding reparsing actually working; r=nobody!
2 parents 9dc49f6 + b0ae0c2 commit 911cf45

File tree

3 files changed

+28
-7
lines changed

3 files changed

+28
-7
lines changed

html5lib/html5parser.py

+5 -6
Original file line number | Diff line number | Diff line change
@@ -89,12 +89,11 @@ def _parse(self, stream, innerHTML=False, container="div",
8989
parser=self, **kwargs)
9090
self.reset()
9191

92-
while True:
93-
try:
94-
self.mainLoop()
95-
break
96-
except ReparseException:
97-
self.reset()
92+
try:
93+
self.mainLoop()
94+
except ReparseException:
95+
self.reset()
96+
self.mainLoop()
9897

9998
def reset(self):
10099
self.tree.reset()

html5lib/inputstream.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -509,8 +509,8 @@ def changeEncoding(self, newEncoding):
509509
self.charEncoding = (self.charEncoding[0], "certain")
510510
else:
511511
self.rawStream.seek(0)
512-
self.reset()
513512
self.charEncoding = (newEncoding, "certain")
513+
self.reset()
514514
raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding))
515515

516516
def detectBOM(self):

html5lib/tests/test_encoding.py

+22
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,28 @@
1212
from html5lib import HTMLParser, inputstream
1313

1414

15+
def test_basic_prescan_length():
16+
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
17+
pad = 1024 - len(data) + 1
18+
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
19+
assert len(data) == 1024 # Sanity
20+
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
21+
assert 'utf-8' == stream.charEncoding[0].name
22+
23+
24+
def test_parser_reparse():
25+
data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
26+
pad = 10240 - len(data) + 1
27+
data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
28+
assert len(data) == 10240 # Sanity
29+
stream = inputstream.HTMLBinaryInputStream(data, chardet=False)
30+
assert 'windows-1252' == stream.charEncoding[0].name
31+
p = HTMLParser(namespaceHTMLElements=False)
32+
doc = p.parse(data, useChardet=False)
33+
assert 'utf-8' == p.documentEncoding
34+
assert doc.find(".//title").text == "Caf\u00E9"
35+
36+
1537
def runParserEncodingTest(data, encoding):
1638
p = HTMLParser()
1739
assert p.documentEncoding is None

0 commit comments

Comments
 (0)