Add a usedEncoding method to HTMLParser, fix html5lib#121

SimonSapin · SimonSapin · commit 9aab9221cbc1 · 2013-11-30T23:01:43.000Z
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -129,6 +129,16 @@ def reset(self):
 
         self.framesetOK = True
 
+    def usedEncoding(self):
+        """Return the name of the character encoding that was used to
+        decode the input stream, or :obj:`None` if no tokenizer exists
+        yet or the stream reports no encoding (e.g. Unicode input).
+
+        """
+        if not hasattr(self, 'tokenizer'):
+            return None
+        return self.tokenizer.stream.usedEncoding()
+
     def isHTMLIntegrationPoint(self, element):
         if (element.name == "annotation-xml" and
                 element.namespace == namespaces["mathml"]):
diff --git a/html5lib/inputstream.py b/html5lib/inputstream.py
@@ -175,6 +175,9 @@ def __init__(self, source):
 
         self.reset()
 
+    def usedEncoding(self):
+        return None  # Unicode input involves no encoding, so there is none to report.
+
     def reset(self):
         self.chunk = ""
         self.chunkSize = 0
@@ -413,6 +416,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         # Call superclass
         self.reset()
 
+    def usedEncoding(self):
+        return self.charEncoding[0]  # the codec name reset() passes to codecs.getreader()
+
     def reset(self):
         self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
                                                                  'replace')
diff --git a/html5lib/tests/test_encoding.py b/html5lib/tests/test_encoding.py
@@ -26,12 +26,35 @@ def test_codec_name_d(self):
         self.assertEqual(inputstream.codecName("ISO_8859--1"), "windows-1252")
 
 
+def test_unicode_input_encoding():
+    p = HTMLParser()
+    assert p.usedEncoding() is None  # nothing has been parsed yet
+    p.parse(b'<meta charset=latin2>', useChardet=False)  # byte input
+    assert p.usedEncoding() == 'iso8859-2'  # latin2 is reported under this name
+
+    p = HTMLParser()
+    assert p.usedEncoding() is None
+    p.parse('<meta charset=latin2>')  # Unicode input (assuming unicode_literals is in effect)
+    assert p.usedEncoding() is None  # still None after parsing
+
+    p = HTMLParser()
+    assert p.usedEncoding() is None
+    try:
+        p.parse('<meta charset=latin2>', encoding='latin3')  # explicit encoding, non-bytes input
+    except TypeError:
+        pass
+    else:
+        assert 0, 'Expected TypeError'
+    assert p.usedEncoding() is None  # the failed parse leaves no encoding reported
+
+
 def runParserEncodingTest(data, encoding):
     p = HTMLParser()
+    assert p.usedEncoding() is None  # nothing has been parsed yet
     p.parse(data, useChardet=False)
     encoding = encoding.lower().decode("ascii")
 
-    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(data, encoding, p.tokenizer.stream.charEncoding[0])
+    assert encoding == p.usedEncoding(), errorMessage(data, encoding, p.usedEncoding())
 
 
 def runPreScanEncodingTest(data, encoding):