Merge pull request dcramer#5 from byroot/fix-bom-detection

Fix BOM detection dcramer#4 Thanks @byroot
puzzlet · Dec 14, 2012 · 0e70614 · 0e70614
2 parents a621369 + e5e8add
commit 0e70614
Show file tree

Hide file tree

Showing 7 changed files with 43 additions and 8 deletions.
diff --git a/charade/universaldetector.py b/charade/universaldetector.py
@@ -70,31 +70,31 @@ def feed(self, aBuf):
 
         if not self._mGotData:
             # If the data starts with BOM, we know it is UTF
-            if aBuf[:3] == '\xEF\xBB\xBF':
+            if aBuf[:3] == b'\xEF\xBB\xBF':
                 # EF BB BF  UTF-8 with BOM
                 self.result = {'encoding': "UTF-8", 'confidence': 1.0}
-            elif aBuf[:4] == '\xFF\xFE\x00\x00':
+            elif aBuf[:4] == b'\xFF\xFE\x00\x00':
                 # FF FE 00 00  UTF-32, little-endian BOM
                 self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
-            elif aBuf[:4] == '\x00\x00\xFE\xFF':
+            elif aBuf[:4] == b'\x00\x00\xFE\xFF':
                 # 00 00 FE FF  UTF-32, big-endian BOM
                 self.result = {'encoding': "UTF-32BE", 'confidence': 1.0}
-            elif aBuf[:4] == '\xFE\xFF\x00\x00':
+            elif aBuf[:4] == b'\xFE\xFF\x00\x00':
                 # FE FF 00 00  UCS-4, unusual octet order BOM (3412)
                 self.result = {
                     'encoding': "X-ISO-10646-UCS-4-3412",
                     'confidence': 1.0
                 }
-            elif aBuf[:4] == '\x00\x00\xFF\xFE':
+            elif aBuf[:4] == b'\x00\x00\xFF\xFE':
                 # 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
                 self.result = {
                     'encoding': "X-ISO-10646-UCS-4-2143",
                     'confidence': 1.0
                 }
-            elif aBuf[:2] == '\xFF\xFE':
+            elif aBuf[:2] == b'\xFF\xFE':
                 # FF FE  UTF-16, little endian BOM
                 self.result = {'encoding': "UTF-16LE", 'confidence': 1.0}
-            elif aBuf[:2] == '\xFE\xFF':
+            elif aBuf[:2] == b'\xFE\xFF':
                 # FE FF  UTF-16, big endian BOM
                 self.result = {'encoding': "UTF-16BE", 'confidence': 1.0}
 

diff --git a/test.py b/test.py
@@ -49,7 +49,7 @@ def main():
             continue
         for file_name in os.listdir(path):
             _, ext = os.path.splitext(file_name)
-            if ext not in ['.html', '.txt', '.xml']:
+            if ext not in ['.html', '.txt', '.xml', '.srt']:
                 continue
             suite.addTest(TestCase(os.path.join(path, file_name), encoding))
     unittest.TextTestRunner().run(suite)

diff --git a/tests/UTF-16BE/bom-utf-16-be.srt b/tests/UTF-16BE/bom-utf-16-be.srt
diff --git a/tests/UTF-16LE/bom-utf-16-le.srt b/tests/UTF-16LE/bom-utf-16-le.srt
diff --git a/tests/UTF-32BE/bom-utf-32-be.srt b/tests/UTF-32BE/bom-utf-32-be.srt
diff --git a/tests/UTF-32LE/bom-utf-32-le.srt b/tests/UTF-32LE/bom-utf-32-le.srt
diff --git a/tests/utf-8/bom-utf-8.srt b/tests/utf-8/bom-utf-8.srt
@@ -0,0 +1,35 @@
+1
+00:00:06,500 --> 00:00:09,000
+About 2 months ago I found myself on
+the comment section of YouTube
+
+2
+00:00:11,000 --> 00:00:17,000
+And I was commenting,
+unfortunately I was commenting,
+on a video about the famous Ayn Rand
+
+3
+00:00:19,000 --> 00:00:24,000
+And I
+posted underneath against
+this woman's tirades,
+against what is essentially
+the human race.
+
+4
+00:00:25,000 --> 00:00:31,000
+that, this monetary system seems to have no point, seems to actually hinder people
+
+5
+00:00:31,000 --> 00:00:36,000
+and hinder progress, and one of the responses I got, I didn't answer it, was:
+
+6
+00:00:37,000 --> 00:00:43,000
+what actually money creates is an incentive to invent the new items, that's the driving force behind it
+
+7
+00:00:43,000 --> 00:00:50,000
+So what I thought I'd do, instead of answering on a YouTube comment, is organize a global awareness day
+