Skip to content

Commit 85723e2

Browse files
committed
Fix html5lib#124: Move to webencodings for decoding the input byte stream.
1 parent 44b0bbc commit 85723e2

11 files changed

+49
-306
lines changed

.pytest.expect

-3.55 KB
Binary file not shown.

CHANGES.rst

+4
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,10 @@ Released on XXX
2222

2323
* Move testsuite to ``py.test``.
2424

25+
* Fix #124: move to webencodings for decoding the input byte stream;
26+
this makes html5lib compliant with the Encoding Standard, and
27+
introduces a required dependency on webencodings.
28+
2529

2630
0.9999999/1.0b8
2731
~~~~~~~~~~~~~~~

html5lib/constants.py

-229
Original file line number | Diff line number | Diff line change
@@ -2846,235 +2846,6 @@
28462846
0x9F: "\u0178",
28472847
}
28482848

2849-
encodings = {
2850-
'437': 'cp437',
2851-
'850': 'cp850',
2852-
'852': 'cp852',
2853-
'855': 'cp855',
2854-
'857': 'cp857',
2855-
'860': 'cp860',
2856-
'861': 'cp861',
2857-
'862': 'cp862',
2858-
'863': 'cp863',
2859-
'865': 'cp865',
2860-
'866': 'cp866',
2861-
'869': 'cp869',
2862-
'ansix341968': 'ascii',
2863-
'ansix341986': 'ascii',
2864-
'arabic': 'iso8859-6',
2865-
'ascii': 'ascii',
2866-
'asmo708': 'iso8859-6',
2867-
'big5': 'big5',
2868-
'big5hkscs': 'big5hkscs',
2869-
'chinese': 'gbk',
2870-
'cp037': 'cp037',
2871-
'cp1026': 'cp1026',
2872-
'cp154': 'ptcp154',
2873-
'cp367': 'ascii',
2874-
'cp424': 'cp424',
2875-
'cp437': 'cp437',
2876-
'cp500': 'cp500',
2877-
'cp775': 'cp775',
2878-
'cp819': 'windows-1252',
2879-
'cp850': 'cp850',
2880-
'cp852': 'cp852',
2881-
'cp855': 'cp855',
2882-
'cp857': 'cp857',
2883-
'cp860': 'cp860',
2884-
'cp861': 'cp861',
2885-
'cp862': 'cp862',
2886-
'cp863': 'cp863',
2887-
'cp864': 'cp864',
2888-
'cp865': 'cp865',
2889-
'cp866': 'cp866',
2890-
'cp869': 'cp869',
2891-
'cp936': 'gbk',
2892-
'cpgr': 'cp869',
2893-
'cpis': 'cp861',
2894-
'csascii': 'ascii',
2895-
'csbig5': 'big5',
2896-
'cseuckr': 'cp949',
2897-
'cseucpkdfmtjapanese': 'euc_jp',
2898-
'csgb2312': 'gbk',
2899-
'cshproman8': 'hp-roman8',
2900-
'csibm037': 'cp037',
2901-
'csibm1026': 'cp1026',
2902-
'csibm424': 'cp424',
2903-
'csibm500': 'cp500',
2904-
'csibm855': 'cp855',
2905-
'csibm857': 'cp857',
2906-
'csibm860': 'cp860',
2907-
'csibm861': 'cp861',
2908-
'csibm863': 'cp863',
2909-
'csibm864': 'cp864',
2910-
'csibm865': 'cp865',
2911-
'csibm866': 'cp866',
2912-
'csibm869': 'cp869',
2913-
'csiso2022jp': 'iso2022_jp',
2914-
'csiso2022jp2': 'iso2022_jp_2',
2915-
'csiso2022kr': 'iso2022_kr',
2916-
'csiso58gb231280': 'gbk',
2917-
'csisolatin1': 'windows-1252',
2918-
'csisolatin2': 'iso8859-2',
2919-
'csisolatin3': 'iso8859-3',
2920-
'csisolatin4': 'iso8859-4',
2921-
'csisolatin5': 'windows-1254',
2922-
'csisolatin6': 'iso8859-10',
2923-
'csisolatinarabic': 'iso8859-6',
2924-
'csisolatincyrillic': 'iso8859-5',
2925-
'csisolatingreek': 'iso8859-7',
2926-
'csisolatinhebrew': 'iso8859-8',
2927-
'cskoi8r': 'koi8-r',
2928-
'csksc56011987': 'cp949',
2929-
'cspc775baltic': 'cp775',
2930-
'cspc850multilingual': 'cp850',
2931-
'cspc862latinhebrew': 'cp862',
2932-
'cspc8codepage437': 'cp437',
2933-
'cspcp852': 'cp852',
2934-
'csptcp154': 'ptcp154',
2935-
'csshiftjis': 'shift_jis',
2936-
'csunicode11utf7': 'utf-7',
2937-
'cyrillic': 'iso8859-5',
2938-
'cyrillicasian': 'ptcp154',
2939-
'ebcdiccpbe': 'cp500',
2940-
'ebcdiccpca': 'cp037',
2941-
'ebcdiccpch': 'cp500',
2942-
'ebcdiccphe': 'cp424',
2943-
'ebcdiccpnl': 'cp037',
2944-
'ebcdiccpus': 'cp037',
2945-
'ebcdiccpwt': 'cp037',
2946-
'ecma114': 'iso8859-6',
2947-
'ecma118': 'iso8859-7',
2948-
'elot928': 'iso8859-7',
2949-
'eucjp': 'euc_jp',
2950-
'euckr': 'cp949',
2951-
'extendedunixcodepackedformatforjapanese': 'euc_jp',
2952-
'gb18030': 'gb18030',
2953-
'gb2312': 'gbk',
2954-
'gb231280': 'gbk',
2955-
'gbk': 'gbk',
2956-
'greek': 'iso8859-7',
2957-
'greek8': 'iso8859-7',
2958-
'hebrew': 'iso8859-8',
2959-
'hproman8': 'hp-roman8',
2960-
'hzgb2312': 'hz',
2961-
'ibm037': 'cp037',
2962-
'ibm1026': 'cp1026',
2963-
'ibm367': 'ascii',
2964-
'ibm424': 'cp424',
2965-
'ibm437': 'cp437',
2966-
'ibm500': 'cp500',
2967-
'ibm775': 'cp775',
2968-
'ibm819': 'windows-1252',
2969-
'ibm850': 'cp850',
2970-
'ibm852': 'cp852',
2971-
'ibm855': 'cp855',
2972-
'ibm857': 'cp857',
2973-
'ibm860': 'cp860',
2974-
'ibm861': 'cp861',
2975-
'ibm862': 'cp862',
2976-
'ibm863': 'cp863',
2977-
'ibm864': 'cp864',
2978-
'ibm865': 'cp865',
2979-
'ibm866': 'cp866',
2980-
'ibm869': 'cp869',
2981-
'iso2022jp': 'iso2022_jp',
2982-
'iso2022jp2': 'iso2022_jp_2',
2983-
'iso2022kr': 'iso2022_kr',
2984-
'iso646irv1991': 'ascii',
2985-
'iso646us': 'ascii',
2986-
'iso88591': 'windows-1252',
2987-
'iso885910': 'iso8859-10',
2988-
'iso8859101992': 'iso8859-10',
2989-
'iso885911987': 'windows-1252',
2990-
'iso885913': 'iso8859-13',
2991-
'iso885914': 'iso8859-14',
2992-
'iso8859141998': 'iso8859-14',
2993-
'iso885915': 'iso8859-15',
2994-
'iso885916': 'iso8859-16',
2995-
'iso8859162001': 'iso8859-16',
2996-
'iso88592': 'iso8859-2',
2997-
'iso885921987': 'iso8859-2',
2998-
'iso88593': 'iso8859-3',
2999-
'iso885931988': 'iso8859-3',
3000-
'iso88594': 'iso8859-4',
3001-
'iso885941988': 'iso8859-4',
3002-
'iso88595': 'iso8859-5',
3003-
'iso885951988': 'iso8859-5',
3004-
'iso88596': 'iso8859-6',
3005-
'iso885961987': 'iso8859-6',
3006-
'iso88597': 'iso8859-7',
3007-
'iso885971987': 'iso8859-7',
3008-
'iso88598': 'iso8859-8',
3009-
'iso885981988': 'iso8859-8',
3010-
'iso88599': 'windows-1254',
3011-
'iso885991989': 'windows-1254',
3012-
'isoceltic': 'iso8859-14',
3013-
'isoir100': 'windows-1252',
3014-
'isoir101': 'iso8859-2',
3015-
'isoir109': 'iso8859-3',
3016-
'isoir110': 'iso8859-4',
3017-
'isoir126': 'iso8859-7',
3018-
'isoir127': 'iso8859-6',
3019-
'isoir138': 'iso8859-8',
3020-
'isoir144': 'iso8859-5',
3021-
'isoir148': 'windows-1254',
3022-
'isoir149': 'cp949',
3023-
'isoir157': 'iso8859-10',
3024-
'isoir199': 'iso8859-14',
3025-
'isoir226': 'iso8859-16',
3026-
'isoir58': 'gbk',
3027-
'isoir6': 'ascii',
3028-
'koi8r': 'koi8-r',
3029-
'koi8u': 'koi8-u',
3030-
'korean': 'cp949',
3031-
'ksc5601': 'cp949',
3032-
'ksc56011987': 'cp949',
3033-
'ksc56011989': 'cp949',
3034-
'l1': 'windows-1252',
3035-
'l10': 'iso8859-16',
3036-
'l2': 'iso8859-2',
3037-
'l3': 'iso8859-3',
3038-
'l4': 'iso8859-4',
3039-
'l5': 'windows-1254',
3040-
'l6': 'iso8859-10',
3041-
'l8': 'iso8859-14',
3042-
'latin1': 'windows-1252',
3043-
'latin10': 'iso8859-16',
3044-
'latin2': 'iso8859-2',
3045-
'latin3': 'iso8859-3',
3046-
'latin4': 'iso8859-4',
3047-
'latin5': 'windows-1254',
3048-
'latin6': 'iso8859-10',
3049-
'latin8': 'iso8859-14',
3050-
'latin9': 'iso8859-15',
3051-
'ms936': 'gbk',
3052-
'mskanji': 'shift_jis',
3053-
'pt154': 'ptcp154',
3054-
'ptcp154': 'ptcp154',
3055-
'r8': 'hp-roman8',
3056-
'roman8': 'hp-roman8',
3057-
'shiftjis': 'shift_jis',
3058-
'tis620': 'cp874',
3059-
'unicode11utf7': 'utf-7',
3060-
'us': 'ascii',
3061-
'usascii': 'ascii',
3062-
'utf16': 'utf-16',
3063-
'utf16be': 'utf-16-be',
3064-
'utf16le': 'utf-16-le',
3065-
'utf8': 'utf-8',
3066-
'windows1250': 'cp1250',
3067-
'windows1251': 'cp1251',
3068-
'windows1252': 'cp1252',
3069-
'windows1253': 'cp1253',
3070-
'windows1254': 'cp1254',
3071-
'windows1255': 'cp1255',
3072-
'windows1256': 'cp1256',
3073-
'windows1257': 'cp1257',
3074-
'windows1258': 'cp1258',
3075-
'windows936': 'gbk',
3076-
'x-x-big5': 'big5'}
3077-
30782849
tokenTypes = {
30792850
"Doctype": 0,
30802851
"Characters": 1,

html5lib/html5parser.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -139,7 +139,7 @@ def documentEncoding(self):
139139
"""
140140
if not hasattr(self, 'tokenizer'):
141141
return None
142-
return self.tokenizer.stream.charEncoding[0]
142+
return self.tokenizer.stream.charEncoding[0].name
143143

144144
def isHTMLIntegrationPoint(self, element):
145145
if (element.name == "annotation-xml" and

0 commit comments

Comments
 (0)