From f4f1fb8cf044fd1616648821d13247450901f62a Mon Sep 17 00:00:00 2001 From: fantasai Date: Tue, 27 Jul 2010 21:30:17 +0100 Subject: [PATCH 1/2] Google Code Issue 157: Add "escape invisible characters" option Vaguely updated, but basically working. --- html5lib/constants.py | 16 ++++++++++++++++ html5lib/serializer/htmlserializer.py | 10 +++++++++- html5lib/utils.py | 26 ++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/html5lib/constants.py b/html5lib/constants.py index 1866dd78..431c2c12 100644 --- a/html5lib/constants.py +++ b/html5lib/constants.py @@ -4,6 +4,9 @@ import gettext _ = gettext.gettext +from itertools import chain + + EOF = None E = { @@ -3078,6 +3081,19 @@ prefixes["http://www.w3.org/1998/Math/MathML"] = "math" +invisibleChars = frozenset(chain( + # ASCII control chars + range(0x0, 0x9), range(0xB, 0xD), range(0xE, 0x20), + # Other control chars + # fixed-width spaces, zero-width marks, bidi marks + range(0x2000, 0x2010), + # LS, PS, bidi control codes + range(0x2028, 0x2030), + # nbsp, mathsp, ideosp, WJ, interlinear + [0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB] +)) + + class DataLossWarning(UserWarning): pass diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index 18344aed..adc1bf59 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -94,6 +94,7 @@ class HTMLSerializer(object): # escaping options escape_lt_in_attrs = False escape_rcdata = False + escape_invisible = False resolve_entities = True # miscellaneous options @@ -105,7 +106,8 @@ class HTMLSerializer(object): "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", - "escape_rcdata", "resolve_entities", "sanitize") + "escape_rcdata", "escape_invisible", "resolve_entities", + "sanitize") def __init__(self, **kwargs): """Initialize HTMLSerializer. @@ -127,6 +129,10 @@ def __init__(self, **kwargs): escape_rcdata=False|True Whether to escape characters that need to be escaped within normal elements within rcdata elements such as style. + escape_invisible=False|True|'numeric'|'named' + Whether to escape invisible characters (such as nbsp, fixed-width + spaces, and control codes). Uses named HTML escapes if 'named' + is specified, otherwise uses numeric codes. resolve_entities=True|False Whether to resolve named character entities that appear in the source tree. The XML predefined entities < > & " ' @@ -160,6 +166,8 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) + if self.escape_invisible: + text = utils.escapeInvisible(text, self.escape_invisible == 'named') if self.encoding: return string.encode(self.encoding, unicode_encode_errors) else: diff --git a/html5lib/utils.py b/html5lib/utils.py index 9841aebf..3f3fee01 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -2,6 +2,8 @@ from types import ModuleType +from .constants import invisibleChars + class MethodDispatcher(dict): """Dict with 2 special properties: @@ -71,3 +73,27 @@ def moduleFactory(baseModule, *args, **kwargs): return mod return moduleFactory + + +def escapeInvisible(text, useNamedEntities=False): + """Escape invisible characters other than Tab, LF, CR, and ASCII space + """ + assert type(text) == text_type + # This algorithm is O(MN) for M len(text) and N num escapable + # But it doesn't modify the text when N is zero (common case) and + # N is expected to be small (usually 1 or 2) in most other cases. + escapable = set() + for c in text: + if ord(c) in invisibleChars: + escapable.add(c) + if useNamedEntities: + raise NotImplementedError("This doesn't work on Python 3") + for c in escapable: + name = codepoint2name.get(ord(c)) + escape = "&%s;" % name if name else "&#x%X;" % ord(c) + text = text.replace(c, escape) + else: + for c in escapable: + text = text.replace(c, "&#x%X;" % ord(c)) + + return text From 93440015e6e41f1bab0162ce27f323b65f4cd6e8 Mon Sep 17 00:00:00 2001 From: Geoffrey Sneddon Date: Sat, 4 May 2013 13:56:03 +0100 Subject: [PATCH 2/2] fixup! Google Code Issue 157: Add "escape invisible characters" option --- html5lib/serializer/htmlserializer.py | 2 +- html5lib/utils.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py index adc1bf59..e6056f0c 100644 --- a/html5lib/serializer/htmlserializer.py +++ b/html5lib/serializer/htmlserializer.py @@ -167,7 +167,7 @@ def __init__(self, **kwargs): def encode(self, string): assert(isinstance(string, text_type)) if self.escape_invisible: - text = utils.escapeInvisible(text, self.escape_invisible == 'named') + string = utils.escapeInvisible(string, self.escape_invisible == 'named') if self.encoding: return string.encode(self.encoding, unicode_encode_errors) else: diff --git a/html5lib/utils.py b/html5lib/utils.py index 3f3fee01..ae0d9fbc 100644 --- a/html5lib/utils.py +++ b/html5lib/utils.py @@ -2,6 +2,8 @@ from types import ModuleType +from six import text_type + from .constants import invisibleChars @@ -87,11 +89,11 @@ def escapeInvisible(text, useNamedEntities=False): if ord(c) in invisibleChars: escapable.add(c) if useNamedEntities: + # for c in escapable: + # name = codepoint2name.get(ord(c)) + # escape = "&%s;" % name if name else "&#x%X;" % ord(c) + # text = text.replace(c, escape) raise NotImplementedError("This doesn't work on Python 3") - for c in escapable: - name = codepoint2name.get(ord(c)) - escape = "&%s;" % name if name else "&#x%X;" % ord(c) - text = text.replace(c, escape) else: for c in escapable: text = text.replace(c, "&#x%X;" % ord(c))