# Code points treated as "invisible" text.  Tab (0x9), LF (0xA), CR (0xD) and
# the ASCII space (0x20) are not members of the set.
# Each pair below is a half-open span [first, end) of code points.
invisibleChars = frozenset(chain.from_iterable(
    range(first, end) for first, end in (
        # ASCII control characters (skipping Tab, LF and CR)
        (0x0000, 0x0009),
        (0x000B, 0x000D),
        (0x000E, 0x0020),
        # no-break space
        (0x00A0, 0x00A1),
        # fixed-width spaces, zero-width marks, bidi marks
        (0x2000, 0x2010),
        # line separator, paragraph separator, bidi control codes
        (0x2028, 0x2030),
        # medium mathematical space, word joiner
        (0x205F, 0x2061),
        # ideographic space
        (0x3000, 0x3001),
        # interlinear annotation anchor / separator / terminator
        (0xFFF9, 0xFFFC),
    )
))
def escapeInvisible(text, useNamedEntities=False):
    """Replace invisible characters in ``text`` with HTML character references.

    A character is escaped when its code point is a member of
    ``invisibleChars``; every other character is passed through untouched.

    :arg text: the text to escape; must be a ``text_type`` (unicode) string
    :arg useNamedEntities: when true, use a named reference (for example
        ``&nbsp;``) for code points that have an entry in the standard
        library's ``codepoint2name`` table and fall back to a hexadecimal
        numeric reference (``&#xA0;``) for the rest.  When false, always
        use the numeric form.
    :returns: the escaped text; the input object itself is returned when it
        contains nothing to escape
    """
    # isinstance (rather than an exact type comparison) also accepts
    # text_type subclasses, matching the check in HTMLSerializer.encode.
    assert isinstance(text, text_type)

    # Distinct invisible code points that actually occur in the text.
    # This is empty in the common case, so bail out before doing any work.
    present = invisibleChars.intersection(ord(char) for char in text)
    if not present:
        return text

    names = {}
    if useNamedEntities:
        # The code point -> entity name table moved between Python 2 and 3.
        try:
            from html.entities import codepoint2name as names
        except ImportError:
            from htmlentitydefs import codepoint2name as names

    table = {}
    for codepoint in present:
        name = names.get(codepoint)
        if name:
            reference = "&%s;" % name
        else:
            reference = "&#x%X;" % codepoint
        # translate() on a unicode string needs unicode replacement values.
        table[codepoint] = text_type(reference)

    # One pass over the text regardless of how many distinct characters
    # need escaping.
    return text.translate(table)