diff --git a/html5lib/constants.py b/html5lib/constants.py
index 1866dd78..431c2c12 100644
--- a/html5lib/constants.py
+++ b/html5lib/constants.py
@@ -4,6 +4,9 @@
import gettext
_ = gettext.gettext
+from itertools import chain
+
+
EOF = None
E = {
@@ -3078,6 +3081,19 @@
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
# Code points that are escaped as "invisible" characters.  Each entry in the
# table below is a half-open (start, stop) range of code points; Tab (0x09),
# LF (0x0A), CR (0x0D) and the ASCII space (0x20) fall outside every range.
invisibleChars = frozenset(chain.from_iterable(
    range(start, stop) for start, stop in (
        # ASCII control chars (skipping Tab, LF and CR)
        (0x0000, 0x0009),
        (0x000B, 0x000D),
        (0x000E, 0x0020),
        # nbsp
        (0x00A0, 0x00A1),
        # fixed-width spaces, zero-width marks, bidi marks
        (0x2000, 0x2010),
        # LS, PS, bidi control codes
        (0x2028, 0x2030),
        # mathsp, WJ
        (0x205F, 0x2061),
        # ideosp
        (0x3000, 0x3001),
        # interlinear annotation marks
        (0xFFF9, 0xFFFC),
    )
))
+
+
class DataLossWarning(UserWarning):
    """Warning category that subclasses UserWarning without adding to it.

    NOTE(review): going by the name, this is presumably issued when data
    cannot be preserved faithfully -- confirm against the places that emit it.
    """
diff --git a/html5lib/serializer/htmlserializer.py b/html5lib/serializer/htmlserializer.py
index 18344aed..e6056f0c 100644
--- a/html5lib/serializer/htmlserializer.py
+++ b/html5lib/serializer/htmlserializer.py
@@ -94,6 +94,7 @@ class HTMLSerializer(object):
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
+ escape_invisible = False
resolve_entities = True
# miscellaneous options
@@ -105,7 +106,8 @@ class HTMLSerializer(object):
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
- "escape_rcdata", "resolve_entities", "sanitize")
+ "escape_rcdata", "escape_invisible", "resolve_entities",
+ "sanitize")
def __init__(self, **kwargs):
"""Initialize HTMLSerializer.
@@ -127,6 +129,10 @@ def __init__(self, **kwargs):
escape_rcdata=False|True
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style.
+ escape_invisible=False|True|'numeric'|'named'
+ Whether to escape invisible characters (such as nbsp, fixed-width
+ spaces, and control codes). Uses named HTML escapes if 'named'
+ is specified, otherwise uses numeric codes.
resolve_entities=True|False
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities < > & " '
@@ -160,6 +166,8 @@ def __init__(self, **kwargs):
def encode(self, string):
assert(isinstance(string, text_type))
+ if self.escape_invisible:
+ string = utils.escapeInvisible(string, self.escape_invisible == 'named')
if self.encoding:
return string.encode(self.encoding, unicode_encode_errors)
else:
diff --git a/html5lib/utils.py b/html5lib/utils.py
index 9841aebf..ae0d9fbc 100644
--- a/html5lib/utils.py
+++ b/html5lib/utils.py
@@ -2,6 +2,10 @@
from types import ModuleType
+from six import text_type
+
+from .constants import invisibleChars
+
class MethodDispatcher(dict):
"""Dict with 2 special properties:
@@ -71,3 +75,27 @@ def moduleFactory(baseModule, *args, **kwargs):
return mod
return moduleFactory
+
+
def escapeInvisible(text, useNamedEntities=False):
    """Escape invisible characters other than Tab, LF, CR, and ASCII space.

    Every character of ``text`` whose code point is a member of
    ``invisibleChars`` is replaced by an HTML character reference; all other
    characters are left untouched.

    :arg text: the unicode string to escape; must be a ``text_type``
    :arg useNamedEntities: if true, use the named reference (for example
        ``&nbsp;``) for code points listed in the standard library's
        ``codepoint2name`` table and the numeric form for the rest; if
        false, always use the hexadecimal numeric form (for example
        ``&#xA0;``)

    :returns: the escaped string, or ``text`` itself when it contains no
        escapable character
    """
    assert isinstance(text, text_type)

    # One pass over the text to find the distinct escapable code points.  In
    # the common case there are none and the input is returned unmodified.
    found = invisibleChars.intersection(ord(c) for c in text)
    if not found:
        return text

    names = {}
    if useNamedEntities:
        # The table lives in a differently named module on Python 2 and 3.
        try:
            from html.entities import codepoint2name as names
        except ImportError:
            from htmlentitydefs import codepoint2name as names

    table = {}
    for codepoint in found:
        name = names.get(codepoint)
        if name:
            reference = "&%s;" % name
        else:
            # A numeric character reference needs the "&#x" prefix to be
            # valid HTML; a bare "A0;" would be emitted as literal text.
            reference = "&#x%X;" % codepoint
        # translate() on a Python 2 unicode object requires unicode values.
        table[codepoint] = text_type(reference)

    # A single translate() pass replaces every escapable character at once.
    return text.translate(table)