|
1 | 1 | from __future__ import absolute_import, division, unicode_literals
|
2 | 2 | from six import text_type
|
3 | 3 |
|
4 |
| -try: |
5 |
| - from functools import reduce |
6 |
| -except ImportError: |
7 |
| - pass |
| 4 | +import re |
8 | 5 |
|
9 | 6 | from ..constants import voidElements, booleanAttributes, spaceCharacters
|
10 | 7 | from ..constants import rcdataElements, entities, xmlEntities
|
|
13 | 10 |
|
14 | 11 | spaceCharacters = "".join(spaceCharacters)
|
15 | 12 |
|
| 13 | +quoteAttributeSpecChars = spaceCharacters + "\"'=<>`" |
| 14 | +quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]") |
| 15 | +quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars + |
| 16 | + "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n" |
| 17 | + "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15" |
| 18 | + "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" |
| 19 | + "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000" |
| 20 | + "\u2001\u2002\u2003\u2004\u2005\u2006\u2007" |
| 21 | + "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" |
| 22 | + "\u3000]") |
| 23 | + |
16 | 24 | try:
|
17 | 25 | from codecs import register_error, xmlcharrefreplace_errors
|
18 | 26 | except ImportError:
|
@@ -73,7 +81,7 @@ def htmlentityreplace_errors(exc):
|
73 | 81 | class HTMLSerializer(object):
|
74 | 82 |
|
75 | 83 | # attribute quoting options
|
76 |
| - quote_attr_values = False |
| 84 | + quote_attr_values = "legacy" # be secure by default |
77 | 85 | quote_char = '"'
|
78 | 86 | use_best_quote_char = True
|
79 | 87 |
|
@@ -109,9 +117,9 @@ def __init__(self, **kwargs):
|
109 | 117 | inject_meta_charset=True|False
|
110 | 118 | Whether to insert a meta element to define the character set of the
|
111 | 119 | document.
|
112 |
| - quote_attr_values=True|False |
| 120 | + quote_attr_values="legacy"|"spec"|"always" |
113 | 121 | Whether to quote attribute values that don't require quoting
|
114 |
| - per HTML5 parsing rules. |
| 122 | + per legacy browser behaviour, when required by the standard, or always. |
115 | 123 | quote_char=u'"'|u"'"
|
116 | 124 | Use given quote character for attribute quoting. Default is to
|
117 | 125 | use double quote unless attribute value contains a double quote,
|
@@ -240,11 +248,15 @@ def serialize(self, treewalker, encoding=None):
|
240 | 248 | (k not in booleanAttributes.get(name, tuple()) and
|
241 | 249 | k not in booleanAttributes.get("", tuple())):
|
242 | 250 | yield self.encodeStrict("=")
|
243 |
| - if self.quote_attr_values or not v: |
| 251 | + if self.quote_attr_values == "always" or len(v) == 0: |
244 | 252 | quote_attr = True
|
| 253 | + elif self.quote_attr_values == "spec": |
| 254 | + quote_attr = quoteAttributeSpec.search(v) is not None |
| 255 | + elif self.quote_attr_values == "legacy": |
| 256 | + quote_attr = quoteAttributeLegacy.search(v) is not None |
245 | 257 | else:
|
246 |
| - quote_attr = reduce(lambda x, y: x or (y in v), |
247 |
| - spaceCharacters + ">\"'=", False) |
| 258 | + raise ValueError("quote_attr_values must be one of: " |
| 259 | + "'always', 'spec', or 'legacy'") |
248 | 260 | v = v.replace("&", "&amp;")
|
249 | 261 | if self.escape_lt_in_attrs:
|
250 | 262 | v = v.replace("<", "&lt;")
|
|
0 commit comments