Skip to content

Commit f6741ea

Browse files
committed
Merge pull request #95 from gsnedders/escape-characters-serializer
Fix #11 by escaping enough to be safe in legacy browsers; r=nobody!
2 parents b48d0c1 + 9b8d8eb commit f6741ea

File tree

10 files changed

+4597
-74
lines changed

10 files changed

+4597
-74
lines changed

CHANGES.rst

+7
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@ Released on XXX
3333
* **Use scripting disabled by default (as we don't implement
3434
scripting).**
3535

36+
* **Fix #11, avoiding the XSS bug potentially caused by the serializer
37+
allowing attackers to break out of attribute values in old browser versions,
38+
changing the quote_attr_values option on serializer to take one of
39+
three values, "always" (the old True value), "legacy" (the new option,
40+
and the new default), and "spec" (the old False value, and the old
41+
default).**
42+
3643

3744
0.9999999/1.0b8
3845
~~~~~~~~~~~~~~~

html5lib/filters/lint.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@
1010

1111

1212
class Filter(_base.Filter):
13+
def __init__(self, source, require_matching_tags=True):
14+
super(Filter, self).__init__(source)
15+
self.require_matching_tags = require_matching_tags
16+
1317
def __iter__(self):
1418
open_elements = []
1519
for token in _base.Filter.__iter__(self):
@@ -26,7 +30,7 @@ def __iter__(self):
2630
assert type == "EmptyTag"
2731
else:
2832
assert type == "StartTag"
29-
if type == "StartTag":
33+
if type == "StartTag" and self.require_matching_tags:
3034
open_elements.append((namespace, name))
3135
for (namespace, name), value in token["data"].items():
3236
assert namespace is None or isinstance(namespace, text_type)
@@ -44,7 +48,7 @@ def __iter__(self):
4448
assert name != ""
4549
if (not namespace or namespace == namespaces["html"]) and name in voidElements:
4650
assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
47-
else:
51+
elif self.require_matching_tags:
4852
start = open_elements.pop()
4953
assert start == (namespace, name)
5054

html5lib/serializer/htmlserializer.py

+22-10
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
from __future__ import absolute_import, division, unicode_literals
22
from six import text_type
33

4-
try:
5-
from functools import reduce
6-
except ImportError:
7-
pass
4+
import re
85

96
from ..constants import voidElements, booleanAttributes, spaceCharacters
107
from ..constants import rcdataElements, entities, xmlEntities
@@ -13,6 +10,17 @@
1310

1411
spaceCharacters = "".join(spaceCharacters)
1512

13+
quoteAttributeSpecChars = spaceCharacters + "\"'=<>`"
14+
quoteAttributeSpec = re.compile("[" + quoteAttributeSpecChars + "]")
15+
quoteAttributeLegacy = re.compile("[" + quoteAttributeSpecChars +
16+
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
17+
"\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
18+
"\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
19+
"\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
20+
"\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
21+
"\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
22+
"\u3000]")
23+
1624
try:
1725
from codecs import register_error, xmlcharrefreplace_errors
1826
except ImportError:
@@ -73,7 +81,7 @@ def htmlentityreplace_errors(exc):
7381
class HTMLSerializer(object):
7482

7583
# attribute quoting options
76-
quote_attr_values = False
84+
quote_attr_values = "legacy" # be secure by default
7785
quote_char = '"'
7886
use_best_quote_char = True
7987

@@ -109,9 +117,9 @@ def __init__(self, **kwargs):
109117
inject_meta_charset=True|False
110118
Whether to insert a meta element to define the character set of the
111119
document.
112-
quote_attr_values=True|False
120+
quote_attr_values="legacy"|"spec"|"always"
113121
Whether to quote attribute values that don't require quoting
114-
per HTML5 parsing rules.
122+
per legacy browser behaviour ("legacy"), only when required by the standard ("spec"), or always ("always").
115123
quote_char=u'"'|u"'"
116124
Use given quote character for attribute quoting. Default is to
117125
use double quote unless attribute value contains a double quote,
@@ -240,11 +248,15 @@ def serialize(self, treewalker, encoding=None):
240248
(k not in booleanAttributes.get(name, tuple()) and
241249
k not in booleanAttributes.get("", tuple())):
242250
yield self.encodeStrict("=")
243-
if self.quote_attr_values or not v:
251+
if self.quote_attr_values == "always" or len(v) == 0:
244252
quote_attr = True
253+
elif self.quote_attr_values == "spec":
254+
quote_attr = quoteAttributeSpec.search(v) is not None
255+
elif self.quote_attr_values == "legacy":
256+
quote_attr = quoteAttributeLegacy.search(v) is not None
245257
else:
246-
quote_attr = reduce(lambda x, y: x or (y in v),
247-
spaceCharacters + ">\"'=", False)
258+
raise ValueError("quote_attr_values must be one of: "
259+
"'always', 'spec', or 'legacy'")
248260
v = v.replace("&", "&amp;")
249261
if self.escape_lt_in_attrs:
250262
v = v.replace("<", "&lt;")

0 commit comments

Comments
 (0)