Skip to content

Commit 95a0be3

Browse files
committed
fixup! squash! Fix #72: rewrite the sanitizer to be a treewalker filter only.
1 parent 08a5eca commit 95a0be3

File tree

1 file changed

+41
-74
lines changed

1 file changed

+41
-74
lines changed

html5lib/tests/test_sanitizer.py

+41-74
Original file line number | Diff line number | Diff line change
@@ -1,134 +1,101 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3-
try:
4-
import json
5-
except ImportError:
6-
import simplejson as json
7-
8-
from html5lib import html5parser, sanitizer, constants, treebuilders
9-
10-
11-
def toxmlFactory():
12-
tree = treebuilders.getTreeBuilder("etree")
13-
14-
def toxml(element):
15-
# encode/decode roundtrip required for Python 2.6 compatibility
16-
result_bytes = tree.implementation.tostring(element, encoding="utf-8")
17-
return result_bytes.decode("utf-8")
18-
19-
return toxml
20-
21-
22-
def runSanitizerTest(name, expected, input, toxml=None):
23-
if toxml is None:
24-
toxml = toxmlFactory()
25-
expected = ''.join([toxml(token) for token in html5parser.HTMLParser().
26-
parseFragment(expected)])
27-
expected = json.loads(json.dumps(expected))
3+
from html5lib import constants
4+
from html5lib import parseFragment, serialize
5+
from html5lib.filters import sanitizer
6+
7+
8+
def runSanitizerTest(name, expected, input):
9+
parsed = parseFragment(expected)
10+
expected = serialize(parsed,
11+
omit_optional_tags=False,
12+
use_trailing_solidus=True,
13+
space_before_trailing_solidus=False,
14+
quote_attr_values=True,
15+
quote_char='"')
2816
assert expected == sanitize_html(input)
2917

3018

31-
def sanitize_html(stream, toxml=None):
32-
if toxml is None:
33-
toxml = toxmlFactory()
34-
return ''.join([toxml(token) for token in
35-
html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer).
36-
parseFragment(stream)])
19+
def sanitize_html(stream):
20+
parsed = parseFragment(stream)
21+
serialized = serialize(parsed,
22+
sanitize=True,
23+
omit_optional_tags=False,
24+
use_trailing_solidus=True,
25+
space_before_trailing_solidus=False,
26+
quote_attr_values=True,
27+
quote_char='"')
28+
return serialized
3729

3830

3931
def test_should_handle_astral_plane_characters():
40-
assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
32+
assert '<p>\U0001d4b5 \U0001d538</p>' == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
4133

4234

4335
def test_should_allow_relative_uris():
44-
assert '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>' == sanitize_html('<p><a href="/example.com"></a></p>')
36+
assert '<p><a href="/example.com"></a></p>' == sanitize_html('<p><a href="/example.com"></a></p>')
4537

4638

4739
def test_sanitizer():
48-
toxml = toxmlFactory()
49-
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
40+
for ns, tag_name in sanitizer.allowed_elements:
5041
if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
5142
continue # TODO
5243
if tag_name != tag_name.lower():
5344
continue # TODO
5445
if tag_name == 'image':
5546
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
5647
"<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz",
57-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
58-
toxml)
48+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
5949
elif tag_name == 'br':
6050
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
6151
"<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>",
62-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
63-
toxml)
52+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
6453
elif tag_name in constants.voidElements:
6554
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
6655
"<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name,
67-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
68-
toxml)
56+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
6957
else:
7058
yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
7159
"<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name),
72-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
73-
toxml)
60+
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name))
7461

75-
for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
76-
tag_name = tag_name.upper()
77-
yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
78-
"&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
79-
"<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
80-
toxml)
81-
82-
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
62+
for ns, attribute_name in sanitizer.allowed_attributes:
8363
if attribute_name != attribute_name.lower():
8464
continue # TODO
8565
if attribute_name == 'style':
8666
continue
8767
attribute_value = 'foo'
88-
if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
89-
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
68+
if attribute_name in sanitizer.attr_val_is_uri:
69+
attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
9070
yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
9171
"<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
92-
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
93-
toxml)
94-
95-
for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
96-
attribute_name = attribute_name.upper()
97-
yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
98-
"<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
99-
"<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
100-
toxml)
72+
"<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))
10173

102-
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
74+
for protocol in sanitizer.allowed_protocols:
10375
rest_of_uri = '//sub.domain.tld/path/object.ext'
10476
if protocol == 'data':
10577
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
10678
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
10779
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
108-
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
109-
toxml)
80+
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))
11081

11182
yield (runSanitizerTest, "test_invalid_data_uri",
11283
"<audio controls=\"\"></audio>",
113-
"<audio controls=\"\" src=\"data:foobar\"></audio>",
114-
toxml)
84+
"<audio controls=\"\" src=\"data:foobar\"></audio>")
11585

11686
yield (runSanitizerTest, "test_invalid_ipv6_url",
11787
"<a>",
118-
"<a href=\"h://]\">",
119-
toxml)
88+
"<a href=\"h://]\">")
12089

12190
yield (runSanitizerTest, "test_data_uri_disallowed_type",
12291
"<audio controls=\"\"></audio>",
123-
"<audio controls=\"\" src=\"data:text/html,<html>\"></audio>",
124-
toxml)
92+
"<audio controls=\"\" src=\"data:text/html,<html>\"></audio>")
12593

126-
for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
94+
for protocol in sanitizer.allowed_protocols:
12795
rest_of_uri = '//sub.domain.tld/path/object.ext'
12896
if protocol == 'data':
12997
rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
13098
protocol = protocol.upper()
13199
yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
132100
"<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
133-
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
134-
toxml)
101+
"""<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri))

0 commit comments

Comments
 (0)