Skip to content

Commit 08a5eca

Browse files
committed
squash! Fix #72: rewrite the sanitizer to be a treewalker filter only.
Undoes deletion of the testsuite
1 parent 0e8792b commit 08a5eca

File tree

1 file changed

+132
-28
lines changed

1 file changed

+132
-28
lines changed

html5lib/tests/test_sanitizer.py

+132-28
Original file line number | Diff line number | Diff line change
@@ -1,30 +1,134 @@
11
from __future__ import absolute_import, division, unicode_literals
22

3-
import json
4-
5-
from .support import get_data_files
6-
7-
from html5lib import parseFragment, serialize
8-
9-
10-
# Removed-side version of the test runner: parses `input` as a fragment,
# serializes it with sanitize=True, and asserts the result equals `expected`.
# `name` is not used in the body.
def runSanitizerTest(name, input, expected):
11-
parsed = parseFragment(input)
12-
# Serializer options as passed here: sanitizing on, optional tags kept,
# trailing solidus without a preceding space, single-quoted attribute values.
serialized = serialize(parsed,
13-
sanitize=True,
14-
omit_optional_tags=False,
15-
use_trailing_solidus=True,
16-
space_before_trailing_solidus=False,
17-
quote_attr_values=True,
18-
quote_char="'")
19-
# Failure message shows the input, the expected output and the actual output.
errorMsg = "\n".join(["\n\nInput:", input,
20-
"\nExpected:", expected,
21-
"\nReceived:", serialized])
22-
assert expected == serialized, errorMsg
23-
24-
25-
# Removed-side test generator: for every '*.dat' file returned by
# get_data_files('sanitizer', ...), load it as JSON and yield one
# runSanitizerTest case per entry using its "name", "input" and "output" keys.
def testSanitizer():
26-
for filename in get_data_files('sanitizer', '*.dat'):
27-
with open(filename) as fp:
28-
tests = json.load(fp)
29-
for test in tests:
30-
yield runSanitizerTest, test["name"], test["input"], test["output"]
3+
try:
4+
import json
5+
except ImportError:
6+
import simplejson as json
7+
8+
from html5lib import html5parser, sanitizer, constants, treebuilders
9+
10+
11+
def toxmlFactory():
    """Create a function that serializes an element to a text string.

    The serializer is backed by the "etree" tree builder obtained from
    ``treebuilders.getTreeBuilder``.

    Returns:
        A callable taking one element and returning its serialization,
        produced as UTF-8 bytes and decoded back to text.
    """
    builder = treebuilders.getTreeBuilder("etree")

    def toxml(element):
        # encode/decode roundtrip required for Python 2.6 compatibility
        serialize = builder.implementation.tostring
        return serialize(element, encoding="utf-8").decode("utf-8")

    return toxml
20+
21+
22+
def runSanitizerTest(name, expected, input, toxml=None):
    """Assert that sanitizing ``input`` produces the serialization of ``expected``.

    Args:
        name: Label of the generated test case; not used in the body.
        expected: Markup that is parsed as a fragment (without sanitizing)
            and serialized to form the expected result.
        input: Markup passed through ``sanitize_html``.
        toxml: Optional element serializer. When omitted, one is created
            with ``toxmlFactory()``.

    Raises:
        AssertionError: If the sanitized output differs from the expected
            serialization; the message shows input, expected and received.
    """
    if toxml is None:
        toxml = toxmlFactory()
    fragment = html5parser.HTMLParser().parseFragment(expected)
    expected = ''.join([toxml(token) for token in fragment])
    # JSON round-trip kept from the original; presumably it normalizes the
    # string type across Python versions -- TODO confirm.
    expected = json.loads(json.dumps(expected))
    # Hand the same serializer to sanitize_html so both sides are rendered
    # identically and no second serializer is built per test case.
    received = sanitize_html(input, toxml)
    error_msg = "\n".join(["\n\nInput:", input,
                           "\nExpected:", expected,
                           "\nReceived:", received])
    assert expected == received, error_msg
29+
30+
31+
def sanitize_html(stream, toxml=None):
    """Parse ``stream`` as a fragment with the sanitizing tokenizer and serialize it.

    Args:
        stream: Markup to parse with ``sanitizer.HTMLSanitizer`` as tokenizer.
        toxml: Optional element serializer. When omitted, one is created
            with ``toxmlFactory()``.

    Returns:
        The concatenated serializations of the items in the parsed fragment.
    """
    serialize = toxmlFactory() if toxml is None else toxml
    parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    pieces = []
    for token in parser.parseFragment(stream):
        pieces.append(serialize(token))
    return ''.join(pieces)
37+
38+
39+
def test_should_handle_astral_plane_characters():
    """Character references beyond the BMP come out as the literal characters."""
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>'
    received = sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
    assert expected == received
41+
42+
43+
def test_should_allow_relative_uris():
    """A relative ``href`` value is kept by the sanitizer."""
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>'
    received = sanitize_html('<p><a href="/example.com"></a></p>')
    assert expected == received
45+
46+
47+
def test_sanitizer():
    """Generate sanitizer test cases from the sanitizer's own allow-lists.

    Cases are derived from ``sanitizer.HTMLSanitizer.allowed_elements``,
    ``allowed_attributes`` and ``allowed_protocols``, followed by a few
    hand-written URI cases.

    Yields:
        Tuples ``(runSanitizerTest, name, expected, input, toxml)``, where
        ``expected`` is the markup the sanitized ``input`` must serialize
        like. A single serializer is shared by all generated cases.
    """
    toxml = toxmlFactory()

    # Allowed lowercase elements: the element and its title attribute are
    # kept, while the unknown <bad> element comes out as escaped text.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        if tag_name in ['caption', 'col', 'colgroup', 'optgroup', 'option', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']:
            continue  # TODO
        if tag_name != tag_name.lower():
            continue  # TODO
        if tag_name == 'image':
            # The expected output for <image> uses <img>.
            expected = "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        elif tag_name == 'br':
            # The expected output ends with a second <br/>, matching the
            # </br> end tag in the input.
            expected = "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>"
        elif tag_name in constants.voidElements:
            # Void elements have no content, so the text follows the tag.
            expected = "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name
        else:
            expected = "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name)
        yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
               expected,
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    # Upper-cased element names: the expected output is fully escaped text.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        tag_name = tag_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
               "&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    # Allowed lowercase attributes (except style) are kept on <p>.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        attribute_value = 'foo'
        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
            # URI-valued attributes need a value with an allowed protocol.
            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
               toxml)

    # Upper-cased attribute names: the expected output has no attribute.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        attribute_name = attribute_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
               "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
               "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
               toxml)

    # Allowed protocols as listed. These cases are named without "uppercase"
    # so they cannot be confused with the upper-cased loop at the end.
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)

    # Hand-written cases: in each, the expected output lacks the URI attribute.
    yield (runSanitizerTest, "test_invalid_data_uri",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:foobar\"></audio>",
           toxml)

    yield (runSanitizerTest, "test_invalid_ipv6_url",
           "<a>",
           "<a href=\"h://]\">",
           toxml)

    yield (runSanitizerTest, "test_data_uri_disallowed_type",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>",
           toxml)

    # The same protocols again, upper-cased; the URI is still expected to be kept.
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        protocol = protocol.upper()
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)

0 commit comments

Comments
 (0)