|
1 | 1 | from __future__ import absolute_import, division, unicode_literals
|
2 | 2 |
|
3 |
| -import json |
4 |
| - |
5 |
| -from .support import get_data_files |
6 |
| - |
7 |
| -from html5lib import parseFragment, serialize |
8 |
| - |
9 |
| - |
10 |
def runSanitizerTest(name, input, expected):
    """Serialize ``input`` with sanitizing enabled and compare to ``expected``.

    ``input`` is parsed as an HTML fragment and serialized with
    ``sanitize=True`` and a fixed set of serializer options; the resulting
    string must equal ``expected`` exactly.  ``name`` is not used in the body;
    it only travels along with the test case.
    """
    serializer_options = dict(
        sanitize=True,
        omit_optional_tags=False,
        use_trailing_solidus=True,
        space_before_trailing_solidus=False,
        quote_attr_values=True,
        quote_char="'",
    )
    serialized = serialize(parseFragment(input), **serializer_options)

    # The report is built up front (not lazily inside the assert), so it is
    # assembled for passing cases as well.
    report_lines = ("\n\nInput:", input,
                    "\nExpected:", expected,
                    "\nReceived:", serialized)
    failure_report = "\n".join(report_lines)
    assert expected == serialized, failure_report
23 |
| - |
24 |
| - |
25 |
def testSanitizer():
    """Yield one sanitizer test case per entry of every sanitizer data file.

    Each ``*.dat`` file returned by ``get_data_files('sanitizer', '*.dat')``
    is loaded as JSON and iterated; every entry must provide ``"name"``,
    ``"input"`` and ``"output"`` keys.  Each yielded tuple is
    ``(runSanitizerTest, name, input, output)``.
    """
    # io.open accepts an explicit encoding on Python 2 as well as Python 3.
    # The plain built-in open() would decode with the platform default
    # encoding on Python 3, which is not necessarily UTF-8.
    import io

    for filename in get_data_files('sanitizer', '*.dat'):
        with io.open(filename, encoding="utf-8") as fp:
            tests = json.load(fp)
        # The file is fully parsed at this point, so it is closed before
        # any test case is handed to the runner.
        for test in tests:
            yield runSanitizerTest, test["name"], test["input"], test["output"]
| 3 | +try: |
| 4 | + import json |
| 5 | +except ImportError: |
| 6 | + import simplejson as json |
| 7 | + |
| 8 | +from html5lib import html5parser, sanitizer, constants, treebuilders |
| 9 | + |
| 10 | + |
def toxmlFactory():
    """Return a function that serializes one element to a text string.

    The "etree" tree builder is looked up once, when the factory is called,
    and captured by the returned closure.
    """
    etree_builder = treebuilders.getTreeBuilder("etree")

    def toxml(element):
        # tostring() is asked for UTF-8 bytes which are then decoded by hand;
        # this encode/decode roundtrip is required for Python 2.6
        # compatibility.
        return etree_builder.implementation.tostring(
            element, encoding="utf-8").decode("utf-8")

    return toxml
| 20 | + |
| 21 | + |
def runSanitizerTest(name, expected, input, toxml=None):
    """Assert that sanitizing ``input`` produces the tree ``expected`` parses to.

    ``expected`` is parsed as a fragment with a plain ``HTMLParser`` and
    ``input`` is run through ``sanitize_html``; both results are serialized
    with the same ``toxml`` callable and must be equal.

    :arg name: label of the test case, used only in the failure message
    :arg expected: markup whose normally parsed form is the expected result
    :arg input: markup handed to the sanitizer
    :arg toxml: element serializer; one is built with ``toxmlFactory()``
        when omitted
    """
    if toxml is None:
        toxml = toxmlFactory()

    fragment = html5parser.HTMLParser().parseFragment(expected)
    expected = ''.join([toxml(node) for node in fragment])
    # JSON roundtrip kept as-is; presumably it normalizes the string type
    # on Python 2 -- TODO confirm before removing.
    expected = json.loads(json.dumps(expected))

    # Forward the serializer so both sides are rendered identically and no
    # second tree builder lookup is made for every case.
    received = sanitize_html(input, toxml)
    assert expected == received, "\n".join(["\n\nTest:", name,
                                            "\nInput:", input,
                                            "\nExpected:", expected,
                                            "\nReceived:", received])
| 29 | + |
| 30 | + |
def sanitize_html(stream, toxml=None):
    """Parse ``stream`` as a fragment through the sanitizer and serialize it.

    The parser is created with ``sanitizer.HTMLSanitizer`` as its tokenizer.
    Every item of the parsed fragment is rendered with ``toxml`` (built with
    ``toxmlFactory()`` when not supplied) and the pieces are concatenated.
    """
    serialize_node = toxmlFactory() if toxml is None else toxml
    sanitizing_parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)

    pieces = []
    for node in sanitizing_parser.parseFragment(stream):
        pieces.append(serialize_node(node))
    return ''.join(pieces)
| 37 | + |
| 38 | + |
def test_should_handle_astral_plane_characters():
    """Characters outside the Basic Multilingual Plane survive sanitizing.

    The input spells U+1D4B5 and U+1D538 as numeric character references so
    that this source file stays pure ASCII (it carries no coding declaration)
    and the decoding of astral references is exercised as well.
    """
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml">\U0001d4b5 \U0001d538</html:p>'
    assert expected == sanitize_html("<p>&#x1d4b5; &#x1d538;</p>")
| 41 | + |
| 42 | + |
def test_should_allow_relative_uris():
    """A relative ``href`` value is kept unchanged by the sanitizer."""
    markup = '<p><a href="/example.com"></a></p>'
    expected = '<html:p xmlns:html="http://www.w3.org/1999/xhtml"><html:a href="/example.com" /></html:p>'
    assert expected == sanitize_html(markup)
| 45 | + |
| 46 | + |
def test_sanitizer():
    """Generate sanitizer cases from the sanitizer's own allow-lists.

    Every yielded tuple is ``(runSanitizerTest, name, expected, input,
    toxml)``.  ``expected`` is markup that, parsed normally, must give the
    same tree as ``input`` parsed through the sanitizer.  Markup the
    sanitizer rejects is therefore written entity-escaped in ``expected``
    (NOTE(review): this assumes the sanitizer emits disallowed tags as
    escaped text rather than dropping them -- confirm against the html5lib
    version under test).
    """
    toxml = toxmlFactory()

    # Elements that only parse inside a specific parent; not covered yet.
    skipped_tags = ('caption', 'col', 'colgroup', 'optgroup', 'option',
                    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr')

    # Allowed elements are kept; the unknown <bad> child comes back escaped.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        if tag_name in skipped_tags:
            continue  # TODO
        if tag_name != tag_name.lower():
            continue  # TODO
        if tag_name == 'image':
            # The expected tree holds <img> for an <image> start tag.
            expected = "<img title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        elif tag_name == 'br':
            # The closing </br> in the input shows up as a second <br>.
            expected = "<br title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz<br/>"
        elif tag_name in constants.voidElements:
            expected = "<%s title=\"1\"/>foo &lt;bad&gt;bar&lt;/bad&gt; baz" % tag_name
        else:
            expected = "<%s title=\"1\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</%s>" % (tag_name, tag_name)
        yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
               expected,
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    # Upper-cased element names are rejected: the whole tag is expected back
    # as escaped text.
    for tag_name in sanitizer.HTMLSanitizer.allowed_elements:
        tag_name = tag_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_tag" % tag_name,
               "&lt;%s title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/%s&gt;" % (tag_name, tag_name),
               "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name),
               toxml)

    # Allowed attributes are kept; URI-valued ones get an allowed protocol.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        attribute_value = 'foo'
        if attribute_name in sanitizer.HTMLSanitizer.attr_val_is_uri:
            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.HTMLSanitizer.allowed_protocols[0]
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"%s\">foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>" % (attribute_name, attribute_value),
               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
               toxml)

    # Upper-cased attribute names are dropped from the element.
    for attribute_name in sanitizer.HTMLSanitizer.allowed_attributes:
        attribute_name = attribute_name.upper()
        yield (runSanitizerTest, "test_should_forbid_%s_attribute" % attribute_name,
               "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>",
               "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name,
               toxml)

    # Every allowed protocol is accepted as written...
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)

    # Unusable URI values remove the attribute but keep the element.
    yield (runSanitizerTest, "test_invalid_data_uri",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:foobar\"></audio>",
           toxml)

    yield (runSanitizerTest, "test_invalid_ipv6_url",
           "<a>",
           "<a href=\"h://]\">",
           toxml)

    yield (runSanitizerTest, "test_data_uri_disallowed_type",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>",
           toxml)

    # ...and also when the protocol is spelled in upper case.
    for protocol in sanitizer.HTMLSanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        protocol = protocol.upper()
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol,
               "<img src=\"%s:%s\">foo</a>" % (protocol, rest_of_uri),
               """<img src="%s:%s">foo</a>""" % (protocol, rest_of_uri),
               toxml)
0 commit comments