|
1 | 1 | from __future__ import absolute_import, division, unicode_literals
|
2 | 2 |
|
3 |
| -try: |
4 |
| - import json |
5 |
| -except ImportError: |
6 |
| - import simplejson as json |
7 |
| - |
8 |
| -from html5lib import html5parser, sanitizer, constants, treebuilders |
9 |
| - |
10 |
| - |
11 |
| -def toxmlFactory(): |
12 |
| - tree = treebuilders.getTreeBuilder("etree") |
13 |
| - |
14 |
| - def toxml(element): |
15 |
| - # encode/decode roundtrip required for Python 2.6 compatibility |
16 |
| - result_bytes = tree.implementation.tostring(element, encoding="utf-8") |
17 |
| - return result_bytes.decode("utf-8") |
18 |
| - |
19 |
| - return toxml |
20 |
| - |
21 |
| - |
22 |
| -def runSanitizerTest(name, expected, input, toxml=None): |
23 |
| - if toxml is None: |
24 |
| - toxml = toxmlFactory() |
25 |
| - expected = ''.join([toxml(token) for token in html5parser.HTMLParser(). |
26 |
| - parseFragment(expected)]) |
27 |
| - expected = json.loads(json.dumps(expected)) |
| 3 | +from html5lib import constants |
| 4 | +from html5lib import parseFragment, serialize |
| 5 | +from html5lib.filters import sanitizer |
| 6 | + |
| 7 | + |
def runSanitizerTest(name, expected, input):
    """Assert that sanitizing ``input`` yields the normalized ``expected`` markup.

    ``expected`` is parsed as a fragment and re-serialized (without the
    ``sanitize`` option) using the same formatting options that
    ``sanitize_html`` passes, so both sides of the comparison go through
    the same serializer settings.

    :arg name: label for the generated test case; not used in the body
    :arg expected: markup the sanitized output should be equivalent to
    :arg input: markup handed to ``sanitize_html``
    """
    serializer_options = dict(omit_optional_tags=False,
                              use_trailing_solidus=True,
                              space_before_trailing_solidus=False,
                              quote_attr_values=True,
                              quote_char='"')
    normalized = serialize(parseFragment(expected), **serializer_options)
    actual = sanitize_html(input)
    assert normalized == actual
|
29 | 17 |
|
30 | 18 |
|
31 |
| -def sanitize_html(stream, toxml=None): |
32 |
| - if toxml is None: |
33 |
| - toxml = toxmlFactory() |
34 |
| - return ''.join([toxml(token) for token in |
35 |
| - html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer). |
36 |
| - parseFragment(stream)]) |
def sanitize_html(stream):
    """Parse ``stream`` as an HTML fragment and return it serialized with sanitizing on.

    The serializer is called with ``sanitize=True`` plus fixed formatting
    options: optional tags kept, trailing solidus without a preceding
    space, and attribute values always quoted with double quotes.
    """
    serializer_options = dict(sanitize=True,
                              omit_optional_tags=False,
                              use_trailing_solidus=True,
                              space_before_trailing_solidus=False,
                              quote_attr_values=True,
                              quote_char='"')
    return serialize(parseFragment(stream), **serializer_options)
37 | 29 |
|
38 | 30 |
|
def test_should_handle_astral_plane_characters():
    """Characters outside the Basic Multilingual Plane must pass through unchanged."""
    sanitized = sanitize_html("<p>𝒵 𝔸</p>")
    assert '<p>\U0001d4b5 \U0001d538</p>' == sanitized
41 | 33 |
|
42 | 34 |
|
def test_should_allow_relative_uris():
    """A relative ``href`` must come out of the sanitizer untouched."""
    markup = '<p><a href="/example.com"></a></p>'
    sanitized = sanitize_html(markup)
    assert '<p><a href="/example.com"></a></p>' == sanitized
45 | 37 |
|
46 | 38 |
|
def test_sanitizer():
    """Generate sanitizer test cases.

    Each yielded item is a tuple ``(runSanitizerTest, name, expected,
    input)``; the last three elements match ``runSanitizerTest``'s
    parameters. Presumably this is consumed by a generator-test runner
    that calls the first element with the remaining ones -- confirm
    against the project's test configuration.

    Cases are produced, in order, for: allowed elements, allowed
    attributes, allowed protocols as listed, three fixed URI edge cases,
    and allowed protocols upper-cased.

    NOTE(review): the expected strings below contain literal ``<bad>``
    markup. If this text was recovered from an HTML rendering, entity
    escapes (e.g. ``&lt;bad&gt;``) may have been decoded on the way --
    verify against the repository copy.
    """
    # Elements that are not exercised yet (marked TODO in the original).
    skipped_tags = ['caption', 'col', 'colgroup', 'optgroup', 'option',
                    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']

    for _ns, tag_name in sanitizer.allowed_elements:
        if tag_name in skipped_tags:
            continue  # TODO
        if tag_name != tag_name.lower():
            continue  # TODO

        # The input markup has the same shape for every element; only the
        # expected output depends on the kind of element.
        markup = "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)
        if tag_name == 'image':
            # The expectation for <image> is written with an <img> tag.
            expected = "<img title=\"1\"/>foo <bad>bar</bad> baz"
        elif tag_name == 'br':
            # The expectation for <br> has a second <br/> where the
            # input has </br>.
            expected = "<br title=\"1\"/>foo <bad>bar</bad> baz<br/>"
        elif tag_name in constants.voidElements:
            expected = "<%s title=\"1\"/>foo <bad>bar</bad> baz" % tag_name
        else:
            expected = "<%s title=\"1\">foo <bad>bar</bad> baz</%s>" % (tag_name, tag_name)
        yield (runSanitizerTest, "test_should_allow_%s_tag" % tag_name,
               expected, markup)

    for _ns, attribute_name in sanitizer.allowed_attributes:
        if attribute_name != attribute_name.lower():
            continue  # TODO
        if attribute_name == 'style':
            continue
        attribute_value = 'foo'
        # NOTE(review): allowed_attributes is iterated as (namespace, name)
        # pairs; if attr_val_is_uri holds pairs as well, this bare-name
        # membership test never matches, and allowed_protocols must be
        # indexable for the [0] below to work -- confirm against the
        # sanitizer module.
        if attribute_name in sanitizer.attr_val_is_uri:
            attribute_value = '%s://sub.domain.tld/path/object.ext' % sanitizer.allowed_protocols[0]
        yield (runSanitizerTest, "test_should_allow_%s_attribute" % attribute_name,
               "<p %s=\"%s\">foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value),
               "<p %s='%s'>foo <bad>bar</bad> baz</p>" % (attribute_name, attribute_value))

    # Expected and input markup are the same string for the protocol cases.
    img_template = '<img src="%s:%s">foo</a>'

    for protocol in sanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        # This loop uses the protocol as listed, so the case name must not
        # say "uppercase" (that label belongs to the last loop).
        yield (runSanitizerTest, "test_should_allow_%s_uris" % protocol,
               img_template % (protocol, rest_of_uri),
               img_template % (protocol, rest_of_uri))

    yield (runSanitizerTest, "test_invalid_data_uri",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:foobar\"></audio>")

    yield (runSanitizerTest, "test_invalid_ipv6_url",
           "<a>",
           "<a href=\"h://]\">")

    yield (runSanitizerTest, "test_data_uri_disallowed_type",
           "<audio controls=\"\"></audio>",
           "<audio controls=\"\" src=\"data:text/html,<html>\"></audio>")

    for protocol in sanitizer.allowed_protocols:
        rest_of_uri = '//sub.domain.tld/path/object.ext'
        # The 'data' check runs on the protocol as listed, before
        # upper-casing.
        if protocol == 'data':
            rest_of_uri = 'image/png;base64,aGVsbG8gd29ybGQ='
        upper_protocol = protocol.upper()
        yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % upper_protocol,
               img_template % (upper_protocol, rest_of_uri),
               img_template % (upper_protocol, rest_of_uri))
0 commit comments