Skip to content

Commit 2abdafc

Browse files
authored
Merge pull request #284 from josecolella/jc-th-add-breakpoint-scrubber
[RubyConf] Create scrubber for replacing double breakpoints into paragraph nodes
2 parents 868a852 + 4d94183 commit 2abdafc

File tree

3 files changed

+67
-0
lines changed

3 files changed

+67
-0
lines changed

Diff for: README.md

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Active Record extensions for HTML sanitization are available in the [`loofah-act
3131
* Add the _nofollow_ attribute to all hyperlinks.
3232
* Add the _target=\_blank_ attribute to all hyperlinks.
3333
* Remove _unprintable_ characters from text nodes.
34+
* Replace _double breakpoints_ (two consecutive `<br>` tags) with paragraph nodes.
3435
* Format markup as plain text, with (or without) sensible whitespace handling around block elements.
3536
* Replace Rails's `strip_tags` and `sanitize` view helper methods.
3637

@@ -235,6 +236,7 @@ doc.scrub!(:noopener) # adds rel="noopener" attribute to links
235236
doc.scrub!(:noreferrer) # adds rel="noreferrer" attribute to links
236237
doc.scrub!(:unprintable) # removes unprintable characters from text nodes
237238
doc.scrub!(:targetblank) # adds target="_blank" attribute to links
239+
doc.scrub!(:double_breakpoint) # replaces double breakpoints (<br><br>) with paragraph nodes
238240
```
239241

240242
See `Loofah::Scrubbers` for more details and example usage.

Diff for: lib/loofah/scrubbers.rb

+52
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,57 @@ def scrub(node)
350350
end
351351
end
352352

353+
#
354+
# === scrub!(:double_breakpoint)
355+
#
356+
# +:double_breakpoint+ replaces double-break tags with closing/opening paragraph tags.
357+
#
358+
# double_breakpoint_markup = "<p>Some text here in a logical paragraph.<br><br>Some more text, apparently a second paragraph.</p>"
359+
# Loofah.html5_fragment(double_breakpoint_markup).scrub!(:double_breakpoint)
360+
# => "<p>Some text here in a logical paragraph.</p><p>Some more text, apparently a second paragraph.</p>"
361+
#
362+
class DoubleBreakpoint < Scrubber
  # Selects the visited paragraph, and any paragraph nested below it, that
  # contains a <br> followed by a later <br> sibling.
  #
  # The path is deliberately relative to the visited node. An absolute
  # "//p[...]" would ignore the node handed to #scrub and re-scan from the
  # document root on every visit.
  PARAGRAPHS_WITH_BREAKS_XPATH = "descendant-or-self::p[br[following-sibling::br]]"

  def initialize # rubocop:disable Lint/MissingSuper
    @direction = :top_down
  end

  # Splits every candidate paragraph in +node+'s subtree at each pair of
  # adjacent <br> elements.
  #
  # Nodes that are not <p> elements are left untouched. Always returns
  # CONTINUE.
  def scrub(node)
    return CONTINUE unless (node.type == Nokogiri::XML::Node::ELEMENT_NODE) && (node.name == "p")

    node.xpath(PARAGRAPHS_WITH_BREAKS_XPATH).each do |paragraph_node|
      split_paragraph(paragraph_node)
    end

    CONTINUE
  end

  private

  # Rebuilds +paragraph_node+ as one or more fresh <p> siblings inserted
  # before it, starting a new <p> at every double break, then unlinks the
  # original paragraph.
  #
  # NOTE(review): the replacement paragraphs are created from a bare "<p>",
  # so attributes on the original paragraph are not carried over.
  def split_paragraph(paragraph_node)
    new_paragraph = paragraph_node.add_previous_sibling("<p>").first

    # Drop whitespace-only text first so "<br>\n<br>" counts as adjacent.
    paragraph_node.children.each do |child|
      remove_blank_text_nodes(child)
    end

    paragraph_node.children.each do |child|
      # already unlinked (blank text, or the second <br> of a consumed pair)
      next if child.parent.nil?

      if double_break?(child)
        new_paragraph = paragraph_node.add_previous_sibling("<p>").first
        child.next_sibling.unlink
        child.unlink
      else
        child.parent = new_paragraph
      end
    end

    paragraph_node.unlink
  end

  # True when +node+ is a <br> immediately followed by another <br>.
  # Safe navigation covers a trailing <br>, which has no next sibling.
  def double_break?(node)
    node.name == "br" && node.next_sibling&.name == "br"
  end

  # Unlinks +node+ when it is a whitespace-only text node.
  def remove_blank_text_nodes(node)
    node.unlink if node.text? && node.blank?
  end
end
353404
#
354405
# A hash that maps a symbol (like +:prune+) to the appropriate Scrubber (Loofah::Scrubbers::Prune).
355406
#
@@ -364,6 +415,7 @@ def scrub(node)
364415
targetblank: TargetBlank,
365416
newline_block_elements: NewlineBlockElements,
366417
unprintable: Unprintable,
418+
double_breakpoint: DoubleBreakpoint,
367419
}
368420

369421
class << self

Diff for: test/integration/test_scrubbers.rb

+13
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,9 @@ class IntegrationTestScrubbers < Loofah::TestCase
5050
ENTITY_HACK_ATTACK_TEXT_SCRUB = "Hack attack!&lt;script&gt;alert('evil')&lt;/script&gt;"
5151
ENTITY_HACK_ATTACK_TEXT_SCRUB_UNESC = "Hack attack!<script>alert('evil')</script>"
5252

53+
BREAKPOINT_FRAGMENT = "<p>Some text here in a logical paragraph.<br><br>Some more text, apparently a second paragraph.<br><br>Et cetera...</p>"
54+
BREAKPOINT_RESULT = "<p>Some text here in a logical paragraph.</p><p>Some more text, apparently a second paragraph.</p><p>Et cetera...</p>"
55+
5356
context "scrubbing shortcuts" do
5457
context "#scrub_document" do
5558
it "is a shortcut for parse-and-scrub" do
@@ -236,6 +239,16 @@ def html5?
236239
assert_equal doc, result
237240
end
238241
end
242+
243+
context ":double_breakpoint" do
244+
it "replaces double line breaks with paragraph tags" do
245+
doc = klass.parse("<html><body>#{BREAKPOINT_FRAGMENT}</body></html>")
246+
result = doc.scrub!(:double_breakpoint)
247+
248+
assert_equal BREAKPOINT_RESULT, doc.xpath("/html/body").inner_html.delete("\n")
249+
assert_equal doc, result
250+
end
251+
end
239252
end
240253

241254
context "#text" do

0 commit comments

Comments
 (0)