From df767c4fc139bace59517b118816ae4630f88fdc Mon Sep 17 00:00:00 2001 From: Sten Larsson Date: Fri, 5 Apr 2024 11:40:58 +0200 Subject: [PATCH] Preserve control characters If a control character like `\u0002` appears in the XML, it is preserved by the REXML parser, but the Nokogiri parser bails out with incomplete XML. Note that scrubbing the string does not help in this case, since the character is valid Unicode but invalid in XML 1.0. To handle this, we extract the character's numeric value from the error message and pass the character on as character data. For parsing to continue, we must also tell Nokogiri to recover from errors. --- lib/nori/parser/nokogiri.rb | 10 ++++++++-- spec/nori/nori_spec.rb | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/nori/parser/nokogiri.rb b/lib/nori/parser/nokogiri.rb index 533de70..6db8ebc 100644 --- a/lib/nori/parser/nokogiri.rb +++ b/lib/nori/parser/nokogiri.rb @@ -46,7 +46,11 @@ def characters(string) alias cdata_block characters def error(message) - @last_error = message + if (invalid_chr = message[/PCDATA invalid Char value (\d+)/, 1]) + characters(invalid_chr.to_i.chr) + else + @last_error = message + end end end @@ -54,7 +58,9 @@ def self.parse(xml, options) document = Document.new document.options = options parser = ::Nokogiri::XML::SAX::Parser.new document - parser.parse xml + parser.parse xml do |ctx| + ctx.recovery = true + end raise ParseError, document.last_error if document.last_error document.stack.length > 0 ? document.stack.pop.to_hash : {} end diff --git a/spec/nori/nori_spec.rb b/spec/nori/nori_spec.rb index d11a6f4..dd21e6c 100644 --- a/spec/nori/nori_spec.rb +++ b/spec/nori/nori_spec.rb @@ -644,6 +644,10 @@ expect { parse('foo bar') }.to raise_error(Nori::ParseError) end + it "should preserve control characters" do + xml = "<tag>a\u0002c</tag>".force_encoding('UTF-8') + expect(parse(xml)["tag"]).to eq("a\u0002c") + end end end