From 855fb6baafb22a5a3bfe1c129032cf04845d83a1 Mon Sep 17 00:00:00 2001 From: Sten Larsson Date: Fri, 5 Apr 2024 11:40:58 +0200 Subject: [PATCH] Preserve control characters If a control character like `\u0002` appears in the XML, it is preserved by the REXML parser, but the Nokogiri parser bails out with incomplete XML. Note that scrubbing the string does not help in this case since this is a valid Unicode character, but it is invalid in XML 1.0. To handle this, we extract the character from the error message. For parsing to continue, we must also tell Nokogiri to recover from errors. --- lib/nori/parser/nokogiri.rb | 9 ++++++++- spec/nori/nori_spec.rb | 4 ++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/nori/parser/nokogiri.rb b/lib/nori/parser/nokogiri.rb index af45ecb..8dd9699 100644 --- a/lib/nori/parser/nokogiri.rb +++ b/lib/nori/parser/nokogiri.rb @@ -44,13 +44,20 @@ def characters(string) alias cdata_block characters + def error(message) + raise message unless (invalid_chr = message[/PCDATA invalid Char value (\d+)/, 1]) + + characters(invalid_chr.to_i.chr) + end end def self.parse(xml, options) document = Document.new document.options = options parser = ::Nokogiri::XML::SAX::Parser.new document - parser.parse xml + parser.parse xml do |ctx| + ctx.recovery = true + end document.stack.length > 0 ? document.stack.pop.to_hash : {} end diff --git a/spec/nori/nori_spec.rb b/spec/nori/nori_spec.rb index e673722..ce6f1b1 100644 --- a/spec/nori/nori_spec.rb +++ b/spec/nori/nori_spec.rb @@ -640,6 +640,10 @@ expect(parse(' ')).to eq({}) end + it "should preserve control characters" do + xml = "<tag>a\u0002c</tag>".force_encoding('UTF-8') + expect(parse(xml)["tag"]).to eq("a\u0002c") + end end end