Review note (text before the first "diff --git" line is skipped by patch tools):
line breaks, blank context lines and indentation were reconstructed from the hunk
counts of a collapsed copy of this patch -- confirm them against the target files.

diff --git a/lib/nori/parser/nokogiri.rb b/lib/nori/parser/nokogiri.rb
index af45ecb..8dd9699 100644
--- a/lib/nori/parser/nokogiri.rb
+++ b/lib/nori/parser/nokogiri.rb
@@ -44,13 +44,27 @@ def characters(string)
 
         alias cdata_block characters
 
+        # Handles an error message reported by the SAX parser.
+        #
+        # A "PCDATA invalid Char value N" message is turned back into the character
+        # with code point N and passed to #characters; any other message is raised.
+        def error(message)
+          raise message unless (invalid_chr = message[/PCDATA invalid Char value (\d+)/, 1])
+
+          # Integer#chr without an encoding raises RangeError above 255 and otherwise
+          # returns a non-UTF-8 string, so request UTF-8 explicitly.
+          characters(invalid_chr.to_i.chr(Encoding::UTF_8))
+        end
       end
 
       def self.parse(xml, options)
         document = Document.new
         document.options = options
         parser = ::Nokogiri::XML::SAX::Parser.new document
-        parser.parse xml
+        # Recovery mode keeps the parser going after it reports an error.
+        parser.parse xml do |ctx|
+          ctx.recovery = true
+        end
         document.stack.length > 0 ? document.stack.pop.to_hash : {}
       end
 
diff --git a/spec/nori/nori_spec.rb b/spec/nori/nori_spec.rb
index e673722..ce6f1b1 100644
--- a/spec/nori/nori_spec.rb
+++ b/spec/nori/nori_spec.rb
@@ -640,6 +640,10 @@
         expect(parse(' ')).to eq({})
       end
 
+      it "should preserve control characters" do
+        xml = "<tag>a\u0002c</tag>".force_encoding('UTF-8')
+        expect(parse(xml)["tag"]).to eq("a\u0002c")
+      end
     end
   end
 