diff --git a/lib/nori/parser/nokogiri.rb b/lib/nori/parser/nokogiri.rb
index 533de70..6db8ebc 100644
--- a/lib/nori/parser/nokogiri.rb
+++ b/lib/nori/parser/nokogiri.rb
@@ -46,7 +46,15 @@ def characters(string)
         alias cdata_block characters
 
         def error(message)
-          @last_error = message
+          # An "invalid Char value N" error is not treated as fatal: N is read
+          # as a decimal code point and fed back in as character data.
+          # Integer#chr without an encoding raises RangeError above 255 and
+          # yields a binary string for 128..255, so build the character as UTF-8.
+          if (invalid_chr = message[/PCDATA invalid Char value (\d+)/, 1])
+            characters(invalid_chr.to_i.chr(Encoding::UTF_8))
+          else
+            @last_error = message
+          end
         end
       end
 
@@ -54,7 +62,11 @@ def self.parse(xml, options)
         document = Document.new
         document.options = options
         parser = ::Nokogiri::XML::SAX::Parser.new document
-        parser.parse xml
+        # Enable recovery so parsing continues past an error; any error that
+        # Document#error records in last_error still raises ParseError below.
+        parser.parse xml do |ctx|
+          ctx.recovery = true
+        end
         raise ParseError, document.last_error if document.last_error
         document.stack.length > 0 ? document.stack.pop.to_hash : {}
       end
diff --git a/spec/nori/nori_spec.rb b/spec/nori/nori_spec.rb
index d11a6f4..dd21e6c 100644
--- a/spec/nori/nori_spec.rb
+++ b/spec/nori/nori_spec.rb
@@ -644,6 +644,10 @@
         expect { parse('foo bar') }.to raise_error(Nori::ParseError)
       end
 
+      it "should preserve control characters" do
+        xml = "<tag>a\u0002c</tag>".force_encoding('UTF-8')
+        expect(parse(xml)["tag"]).to eq("a\u0002c")
+      end
     end
   end
 