- lexer-strings.rb: Avoid an exception on utf8 surrogate pair codepoints (#1051)

Earlopain · web-flow · commit e261316b84b9 · 2025-03-31T11:20:32.000+09:00
Starting from Ruby 2.4, these are a syntax error.
I don't see an easy way of representing such strings.
Right now the parser actually crashes (in all versions), so I'd say it's an improvement.
diff --git a/lib/parser/lexer-strings.rl b/lib/parser/lexer-strings.rl
@@ -429,6 +429,15 @@ class Parser::LexerStrings
           break
         end
 
+        # UTF-16 surrogates. These are actually accepted before Ruby 2.4
+        # but can't be represented in the AST. Make them a syntax error in
+        # all versions instead; Ruby would raise an exception otherwise.
+        if codepoint & 0xfffff800 == 0xd800
+          diagnostic :error, :invalid_unicode_escape, nil,
+                     range(codepoint_s, codepoint_s + codepoint_str.length)
+          break
+        end
+
         @escape += codepoint.chr(Encoding::UTF_8)
         codepoint_s += codepoint_str.length
       end
diff --git a/test/test_parser.rb b/test/test_parser.rb
@@ -5782,6 +5782,25 @@ def test_codepoint_too_large
       SINCE_1_9)
   end
 
+  def test_codepoint_surrogate
+    assert_diagnoses(
+      [:error, :invalid_unicode_escape],
+      %q{"\u{D800}"},
+      %q{    ~~~~ location})
+
+    assert_diagnoses(
+      [:error, :invalid_unicode_escape],
+      %q{"\u{DFFF}"},
+      %q{    ~~~~ location})
+
+    [
+      %q{"\u{D7FF}"},
+      %q{"\u{E000}"},
+    ].each do |code|
+      refute_diagnoses(code)
+    end
+  end
+
   def test_on_error
     assert_diagnoses(
       [:error, :unexpected_token, { :token => 'tIDENTIFIER' }],