diff --git a/Include/internal/pycore_bytesobject.h b/Include/internal/pycore_bytesobject.h index 300e7f4896a39e..c145cfa57f2bd6 100644 --- a/Include/internal/pycore_bytesobject.h +++ b/Include/internal/pycore_bytesobject.h @@ -20,8 +20,8 @@ extern PyObject* _PyBytes_FromHex( // Helper for PyBytes_DecodeEscape that detects invalid escape chars. // Export for test_peg_generator. -PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape(const char *, Py_ssize_t, - const char *, const char **); +PyAPI_FUNC(PyObject*) _PyBytes_DecodeEscape2(const char *, Py_ssize_t, + const char *, int *); // Substring Search. diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index a60372f58295a9..038b704e829765 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -141,14 +141,14 @@ extern PyObject* _PyUnicode_DecodeUnicodeEscapeStateful( // Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape // chars. // Export for test_peg_generator. -PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal( +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal2( const char *string, /* Unicode-Escape encoded string */ Py_ssize_t length, /* size of string */ const char *errors, /* error handling */ Py_ssize_t *consumed, /* bytes consumed */ - const char **first_invalid_escape); /* on return, points to first - invalid escaped char in - string. */ + int *first_invalid_escape); /* on return, if not -1, contain the first + invalid escaped char (<= 0xff) or invalid + octal escape (> 0xff) in string. */ /* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */ diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 86e5e5c1474674..a767f67a02cf56 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -2,6 +2,7 @@ import codecs import html.entities import itertools +import re import sys import unicodedata import unittest @@ -1125,7 +1126,7 @@ def test_bug828737(self): text = 'abcghi'*n text.translate(charmap) - def test_mutatingdecodehandler(self): + def test_mutating_decode_handler(self): baddata = [ ("ascii", b"\xff"), ("utf-7", b"++"), @@ -1160,6 +1161,42 @@ def mutating(exc): for (encoding, data) in baddata: self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242") + def test_mutating_decode_handler_unicode_escape(self): + decode = codecs.unicode_escape_decode + def mutating(exc): + if isinstance(exc, UnicodeDecodeError): + r = data.get(exc.object[:exc.end]) + if r is not None: + exc.object = r[0] + exc.object[exc.end:] + return ('\u0404', r[1]) + raise AssertionError("don't know how to handle %r" % exc) + + codecs.register_error('test.mutating2', mutating) + data = { + br'\x0': (b'\\', 0), + br'\x3': (b'xxx\\', 3), + br'\x5': (b'x\\', 1), + } + def check(input, expected, msg): + with self.assertWarns(DeprecationWarning) as cm: + self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input))) + self.assertIn(msg, str(cm.warning)) + + check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence') + check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence') + check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence') + + check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence') + check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence') + check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence') + check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence') + check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence') + + check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence') + check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence') + check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence') + check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence') + # issue32583 def test_crashing_decode_handler(self): # better generating one more character to fill the extra space slot diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index e51f7e0ee12b1f..4143756a460445 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1196,23 +1196,39 @@ def test_escape(self): check(br"[\1010]", b"[A0]") check(br"[\x41]", b"[A]") check(br"[\x410]", b"[A0]") + + def test_warnings(self): + decode = codecs.escape_decode + check = coding_checker(self, decode) for i in range(97, 123): b = bytes([i]) if b not in b'abfnrtvx': - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\%c" is an invalid escape sequence' % i): check(b"\\" + b, b"\\" + b) - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\%c" is an invalid escape sequence' % (i-32)): check(b"\\" + b.upper(), b"\\" + b.upper()) - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\8" is an invalid escape sequence'): check(br"\8", b"\\8") with self.assertWarns(DeprecationWarning): check(br"\9", b"\\9") - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\\xfa" is an invalid escape sequence') as cm: check(b"\\\xfa", b"\\\xfa") for i in range(0o400, 0o1000): - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\%o" is an invalid octal escape sequence' % i): check(rb'\%o' % i, bytes([i & 0o377])) + with self.assertWarnsRegex(DeprecationWarning, + r'"\\z" is an invalid escape sequence'): + self.assertEqual(decode(br'\x\z', 'ignore'), (b'\\z', 4)) + with self.assertWarnsRegex(DeprecationWarning, + r'"\\501" is an invalid octal escape sequence'): + self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6)) + def test_errors(self): decode = codecs.escape_decode self.assertRaises(ValueError, decode, br"\x") @@ -2661,24 +2677,40 @@ def test_escape_decode(self): check(br"[\x410]", "[A0]") check(br"\u20ac", "\u20ac") check(br"\U0001d120", "\U0001d120") + + def test_decode_warnings(self): + decode = codecs.unicode_escape_decode + check = coding_checker(self, decode) for i in range(97, 123): b = bytes([i]) if b not in b'abfnrtuvx': - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\%c" is an invalid escape sequence' % i): check(b"\\" + b, "\\" + chr(i)) if b.upper() not in b'UN': - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\%c" is an invalid escape sequence' % (i-32)): check(b"\\" + b.upper(), "\\" + chr(i-32)) - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\8" is an invalid escape sequence'): check(br"\8", "\\8") with self.assertWarns(DeprecationWarning): check(br"\9", "\\9") - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\\xfa" is an invalid escape sequence') as cm: check(b"\\\xfa", "\\\xfa") for i in range(0o400, 0o1000): - with self.assertWarns(DeprecationWarning): + with self.assertWarnsRegex(DeprecationWarning, + r'"\\%o" is an invalid octal escape sequence' % i): check(rb'\%o' % i, chr(i)) + with self.assertWarnsRegex(DeprecationWarning, + r'"\\z" is an invalid escape sequence'): + self.assertEqual(decode(br'\x\z', 'ignore'), ('\\z', 4)) + with self.assertWarnsRegex(DeprecationWarning, + r'"\\501" is an invalid octal escape sequence'): + self.assertEqual(decode(br'\x\501', 'ignore'), ('\u0141', 6)) + def test_decode_errors(self): decode = codecs.unicode_escape_decode for c, d in (b'x', 2), (b'u', 4), (b'U', 4): diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index b3d1c425ad18b7..1d0494c5cb3c14 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1076,10 +1076,10 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len, } /* Unescape a backslash-escaped string. */ -PyObject *_PyBytes_DecodeEscape(const char *s, +PyObject *_PyBytes_DecodeEscape2(const char *s, Py_ssize_t len, const char *errors, - const char **first_invalid_escape) + int *first_invalid_escape) { int c; char *p; @@ -1093,7 +1093,7 @@ PyObject *_PyBytes_DecodeEscape(const char *s, return NULL; writer.overallocate = 1; - *first_invalid_escape = NULL; + *first_invalid_escape = -1; end = s + len; while (s < end) { @@ -1131,9 +1131,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s, c = (c<<3) + *s++ - '0'; } if (c > 0377) { - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-3; /* Back up 3 chars, since we've - already incremented s. */ + if (*first_invalid_escape == -1) { + *first_invalid_escape = c; } } *p++ = c; @@ -1174,9 +1173,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s, break; default: - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-1; /* Back up one char, since we've - already incremented s. */ + if (*first_invalid_escape == -1) { + *first_invalid_escape = (unsigned char)s[-1]; } *p++ = '\\'; s--; @@ -1196,16 +1194,15 @@ PyObject *PyBytes_DecodeEscape(const char *s, Py_ssize_t Py_UNUSED(unicode), const char *Py_UNUSED(recode_encoding)) { - const char* first_invalid_escape; - PyObject *result = _PyBytes_DecodeEscape(s, len, errors, + int first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape2(s, len, errors, &first_invalid_escape); if (result == NULL) return NULL; - if (first_invalid_escape != NULL) { - unsigned char c = *first_invalid_escape; - if ('4' <= c && c <= '7') { + if (first_invalid_escape != -1) { + if (first_invalid_escape > 0xff) { if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "b\"\\%.3s\" is an invalid octal escape sequence. " + "b\"\\%o\" is an invalid octal escape sequence. " "Such sequences will not work in the future. ", first_invalid_escape) < 0) { @@ -1217,7 +1214,7 @@ PyObject *PyBytes_DecodeEscape(const char *s, if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "b\"\\%c\" is an invalid escape sequence. " "Such sequences will not work in the future. ", - c) < 0) + first_invalid_escape) < 0) { Py_DECREF(result); return NULL; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 75967d69ed374d..de0f1087c04de1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6599,11 +6599,11 @@ _PyUnicode_GetNameCAPI(void) /* --- Unicode Escape Codec ----------------------------------------------- */ PyObject * -_PyUnicode_DecodeUnicodeEscapeInternal(const char *s, +_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s, Py_ssize_t size, const char *errors, Py_ssize_t *consumed, - const char **first_invalid_escape) + int *first_invalid_escape) { const char *starts = s; _PyUnicodeWriter writer; @@ -6613,7 +6613,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, _PyUnicode_Name_CAPI *ucnhash_capi; // so we can remember if we've seen an invalid escape char or not - *first_invalid_escape = NULL; + *first_invalid_escape = -1; if (size == 0) { if (consumed) { @@ -6701,9 +6701,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, } } if (ch > 0377) { - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-3; /* Back up 3 chars, since we've - already incremented s. */ + if (*first_invalid_escape == -1) { + *first_invalid_escape = ch; } } WRITE_CHAR(ch); @@ -6798,9 +6797,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s, goto error; default: - if (*first_invalid_escape == NULL) { - *first_invalid_escape = s-1; /* Back up one char, since we've - already incremented s. */ + if (*first_invalid_escape == -1) { + *first_invalid_escape = c; } WRITE_ASCII_CHAR('\\'); WRITE_CHAR(c); @@ -6845,17 +6843,16 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, const char *errors, Py_ssize_t *consumed) { - const char *first_invalid_escape; - PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors, + int first_invalid_escape; + PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors, consumed, &first_invalid_escape); if (result == NULL) return NULL; - if (first_invalid_escape != NULL) { - unsigned char c = *first_invalid_escape; - if ('4' <= c && c <= '7') { + if (first_invalid_escape != -1) { + if (first_invalid_escape > 0xff) { if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "\"\\%.3s\" is an invalid octal escape sequence. " + "\"\\%o\" is an invalid octal escape sequence. " "Such sequences will not work in the future. ", first_invalid_escape) < 0) { @@ -6867,7 +6864,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s, if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "\"\\%c\" is an invalid escape sequence. " "Such sequences will not work in the future. ", - c) < 0) + first_invalid_escape) < 0) { Py_DECREF(result); return NULL; diff --git a/Parser/string_parser.c b/Parser/string_parser.c index 9dd8f9ef28bd4f..40cbdd554100cf 100644 --- a/Parser/string_parser.c +++ b/Parser/string_parser.c @@ -1,8 +1,8 @@ #include #include -#include "pycore_bytesobject.h" // _PyBytes_DecodeEscape() -#include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal() +#include "pycore_bytesobject.h" // _PyBytes_DecodeEscape2() +#include "pycore_unicodeobject.h" // _PyUnicode_DecodeUnicodeEscapeInternal2() #include "lexer/state.h" #include "pegen.h" @@ -11,33 +11,35 @@ //// STRING HANDLING FUNCTIONS //// static int -warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t) +warn_invalid_escape_sequence(Parser *p, int first_invalid_escape, Token *t) { if (p->call_invalid_rules) { // Do not report warnings if we are in the second pass of the parser // to avoid showing the warning twice. return 0; } - unsigned char c = (unsigned char)*first_invalid_escape; - if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) && (c == '{' || c == '}')) { + if ((t->type == FSTRING_MIDDLE || t->type == FSTRING_END) + && (first_invalid_escape == '{' || first_invalid_escape == '}')) + { // in this case the tokenizer has already emitted a warning, // see Parser/tokenizer/helpers.c:warn_invalid_escape_sequence return 0; } - int octal = ('4' <= c && c <= '7'); + assert(first_invalid_escape >= 0); + int octal = (first_invalid_escape > 0xff); PyObject *msg = octal ? PyUnicode_FromFormat( - "\"\\%.3s\" is an invalid octal escape sequence. " + "\"\\%o\" is an invalid octal escape sequence. " "Such sequences will not work in the future. " - "Did you mean \"\\\\%.3s\"? A raw string is also an option.", + "Did you mean \"\\\\%o\"? A raw string is also an option.", first_invalid_escape, first_invalid_escape) : PyUnicode_FromFormat( "\"\\%c\" is an invalid escape sequence. " "Such sequences will not work in the future. " "Did you mean \"\\\\%c\"? A raw string is also an option.", - c, c); + first_invalid_escape, first_invalid_escape); if (msg == NULL) { return -1; } @@ -61,15 +63,15 @@ warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token p->known_err_token = t; if (octal) { RAISE_SYNTAX_ERROR( - "\"\\%.3s\" is an invalid octal escape sequence. " - "Did you mean \"\\\\%.3s\"? A raw string is also an option.", + "\"\\%o\" is an invalid octal escape sequence. " + "Did you mean \"\\\\%o\"? A raw string is also an option.", first_invalid_escape, first_invalid_escape); } else { RAISE_SYNTAX_ERROR( "\"\\%c\" is an invalid escape sequence. " "Did you mean \"\\\\%c\"? A raw string is also an option.", - c, c); + first_invalid_escape, first_invalid_escape); } } Py_DECREF(msg); @@ -157,34 +159,31 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t) len = (size_t)(p - buf); s = buf; - const char *first_invalid_escape; - v = _PyUnicode_DecodeUnicodeEscapeInternal(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape); + int first_invalid_escape; + v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL, &first_invalid_escape); + Py_XDECREF(u); // HACK: later we can simply pass the line no, since we don't preserve the tokens // when we are decoding the string but we preserve the line numbers. - if (v != NULL && first_invalid_escape != NULL && t != NULL) { + if (v != NULL && first_invalid_escape != -1 && t != NULL) { if (warn_invalid_escape_sequence(parser, first_invalid_escape, t) < 0) { - /* We have not decref u before because first_invalid_escape points - inside u. */ - Py_XDECREF(u); Py_DECREF(v); return NULL; } } - Py_XDECREF(u); return v; } static PyObject * decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t) { - const char *first_invalid_escape; - PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape); + int first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL, &first_invalid_escape); if (result == NULL) { return NULL; } - if (first_invalid_escape != NULL) { + if (first_invalid_escape != -1) { if (warn_invalid_escape_sequence(p, first_invalid_escape, t) < 0) { Py_DECREF(result); return NULL;