Simpler utf8_decode (#414)

chqrlie · web-flow · commit 921c1eef509d · 2024-05-27T08:15:52.000+02:00
- no longer pass the array length to `utf8_decode`
- add `utf8_decode_len` for edge cases
- use switch based dispatch in `utf8_decode_len` to work around a gcc 12.2 optimizer bug
diff --git a/cutils.c b/cutils.c
@@ -276,17 +276,19 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
 
 /* Decode a single code point from a UTF-8 encoded array of bytes
    `p` is a valid pointer to an array of bytes
-   `max_len` is the number of bytes available in the array
    `pp` is a valid pointer to a `const uint8_t *` to store a pointer
    to the byte following the current sequence.
    Return the code point at `p`, in the range `0..0x10FFFF`
    Return 0xFFFD on error. Only a single byte is consumed in this case
    The maximum length for a UTF-8 byte sequence is 4 bytes.
    This implements the algorithm specified in whatwg.org, except it accepts
    UTF-8 encoded surrogates as JavaScript allows them in strings.
+   The source string is assumed to have at least UTF8_CHAR_LEN_MAX bytes
+   or be null terminated.
+   If `p[0]` is '\0', the return value is `0` and the byte is consumed.
    cf: https://encoding.spec.whatwg.org/#utf-8-encoder
  */
-uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
+uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp)
 {
     uint32_t c;
     uint8_t lower, upper;
@@ -305,10 +307,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
     case 0xD4: case 0xD5: case 0xD6: case 0xD7:
     case 0xD8: case 0xD9: case 0xDA: case 0xDB:
     case 0xDC: case 0xDD: case 0xDE: case 0xDF:
-        if (max_len < 2) {
-            // need more bytes
-            break;
-        }
         if (*p >= 0x80 && *p <= 0xBF) {
             *pp = p + 1;
             return ((c - 0xC0) << 6) + (*p - 0x80);
@@ -324,10 +322,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
     case 0xEC: case 0xED: case 0xEE: case 0xEF:
         lower = 0x80;
     need2:
-        if (max_len < 3) {
-            // need more bytes
-            break;
-        }
         if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
             *pp = p + 2;
             return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
@@ -346,10 +340,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
         lower = 0x80;
         upper = 0xBF;
     need3:
-        if (max_len < 4) {
-            // need more bytes
-            break;
-        }
         if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
         &&  p[2] >= 0x80 && p[2] <= 0xBF) {
             *pp = p + 3;
@@ -366,6 +356,31 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
     return 0xFFFD;
 }
 
+uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp) {  /* length-checked front end for utf8_decode() */
+    switch (max_len) {  /* dispatch on the number of bytes available at `p` */
+    case 0:  /* nothing to read: report an error without consuming anything */
+        *pp = p;
+        return 0xFFFD;
+    case 1:  /* 1 byte available: only a first byte below 0x80 is passed on */
+        if (*p < 0x80)
+            goto good;
+        break;
+    case 2:  /* 2 bytes available: only a first byte below 0xE0 is passed on */
+        if (*p < 0xE0)
+            goto good;
+        break;
+    case 3:  /* 3 bytes available: only a first byte below 0xF0 is passed on */
+        if (*p < 0xF0)
+            goto good;
+        break;
+    default:  /* 4 or more bytes available: no restriction on the first byte */
+    good:
+        return utf8_decode(p, pp);  /* utf8_decode() stores the next read position in *pp */
+    }
+    *pp = p + 1;  /* first byte rejected by the length check above: consume a single byte */
+    return 0xFFFD;  /* same replacement value utf8_decode() returns on error */
+}
+
 /* Scan a UTF-8 encoded buffer for content type
    `buf` is a valid pointer to a UTF-8 encoded string
    `len` is the number of bytes to scan
@@ -399,7 +414,7 @@ int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
             len++;
             if (*p++ >= 0x80) {
                 /* parse UTF-8 sequence, check for encoding error */
-                uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
+                uint32_t c = utf8_decode_len(p - 1, p_end - (p - 1), &p_next);
                 if (p_next == p)
                     kind |= UTF8_HAS_ERRORS;
                 p = p_next;
@@ -464,7 +479,7 @@ size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_
         uint32_t c = *p++;
         if (c >= 0x80) {
             /* parse utf-8 sequence */
-            c = utf8_decode(p - 1, p_end - (p - 1), &p);
+            c = utf8_decode_len(p - 1, p_end - (p - 1), &p);
             /* encoding errors are converted as 0xFFFD and use a single byte */
             if (c > 0xFFFF) {
                 if (i < dest_len)
diff --git a/cutils.h b/cutils.h
@@ -401,7 +401,8 @@ enum {
 int utf8_scan(const char *buf, size_t len, size_t *plen);
 size_t utf8_encode_len(uint32_t c);
 size_t utf8_encode(uint8_t *buf, uint32_t c);
-uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
+uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp);
+uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp);
 size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
 size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
 size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);
diff --git a/libregexp.c b/libregexp.c
@@ -806,7 +806,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
     normal_char:
         p++;
         if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
             if (p_next == p)
                 return re_parse_error(s, "invalid UTF-8 sequence");
             p = p_next;
@@ -1125,12 +1125,12 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
         } else if (c == '>') {
             break;
         } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
             if (p_next == p)
                 return -1;
             p = p_next;
             if (is_hi_surrogate(c)) {
-                d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                d = utf8_decode(p, &p_next);
                 if (is_lo_surrogate(d)) {
                     c = from_surrogate(c, d);
                     p = p_next;
diff --git a/quickjs-libc.c b/quickjs-libc.c
@@ -276,7 +276,7 @@ static JSValue js_printf_internal(JSContext *ctx,
                     string_arg = JS_ToCString(ctx, argv[i++]);
                     if (!string_arg)
                         goto fail;
-                    int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
+                    int32_arg = utf8_decode((const uint8_t *)string_arg, &p);
                     JS_FreeCString(ctx, string_arg);
                 } else {
                     if (JS_ToInt32(ctx, &int32_arg, argv[i++]))
diff --git a/quickjs.c b/quickjs.c
@@ -10049,7 +10049,7 @@ static int skip_spaces(const char *pc)
             if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
                 break;
         } else {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
             /* no need to test for invalid UTF-8, 0xFFFD is not a space */
             if (!lre_is_space(c))
                 break;
@@ -18724,7 +18724,7 @@ static __exception int js_parse_template_part(JSParseState *s,
             s->eol = &p[-1];
             s->mark = p;
         } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
             if (p_next == p) {
                 js_parse_error(s, "invalid UTF-8 sequence");
                 goto fail;
@@ -18830,7 +18830,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                     }
                     goto fail;
                 } else if (c >= 0x80) {
-                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                    c = utf8_decode(p, &p_next);
                     if (p_next == p + 1) {
                         goto invalid_utf8;
                     }
@@ -18856,7 +18856,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
                 break;
             }
         } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
             if (p_next == p)
                 goto invalid_utf8;
             p = p_next;
@@ -18928,7 +18928,7 @@ static __exception int js_parse_regexp(JSParseState *s)
             else if (c == '\0' && p >= s->buf_end)
                 goto eof_error;
             else if (c >= 0x80) {
-                c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+                c = utf8_decode(p - 1, &p_next);
                 if (p_next == p) {
                     goto invalid_utf8;
                 }
@@ -18937,7 +18937,7 @@ static __exception int js_parse_regexp(JSParseState *s)
                     goto eol_error;
             }
         } else if (c >= 0x80) {
-            c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p - 1, &p_next);
             if (p_next == p) {
             invalid_utf8:
                 js_parse_error(s, "invalid UTF-8 sequence");
@@ -18957,7 +18957,7 @@ static __exception int js_parse_regexp(JSParseState *s)
 
     /* flags */
     for(;;) {
-        c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+        c = utf8_decode(p, &p_next);
         /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
         if (!lre_js_is_ident_next(c))
             break;
@@ -19031,7 +19031,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
             c = lre_parse_escape(&p_next, TRUE);
             *pident_has_escape = TRUE;
         } else if (c >= 0x80) {
-            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p, &p_next);
             /* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
         }
         if (!lre_js_is_ident_next(c))
@@ -19135,7 +19135,7 @@ static __exception int next_token(JSParseState *s)
                     s->got_lf = TRUE; /* considered as LF for ASI */
                     p++;
                 } else if (*p >= 0x80) {
-                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, &p);
                     /* ignore invalid UTF-8 in comments */
                     if (c == CP_LS || c == CP_PS) {
                         s->got_lf = TRUE; /* considered as LF for ASI */
@@ -19156,7 +19156,7 @@ static __exception int next_token(JSParseState *s)
                 if (*p == '\r' || *p == '\n')
                     break;
                 if (*p >= 0x80) {
-                    c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                    c = utf8_decode(p, &p);
                     /* ignore invalid UTF-8 in comments */
                     /* LS or PS are considered as line terminator */
                     if (c == CP_LS || c == CP_PS) {
@@ -19256,7 +19256,7 @@ static __exception int next_token(JSParseState *s)
             if (c == '\\' && *p_next == 'u') {
                 c = lre_parse_escape(&p_next, TRUE);
             } else if (c >= 0x80) {
-                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+                c = utf8_decode(p, &p_next);
                 if (p_next == p + 1)
                     goto invalid_utf8;
             }
@@ -19328,7 +19328,7 @@ static __exception int next_token(JSParseState *s)
                 goto fail;
             /* reject `10instanceof Number` */
             if (JS_VALUE_IS_NAN(ret) ||
-                lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
+                lre_js_is_ident_next(utf8_decode(p, &p_next))) {
                 JS_FreeValue(s->ctx, ret);
                 js_parse_error(s, "invalid number literal");
                 goto fail;
@@ -19521,7 +19521,7 @@ static __exception int next_token(JSParseState *s)
         break;
     default:
         if (c >= 0x80) {  /* non-ASCII code-point */
-            c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
+            c = utf8_decode(p, &p_next);
             if (p_next == p + 1)
                 goto invalid_utf8;
             p = p_next;
@@ -19631,7 +19631,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
             }
         } else
         if (c >= 0x80) {
-            c = utf8_decode(p - 1, s->buf_end - p, &p_next);
+            c = utf8_decode(p - 1, &p_next);
             if (p_next == p) {
                 json_parse_error(s, p - 1, "Bad UTF-8 sequence");
                 goto fail;
@@ -19835,7 +19835,7 @@ static __exception int json_next_token(JSParseState *s)
         break;
     default:
         if (c >= 0x80) {
-            c = utf8_decode(p, s->buf_end - p, &p_next);
+            c = utf8_decode(p, &p_next);
             if (p_next == p + 1) {
                 js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
             } else {
@@ -19958,7 +19958,7 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
             if (*p == '\n' || *p == '\r') {
                 break;
             } else if (*p >= 0x80) {
-                c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
+                c = utf8_decode(p, &p);
                 /* purposely ignore UTF-8 encoding errors in this comment line */
                 if (c == CP_LS || c == CP_PS)
                     break;