Skip to content

Commit 921c1ee

Browse files
authored
Simpler utf8_decode (#414)
- no longer pass the array length to `utf8_decode`
- add `utf8_decode_len` for border cases
- use switch based dispatch in `utf8_decode_len` to work around a gcc 12.2 optimizer bug
1 parent 9e67b47 commit 921c1ee

File tree

5 files changed

+53
-37
lines changed

5 files changed

+53
-37
lines changed

cutils.c

+31-16
Original file line numberDiff line numberDiff line change
@@ -276,17 +276,19 @@ size_t utf8_encode(uint8_t *buf, uint32_t c)
276276

277277
/* Decode a single code point from a UTF-8 encoded array of bytes
278278
`p` is a valid pointer to an array of bytes
279-
`max_len` is the number of bytes available in the array
280279
`pp` is a valid pointer to a `const uint8_t *` to store a pointer
281280
to the byte following the current sequence.
282281
Return the code point at `p`, in the range `0..0x10FFFF`
283282
Return 0xFFFD on error. Only a single byte is consumed in this case
284283
The maximum length for a UTF-8 byte sequence is 4 bytes.
285284
This implements the algorithm specified in whatwg.org, except it accepts
286285
UTF-8 encoded surrogates as JavaScript allows them in strings.
286+
The source string is assumed to have at least UTF8_CHAR_LEN_MAX bytes
287+
or be null terminated.
288+
If `p[0]` is '\0', the return value is `0` and the byte is consumed.
287289
cf: https://encoding.spec.whatwg.org/#utf-8-decoder
288290
*/
289-
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
291+
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp)
290292
{
291293
uint32_t c;
292294
uint8_t lower, upper;
@@ -305,10 +307,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
305307
case 0xD4: case 0xD5: case 0xD6: case 0xD7:
306308
case 0xD8: case 0xD9: case 0xDA: case 0xDB:
307309
case 0xDC: case 0xDD: case 0xDE: case 0xDF:
308-
if (max_len < 2) {
309-
// need more bytes
310-
break;
311-
}
312310
if (*p >= 0x80 && *p <= 0xBF) {
313311
*pp = p + 1;
314312
return ((c - 0xC0) << 6) + (*p - 0x80);
@@ -324,10 +322,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
324322
case 0xEC: case 0xED: case 0xEE: case 0xEF:
325323
lower = 0x80;
326324
need2:
327-
if (max_len < 3) {
328-
// need more bytes
329-
break;
330-
}
331325
if (*p >= lower && *p <= 0xBF && p[1] >= 0x80 && p[1] <= 0xBF) {
332326
*pp = p + 2;
333327
return ((c - 0xE0) << 12) + ((*p - 0x80) << 6) + (p[1] - 0x80);
@@ -346,10 +340,6 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
346340
lower = 0x80;
347341
upper = 0xBF;
348342
need3:
349-
if (max_len < 4) {
350-
// need more bytes
351-
break;
352-
}
353343
if (*p >= lower && *p <= upper && p[1] >= 0x80 && p[1] <= 0xBF
354344
&& p[2] >= 0x80 && p[2] <= 0xBF) {
355345
*pp = p + 3;
@@ -366,6 +356,31 @@ uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp)
366356
return 0xFFFD;
367357
}
368358

359+
/* Length-bounded front end to utf8_decode, for input that may end in
   the middle of a sequence.
   `p` is a valid pointer to an array of bytes
   `max_len` is the number of bytes available in the array
   `pp` is a valid pointer to a `const uint8_t *` that receives a
   pointer to the byte following the consumed bytes.
   The lead byte is compared against `max_len` before delegating, so
   utf8_decode is only called when the lead byte calls for no more
   bytes than are available.
   Return 0xFFFD with `*pp == p` if `max_len` is 0 (nothing is consumed).
   Return 0xFFFD with `*pp == p + 1` if the lead byte is rejected for
   the given `max_len`.
   Otherwise return the result of utf8_decode.
   The switch based dispatch is deliberate: it works around a gcc 12.2
   optimizer bug, so keep the switch form. */
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp) {
360+
switch (max_len) {
361+
case 0: /* empty input: report an error without consuming anything */
362+
*pp = p;
363+
return 0xFFFD;
364+
case 1:
365+
if (*p < 0x80) /* one byte available: only a single-byte sequence fits */
366+
goto good;
367+
break;
368+
case 2:
369+
if (*p < 0xE0) /* two bytes available: lead bytes from 0xE0 up need more */
370+
goto good;
371+
break;
372+
case 3:
373+
if (*p < 0xF0) /* three bytes available: lead bytes from 0xF0 up need more */
374+
goto good;
375+
break;
376+
default: /* four or more bytes available: no length restriction applies */
377+
good:
378+
return utf8_decode(p, pp);
379+
}
380+
*pp = p + 1; /* lead byte rejected for this length: consume only that byte */
381+
return 0xFFFD;
382+
}
383+
369384
/* Scan a UTF-8 encoded buffer for content type
370385
`buf` is a valid pointer to a UTF-8 encoded string
371386
`len` is the number of bytes to scan
@@ -399,7 +414,7 @@ int utf8_scan(const char *buf, size_t buf_len, size_t *plen)
399414
len++;
400415
if (*p++ >= 0x80) {
401416
/* parse UTF-8 sequence, check for encoding error */
402-
uint32_t c = utf8_decode(p - 1, p_end - (p - 1), &p_next);
417+
uint32_t c = utf8_decode_len(p - 1, p_end - (p - 1), &p_next);
403418
if (p_next == p)
404419
kind |= UTF8_HAS_ERRORS;
405420
p = p_next;
@@ -464,7 +479,7 @@ size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_
464479
uint32_t c = *p++;
465480
if (c >= 0x80) {
466481
/* parse utf-8 sequence */
467-
c = utf8_decode(p - 1, p_end - (p - 1), &p);
482+
c = utf8_decode_len(p - 1, p_end - (p - 1), &p);
468483
/* encoding errors are converted as 0xFFFD and use a single byte */
469484
if (c > 0xFFFF) {
470485
if (i < dest_len)

cutils.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,8 @@ enum {
401401
int utf8_scan(const char *buf, size_t len, size_t *plen);
402402
size_t utf8_encode_len(uint32_t c);
403403
size_t utf8_encode(uint8_t *buf, uint32_t c);
404-
uint32_t utf8_decode(const uint8_t *p, size_t max_len, const uint8_t **pp);
404+
uint32_t utf8_decode_len(const uint8_t *p, size_t max_len, const uint8_t **pp);
405+
uint32_t utf8_decode(const uint8_t *p, const uint8_t **pp);
405406
size_t utf8_decode_buf8(uint8_t *dest, size_t dest_len, const char *src, size_t src_len);
406407
size_t utf8_decode_buf16(uint16_t *dest, size_t dest_len, const char *src, size_t src_len);
407408
size_t utf8_encode_buf8(char *dest, size_t dest_len, const uint8_t *src, size_t src_len);

libregexp.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
806806
normal_char:
807807
p++;
808808
if (c >= 0x80) {
809-
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
809+
c = utf8_decode(p - 1, &p_next);
810810
if (p_next == p)
811811
return re_parse_error(s, "invalid UTF-8 sequence");
812812
p = p_next;
@@ -1125,12 +1125,12 @@ static int re_parse_group_name(char *buf, int buf_size, const uint8_t **pp)
11251125
} else if (c == '>') {
11261126
break;
11271127
} else if (c >= 0x80) {
1128-
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
1128+
c = utf8_decode(p - 1, &p_next);
11291129
if (p_next == p)
11301130
return -1;
11311131
p = p_next;
11321132
if (is_hi_surrogate(c)) {
1133-
d = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
1133+
d = utf8_decode(p, &p_next);
11341134
if (is_lo_surrogate(d)) {
11351135
c = from_surrogate(c, d);
11361136
p = p_next;

quickjs-libc.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ static JSValue js_printf_internal(JSContext *ctx,
276276
string_arg = JS_ToCString(ctx, argv[i++]);
277277
if (!string_arg)
278278
goto fail;
279-
int32_arg = utf8_decode((const uint8_t *)string_arg, UTF8_CHAR_LEN_MAX, &p);
279+
int32_arg = utf8_decode((const uint8_t *)string_arg, &p);
280280
JS_FreeCString(ctx, string_arg);
281281
} else {
282282
if (JS_ToInt32(ctx, &int32_arg, argv[i++]))

quickjs.c

+16-16
Original file line numberDiff line numberDiff line change
@@ -10049,7 +10049,7 @@ static int skip_spaces(const char *pc)
1004910049
if (!((c >= 0x09 && c <= 0x0d) || (c == 0x20)))
1005010050
break;
1005110051
} else {
10052-
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
10052+
c = utf8_decode(p - 1, &p_next);
1005310053
/* no need to test for invalid UTF-8, 0xFFFD is not a space */
1005410054
if (!lre_is_space(c))
1005510055
break;
@@ -18724,7 +18724,7 @@ static __exception int js_parse_template_part(JSParseState *s,
1872418724
s->eol = &p[-1];
1872518725
s->mark = p;
1872618726
} else if (c >= 0x80) {
18727-
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
18727+
c = utf8_decode(p - 1, &p_next);
1872818728
if (p_next == p) {
1872918729
js_parse_error(s, "invalid UTF-8 sequence");
1873018730
goto fail;
@@ -18830,7 +18830,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
1883018830
}
1883118831
goto fail;
1883218832
} else if (c >= 0x80) {
18833-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
18833+
c = utf8_decode(p, &p_next);
1883418834
if (p_next == p + 1) {
1883518835
goto invalid_utf8;
1883618836
}
@@ -18856,7 +18856,7 @@ static __exception int js_parse_string(JSParseState *s, int sep,
1885618856
break;
1885718857
}
1885818858
} else if (c >= 0x80) {
18859-
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
18859+
c = utf8_decode(p - 1, &p_next);
1886018860
if (p_next == p)
1886118861
goto invalid_utf8;
1886218862
p = p_next;
@@ -18928,7 +18928,7 @@ static __exception int js_parse_regexp(JSParseState *s)
1892818928
else if (c == '\0' && p >= s->buf_end)
1892918929
goto eof_error;
1893018930
else if (c >= 0x80) {
18931-
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
18931+
c = utf8_decode(p - 1, &p_next);
1893218932
if (p_next == p) {
1893318933
goto invalid_utf8;
1893418934
}
@@ -18937,7 +18937,7 @@ static __exception int js_parse_regexp(JSParseState *s)
1893718937
goto eol_error;
1893818938
}
1893918939
} else if (c >= 0x80) {
18940-
c = utf8_decode(p - 1, UTF8_CHAR_LEN_MAX, &p_next);
18940+
c = utf8_decode(p - 1, &p_next);
1894118941
if (p_next == p) {
1894218942
invalid_utf8:
1894318943
js_parse_error(s, "invalid UTF-8 sequence");
@@ -18957,7 +18957,7 @@ static __exception int js_parse_regexp(JSParseState *s)
1895718957

1895818958
/* flags */
1895918959
for(;;) {
18960-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
18960+
c = utf8_decode(p, &p_next);
1896118961
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
1896218962
if (!lre_js_is_ident_next(c))
1896318963
break;
@@ -19031,7 +19031,7 @@ static JSAtom parse_ident(JSParseState *s, const uint8_t **pp,
1903119031
c = lre_parse_escape(&p_next, TRUE);
1903219032
*pident_has_escape = TRUE;
1903319033
} else if (c >= 0x80) {
19034-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
19034+
c = utf8_decode(p, &p_next);
1903519035
/* no need to test for invalid UTF-8, 0xFFFD is not ident_next */
1903619036
}
1903719037
if (!lre_js_is_ident_next(c))
@@ -19135,7 +19135,7 @@ static __exception int next_token(JSParseState *s)
1913519135
s->got_lf = TRUE; /* considered as LF for ASI */
1913619136
p++;
1913719137
} else if (*p >= 0x80) {
19138-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
19138+
c = utf8_decode(p, &p);
1913919139
/* ignore invalid UTF-8 in comments */
1914019140
if (c == CP_LS || c == CP_PS) {
1914119141
s->got_lf = TRUE; /* considered as LF for ASI */
@@ -19156,7 +19156,7 @@ static __exception int next_token(JSParseState *s)
1915619156
if (*p == '\r' || *p == '\n')
1915719157
break;
1915819158
if (*p >= 0x80) {
19159-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
19159+
c = utf8_decode(p, &p);
1916019160
/* ignore invalid UTF-8 in comments */
1916119161
/* LS or PS are considered as line terminator */
1916219162
if (c == CP_LS || c == CP_PS) {
@@ -19256,7 +19256,7 @@ static __exception int next_token(JSParseState *s)
1925619256
if (c == '\\' && *p_next == 'u') {
1925719257
c = lre_parse_escape(&p_next, TRUE);
1925819258
} else if (c >= 0x80) {
19259-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
19259+
c = utf8_decode(p, &p_next);
1926019260
if (p_next == p + 1)
1926119261
goto invalid_utf8;
1926219262
}
@@ -19328,7 +19328,7 @@ static __exception int next_token(JSParseState *s)
1932819328
goto fail;
1932919329
/* reject `10instanceof Number` */
1933019330
if (JS_VALUE_IS_NAN(ret) ||
19331-
lre_js_is_ident_next(utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next))) {
19331+
lre_js_is_ident_next(utf8_decode(p, &p_next))) {
1933219332
JS_FreeValue(s->ctx, ret);
1933319333
js_parse_error(s, "invalid number literal");
1933419334
goto fail;
@@ -19521,7 +19521,7 @@ static __exception int next_token(JSParseState *s)
1952119521
break;
1952219522
default:
1952319523
if (c >= 0x80) { /* non-ASCII code-point */
19524-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p_next);
19524+
c = utf8_decode(p, &p_next);
1952519525
if (p_next == p + 1)
1952619526
goto invalid_utf8;
1952719527
p = p_next;
@@ -19631,7 +19631,7 @@ static int json_parse_string(JSParseState *s, const uint8_t **pp)
1963119631
}
1963219632
} else
1963319633
if (c >= 0x80) {
19634-
c = utf8_decode(p - 1, s->buf_end - p, &p_next);
19634+
c = utf8_decode(p - 1, &p_next);
1963519635
if (p_next == p) {
1963619636
json_parse_error(s, p - 1, "Bad UTF-8 sequence");
1963719637
goto fail;
@@ -19835,7 +19835,7 @@ static __exception int json_next_token(JSParseState *s)
1983519835
break;
1983619836
default:
1983719837
if (c >= 0x80) {
19838-
c = utf8_decode(p, s->buf_end - p, &p_next);
19838+
c = utf8_decode(p, &p_next);
1983919839
if (p_next == p + 1) {
1984019840
js_parse_error(s, "Unexpected token '\\x%02x' in JSON", *p);
1984119841
} else {
@@ -19958,7 +19958,7 @@ static void skip_shebang(const uint8_t **pp, const uint8_t *buf_end)
1995819958
if (*p == '\n' || *p == '\r') {
1995919959
break;
1996019960
} else if (*p >= 0x80) {
19961-
c = utf8_decode(p, UTF8_CHAR_LEN_MAX, &p);
19961+
c = utf8_decode(p, &p);
1996219962
/* purposely ignore UTF-8 encoding errors in this comment line */
1996319963
if (c == CP_LS || c == CP_PS)
1996419964
break;

0 commit comments

Comments
 (0)