@@ -215,7 +215,7 @@ typedef enum {
215
215
* following header file: */
216
216
# include "utfebcdic.h"
217
217
218
- # else /* ! EBCDIC */
218
+ # else /* ! EBCDIC */
219
219
220
220
START_EXTERN_C
221
221
@@ -235,11 +235,11 @@ EXTCONST unsigned char PL_utf8skip[] = {
235
235
/* 0x90 */ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* bogus: continuation byte */
236
236
/* 0xA0 */ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* bogus: continuation byte */
237
237
/* 0xB0 */ 1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 ,1 , /* bogus: continuation byte */
238
- /* 0xC0 */ 2 ,2 , /* overlong */
238
+ /* 0xC0 */ 2 ,2 , /* overlong */
239
239
/* 0xC2 */ 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 , /* U+0080 to U+03FF */
240
240
/* 0xD0 */ 2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 ,2 , /* U+0400 to U+07FF */
241
241
/* 0xE0 */ 3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 ,3 , /* U+0800 to U+FFFF */
242
- /* 0xF0 */ 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 ,5 ,5 ,5 ,5 ,6 ,6 , /* above BMP to 2**31 - 1 */
242
+ /* 0xF0 */ 4 ,4 ,4 ,4 ,4 ,4 ,4 ,4 ,5 ,5 ,5 ,5 ,6 ,6 , /* above BMP to 2**31 - 1 */
243
243
/* Perl extended (never was official UTF-8). Up to 36 bits */
244
244
/* 0xFE */ 7 ,
245
245
/* More extended, Up to 72 bits (64-bit + reserved) */
@@ -314,21 +314,21 @@ adding no time nor space requirements to the implementation.
314
314
The following table is from Unicode 3.2, plus the Perl extensions for above
315
315
U+10FFFF
316
316
317
- Code Points 1st Byte 2nd Byte 3rd 4th 5th 6th 7th 8th-13th
317
+ Code Points 1st Byte 2nd Byte 3rd 4th 5th 6th 7th 8th-13th
318
318
319
- U+0000..U+007F 00..7F
319
+ U+0000..U+007F 00..7F
320
320
U+0080..U+07FF * C2..DF 80..BF
321
- U+0800..U+0FFF E0 * A0..BF 80..BF
321
+ U+0800..U+0FFF E0 * A0..BF 80..BF
322
322
U+1000..U+CFFF E1..EC 80..BF 80..BF
323
323
U+D000..U+D7FF ED 80..9F 80..BF
324
324
U+D800..U+DFFF ED A0..BF 80..BF (surrogates)
325
325
U+E000..U+FFFF EE..EF 80..BF 80..BF
326
- U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF
327
- U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
328
- U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
326
+ U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF
327
+ U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
328
+ U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
329
329
Below are above-Unicode code points
330
- U+110000..U+13FFFF F4 90..BF 80..BF 80..BF
331
- U+110000..U+1FFFFF F5..F7 80..BF 80..BF 80..BF
330
+ U+110000..U+13FFFF F4 90..BF 80..BF 80..BF
331
+ U+140000..U+1FFFFF F5..F7 80..BF 80..BF 80..BF
332
332
U+200000..U+FFFFFF F8 * 88..BF 80..BF 80..BF 80..BF
333
333
U+1000000..U+3FFFFFF F9..FB 80..BF 80..BF 80..BF 80..BF
334
334
U+4000000..U+3FFFFFFF FC * 84..BF 80..BF 80..BF 80..BF 80..BF
@@ -670,7 +670,7 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
670
670
/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
671
671
* UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
672
672
* be well-formed. */
673
- #define UTF8_IS_DOWNGRADEABLE_START (c ) (__ASSERT_(FITS_IN_8_BITS(c)) \
673
+ #define UTF8_IS_DOWNGRADEABLE_START (c ) (__ASSERT_(FITS_IN_8_BITS(c)) \
674
674
inRANGE_helper_(U8, NATIVE_UTF8_TO_I8(c), \
675
675
UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1))
676
676
@@ -711,7 +711,7 @@ uppercase/lowercase/titlecase/fold into.
711
711
*
712
712
=cut
713
713
*/
714
- #define UTF8_MAXBYTES_CASE \
714
+ #define UTF8_MAXBYTES_CASE \
715
715
MAX(UTF8_MAXBYTES, UTF8_MAX_FOLD_CHAR_EXPAND * UNISKIP_BY_MSB_(20))
716
716
717
717
/* Rest of these are attributes of Unicode and perl's internals rather than the
@@ -859,11 +859,11 @@ that it returns TRUE in each for the exact same set of bit patterns. It is
859
859
valid on a subset of what UVCHR_IS_INVARIANT is valid on, so can just use that;
860
860
and the compiler should optimize out anything extraneous given the
861
861
implementation of the latter. */
862
- #define UTF8_IS_INVARIANT (c ) UVCHR_IS_INVARIANT(ASSERT_NOT_PTR(c))
862
+ #define UTF8_IS_INVARIANT (c ) UVCHR_IS_INVARIANT(ASSERT_NOT_PTR(c))
863
863
864
864
/* Like the above, but its name implies a non-UTF8 input, which as the comments
865
865
* above show, doesn't matter as to its implementation */
866
- #define NATIVE_BYTE_IS_INVARIANT (c ) UVCHR_IS_INVARIANT(c)
866
+ #define NATIVE_BYTE_IS_INVARIANT (c ) UVCHR_IS_INVARIANT(c)
867
867
868
868
/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
869
869
* in UTF-8? This is the inverse of UTF8_IS_INVARIANT. */
@@ -967,8 +967,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
967
967
&& is_in_locale_category_ (FALSE, -1 ))) \
968
968
&& (! IN_BYTES ))
969
969
970
- #define UNICODE_SURROGATE_FIRST 0xD800
971
- #define UNICODE_SURROGATE_LAST 0xDFFF
970
+ #define UNICODE_SURROGATE_FIRST 0xD800
971
+ #define UNICODE_SURROGATE_LAST 0xDFFF
972
972
973
973
/*
974
974
=for apidoc Am|bool|UNICODE_IS_SURROGATE|const UV uv
@@ -1012,7 +1012,7 @@ representation.
1012
1012
1013
1013
=cut
1014
1014
*/
1015
- #define UNICODE_REPLACEMENT 0xFFFD
1015
+ #define UNICODE_REPLACEMENT 0xFFFD
1016
1016
#define UNICODE_IS_REPLACEMENT (uv ) UNLIKELY((UV) (uv) == UNICODE_REPLACEMENT)
1017
1017
#define UTF8_IS_REPLACEMENT (s , send ) \
1018
1018
UNLIKELY( \
@@ -1021,7 +1021,7 @@ representation.
1021
1021
sizeof(REPLACEMENT_CHARACTER_UTF8) - 1))
1022
1022
1023
1023
/* Max legal code point according to Unicode */
1024
- #define PERL_UNICODE_MAX 0x10FFFF
1024
+ #define PERL_UNICODE_MAX 0x10FFFF
1025
1025
1026
1026
/*
1027
1027
@@ -1057,10 +1057,10 @@ this macro matches
1057
1057
1058
1058
=cut
1059
1059
1060
- * ASCII EBCDIC I8
1061
- * U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF max legal Unicode
1062
- * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0
1063
- * U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1
1060
+ * ASCII EBCDIC I8
1061
+ * U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF max legal Unicode
1062
+ * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0
1063
+ * U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1
1064
1064
*/
1065
1065
#define UTF_START_BYTE_110000_ UTF_START_BYTE(PERL_UNICODE_MAX + 1, 21)
1066
1066
#define UTF_FIRST_CONT_BYTE_110000_ \
@@ -1255,10 +1255,10 @@ point's representation.
1255
1255
#define UTF8_ALLOW_ANYUV 0
1256
1256
#define UTF8_ALLOW_DEFAULT UTF8_ALLOW_ANYUV
1257
1257
1258
- #define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */
1259
- #define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */
1260
- #define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */
1261
- #define UNICODE_WARN_PERL_EXTENDED 0x0008 /* Above 0x7FFF_FFFF */
1258
+ #define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */
1259
+ #define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */
1260
+ #define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */
1261
+ #define UNICODE_WARN_PERL_EXTENDED 0x0008 /* Above 0x7FFF_FFFF */
1262
1262
#define UNICODE_WARN_ABOVE_31_BIT UNICODE_WARN_PERL_EXTENDED
1263
1263
#define UNICODE_DISALLOW_SURROGATE 0x0010
1264
1264
#define UNICODE_DISALLOW_NONCHAR 0x0020
@@ -1286,11 +1286,11 @@ point's representation.
1286
1286
1287
1287
/* For backward source compatibility, as these are now the default */
1288
1288
#define UNICODE_ALLOW_SURROGATE 0
1289
- #define UNICODE_ALLOW_SUPER 0
1290
- #define UNICODE_ALLOW_ANY 0
1289
+ #define UNICODE_ALLOW_SUPER 0
1290
+ #define UNICODE_ALLOW_ANY 0
1291
1291
1292
- #define UNICODE_BYTE_ORDER_MARK 0xFEFF
1293
- #define UNICODE_IS_BYTE_ORDER_MARK (uv ) UNLIKELY((UV) (uv) \
1292
+ #define UNICODE_BYTE_ORDER_MARK 0xFEFF
1293
+ #define UNICODE_IS_BYTE_ORDER_MARK (uv ) UNLIKELY((UV) (uv) \
1294
1294
== UNICODE_BYTE_ORDER_MARK)
1295
1295
1296
1296
#define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE
@@ -1301,15 +1301,15 @@ point's representation.
1301
1301
LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE_NATIVE
1302
1302
#define LATIN_SMALL_LETTER_A_WITH_RING_ABOVE \
1303
1303
LATIN_SMALL_LETTER_A_WITH_RING_ABOVE_NATIVE
1304
- #define UNICODE_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
1305
- #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
1306
- #define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3
1304
+ #define UNICODE_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
1305
+ #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
1306
+ #define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3
1307
1307
#define GREEK_SMALL_LETTER_MU 0x03BC
1308
- #define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case
1308
+ #define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case
1309
1309
of MICRO SIGN */
1310
- #define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
1310
+ #define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
1311
1311
#ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
1312
- # define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
1312
+ # define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
1313
1313
#endif
1314
1314
#define LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x130
1315
1315
#define LATIN_SMALL_LETTER_DOTLESS_I 0x131
@@ -1319,16 +1319,16 @@ point's representation.
1319
1319
#define KELVIN_SIGN 0x212A
1320
1320
#define ANGSTROM_SIGN 0x212B
1321
1321
1322
- #define UNI_DISPLAY_ISPRINT 0x0001
1323
- #define UNI_DISPLAY_BACKSLASH 0x0002
1324
- #define UNI_DISPLAY_BACKSPACE 0x0004 /* Allow \b when also
1322
+ #define UNI_DISPLAY_ISPRINT 0x0001
1323
+ #define UNI_DISPLAY_BACKSLASH 0x0002
1324
+ #define UNI_DISPLAY_BACKSPACE 0x0004 /* Allow \b when also
1325
1325
UNI_DISPLAY_BACKSLASH */
1326
- #define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT \
1326
+ #define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT \
1327
1327
|UNI_DISPLAY_BACKSLASH \
1328
1328
|UNI_DISPLAY_BACKSPACE)
1329
1329
1330
1330
/* Character classes could also allow \b, but not patterns in general */
1331
- #define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
1331
+ #define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
1332
1332
1333
1333
/* Should be removed; maybe deprecated, but not used in CPAN */
1334
1334
#define SHARP_S_SKIP 2
0 commit comments