Skip to content

Commit 9bb97ee

Browse files
committed
Fix mb_detect_encoding's recognition of Slavic names
Thanks to Côme Chilliet for reporting that mb_detect_encoding was not detecting the desired text encoding for strings containing š or Ž. These characters are used in names from Czech, Serbian, Croatian, Bosnian, Macedonian, and other Slavic languages.
1 parent 5017240 commit 9bb97ee

File tree

3 files changed

+11
-1
lines changed

3 files changed

+11
-1
lines changed

ext/mbstring/common_codepoints.txt

+2
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,9 @@
7 7
0x0118 0x0119 # Polish
8 8
0x0141 0x0144 # Polish
9 9
0x015A 0x015B # Polish
10+
0x0160 0x0161 # Used in Slavic names
10 11
0x0179 0x017C # Polish
12+
0x017D 0x017E # Used in Slavic names
11 13
0x0300 0x030A # Diacritical marks
12 14
0x0370 0x0377 # Greek
13 15
0x037A 0x037F # Greek

ext/mbstring/rare_cp_bitvec.h

+1-1
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
11 11

12 12
static uint32_t rare_codepoint_bitvec[] = {
13 13
0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000,
14-
0xfcffff0f, 0xffffffff, 0xf3ffffe1, 0xe1ffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
14+
0xfcffff0f, 0xffffffff, 0xf3ffffe1, 0x81fffffc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
15 15
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
16 16
0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000,
17 17
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

ext/mbstring/tests/mb_detect_encoding.phpt

+8
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,12 @@ END:VCARD
58 58
';
59 59
echo mb_detect_encoding($test, ['UTF-8', 'UTF-16']), "\n";
60 60

61+
$test = 'Dušan';
62+
echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-8
63+
64+
$test = 'Živko';
65+
echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-8
66+
61 67
// We once had a problem where all kind of strings would be detected as 'UUENCODE'
62 68
echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
63 69
echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
@@ -246,6 +252,8 @@ ISO-8859-1
246 252
UTF-8
247 253
UTF-8
248 254
UTF-8
255+
UTF-8
256+
UTF-8
249 257
SJIS
250 258
== DETECT ORDER ==
251 259
JIS: JIS

0 commit comments

Comments
 (0)