Fix mb_detect_encoding's recognition of Slavic names

alexdowad · alexdowad · commit 9bb97ee8bc61 · 2022-05-24T15:32:20.000+02:00
Thanks to Côme Chilliet for reporting that mb_detect_encoding was not
detecting the desired text encoding for strings containing š or Ž.
These letters (this fix covers Š, š, Ž and ž) are common in Czech,
Serbian, Croatian, Bosnian, Macedonian, and other Slavic names.
diff --git a/ext/mbstring/common_codepoints.txt b/ext/mbstring/common_codepoints.txt
@@ -7,7 +7,9 @@
 0x0118	0x0119 # Polish
 0x0141	0x0144 # Polish
 0x015A	0x015B # Polish
+0x0160	0x0161 # Used in Slavic names
 0x0179	0x017C # Polish
+0x017D	0x017E # Used in Slavic names
 0x0300	0x030A # Diacritical marks
 0x0370	0x0377 # Greek
 0x037A	0x037F # Greek
diff --git a/ext/mbstring/rare_cp_bitvec.h b/ext/mbstring/rare_cp_bitvec.h
@@ -11,7 +11,7 @@
 
 static uint32_t rare_codepoint_bitvec[] = {
 0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000,
-0xfcffff0f, 0xffffffff, 0xf3ffffe1, 0xe1ffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xfcffff0f, 0xffffffff, 0xf3ffffe1, 0x81fffffc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
 0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000,
 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt
@@ -58,6 +58,12 @@ END:VCARD
 ';
 echo mb_detect_encoding($test, ['UTF-8', 'UTF-16']), "\n";
 
+$test = 'Dušan';
+echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-8
+
+$test = 'Živko';
+echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-8
+
 // We once had a problem where all kind of strings would be detected as 'UUENCODE'
 echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
 echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
@@ -246,6 +252,8 @@ ISO-8859-1
 UTF-8
 UTF-8
 UTF-8
+UTF-8
+UTF-8
 SJIS
 == DETECT ORDER ==
 JIS: JIS