Skip to content

Commit

Permalink
MDEV-20912 Add support for utf8mb4_0900_* collations in MariaDB Server
Browse files Browse the repository at this point in the history
This is done by mapping most of the existing MySQL unicode 0900 collations
to MariaDB 1400 unicode collations. The assumption is that 1400 is a
superset of 0900 for all practical purposes.

I also added a new function 'compare_collations()' and changed most code
to use this instead of comparing character sets directly.
This enables one to seamlessly mix-and-match the corresponding 0900 and
1400 sets. Field comparison and ALTER TABLE treat the character sets
as identical.

All MySQL 8.0 0900 collations are supported except:
- utf8mb4_ja_0900_as_cs
- utf8mb4_ja_0900_as_cs_ks
- utf8mb4_ru_0900_as_cs
- utf8mb4_zh_0900_as_cs

These do not have corresponding entries in the MariaDB 1400 collations.

Other things:
- Added COMMENT column to information_schema.collations. For utf8mb4_0900
  collations it contains the corresponding alias collation.
  • Loading branch information
montywi committed Dec 28, 2024
1 parent 9e7762e commit 7fcaab7
Show file tree
Hide file tree
Showing 21 changed files with 6,284 additions and 102 deletions.
7 changes: 4 additions & 3 deletions include/m_ctype.h
Original file line number Diff line number Diff line change
Expand Up @@ -458,9 +458,9 @@ typedef struct my_charset_loader_st
{
char error[128];
void *(*once_alloc)(size_t);
void *(*malloc)(size_t);
void *(*realloc)(void *, size_t);
void (*free)(void *);
void *(*malloc)(size_t); /* Not used */
void *(*realloc)(void *, size_t); /* Not used */
void (*free)(void *); /* Not used */
void (*reporter)(enum loglevel, const char *format, ...);
int (*add_collation)(struct charset_info_st *cs);
} MY_CHARSET_LOADER;
Expand Down Expand Up @@ -1693,6 +1693,7 @@ my_bool my_propagate_complex(CHARSET_INFO *cs, const uchar *str, size_t len);
uint my_ci_get_id_generic(CHARSET_INFO *cs, my_collation_id_type_t type);
LEX_CSTRING my_ci_get_collation_name_generic(CHARSET_INFO *cs,
my_collation_name_mode_t mode);
my_bool compare_collations(CHARSET_INFO *cs1, CHARSET_INFO *cs2);

typedef struct
{
Expand Down
3 changes: 3 additions & 0 deletions include/my_sys.h
Original file line number Diff line number Diff line change
Expand Up @@ -1119,6 +1119,9 @@ static inline my_bool my_charset_same(CHARSET_INFO *cs1, CHARSET_INFO *cs2)
extern my_bool init_compiled_charsets(myf flags);
extern void add_compiled_collation(struct charset_info_st *cs);
extern void add_compiled_extra_collation(struct charset_info_st *cs);
extern my_bool add_alias_for_collation(LEX_CSTRING *collation_name,
LEX_CSTRING *alias,
uint alias_id);
extern size_t escape_string_for_mysql(CHARSET_INFO *charset_info,
char *to, size_t to_length,
const char *from, size_t length,
Expand Down
2 changes: 1 addition & 1 deletion libmariadb
74 changes: 37 additions & 37 deletions mysql-test/main/ctype_ldml.result
Original file line number Diff line number Diff line change
Expand Up @@ -456,43 +456,43 @@ select "foo" = "foo " collate latin1_test;
1
The following tests check that two-byte collation IDs work
select * from information_schema.collations where id>256 and is_compiled<>'Yes' order by id;
COLLATION_NAME CHARACTER_SET_NAME ID IS_DEFAULT IS_COMPILED SORTLEN
ascii2_general_nopad_ci ascii2 318 1
ascii2_bin2 ascii2 319 1
ascii2_general_ci ascii2 320 Yes 1
ascii2_bin ascii2 321 1
ascii2_general_inherited_ci ascii2 322 1
ascii2_general_inherited2_ci ascii2 323 1
ascii2_badly_inherited_ci ascii2 324 1
ascii2_nopad_bin ascii2 325 1
utf8mb4_test_ci utf8mb4 326 8
utf16_test_ci utf16 327 8
utf8mb4_test_400_ci utf8mb4 328 8
utf8mb4_test_520_nopad_ci utf8mb4 329 8
utf8mb4_uca1400_test01_as_ci utf8mb4 330 4
latin1_test latin1 331 1
latin1_test2 latin1 332 1
latin1_test2_cs latin1 333 1
latin1_swedish_nopad2_ci latin1 334 1
utf8mb3_bengali_standard_ci utf8mb3 336 8
utf8mb3_bengali_traditional_ci utf8mb3 337 8
utf8mb3_implicit_weights_ci utf8mb3 338 8
utf8mb3_phone_ci utf8mb3 352 8
utf8mb3_test_ci utf8mb3 353 8
utf8mb3_5624_1 utf8mb3 354 8
utf8mb3_5624_2 utf8mb3 355 8
utf8mb3_5624_3 utf8mb3 356 8
utf8mb3_5624_4 utf8mb3 357 8
ucs2_test_ci ucs2 358 8
ucs2_vn_ci ucs2 359 8
ucs2_5624_1 ucs2 360 8
utf8mb3_5624_5 utf8mb3 368 8
utf8mb3_5624_5_bad utf8mb3 369 8
utf8mb3_czech_test_w2 utf8mb3 370 4
utf8mb3_czech_test_nopad_w2 utf8mb3 371 4
utf8mb3_czech_test_bad_w2 utf8mb3 372 4
utf32_test_ci utf32 391 8
utf8mb3_maxuserid_ci utf8mb3 2047 8
COLLATION_NAME CHARACTER_SET_NAME ID IS_DEFAULT IS_COMPILED SORTLEN COMMENT
ascii2_general_nopad_ci ascii2 318 1
ascii2_bin2 ascii2 319 1
ascii2_general_ci ascii2 320 Yes 1
ascii2_bin ascii2 321 1
ascii2_general_inherited_ci ascii2 322 1
ascii2_general_inherited2_ci ascii2 323 1
ascii2_badly_inherited_ci ascii2 324 1
ascii2_nopad_bin ascii2 325 1
utf8mb4_test_ci utf8mb4 326 8
utf16_test_ci utf16 327 8
utf8mb4_test_400_ci utf8mb4 328 8
utf8mb4_test_520_nopad_ci utf8mb4 329 8
utf8mb4_uca1400_test01_as_ci utf8mb4 330 4
latin1_test latin1 331 1 cp1252 West European
latin1_test2 latin1 332 1 cp1252 West European
latin1_test2_cs latin1 333 1 cp1252 West European
latin1_swedish_nopad2_ci latin1 334 1 cp1252 West European
utf8mb3_bengali_standard_ci utf8mb3 336 8
utf8mb3_bengali_traditional_ci utf8mb3 337 8
utf8mb3_implicit_weights_ci utf8mb3 338 8
utf8mb3_phone_ci utf8mb3 352 8
utf8mb3_test_ci utf8mb3 353 8
utf8mb3_5624_1 utf8mb3 354 8
utf8mb3_5624_2 utf8mb3 355 8
utf8mb3_5624_3 utf8mb3 356 8
utf8mb3_5624_4 utf8mb3 357 8
ucs2_test_ci ucs2 358 8
ucs2_vn_ci ucs2 359 8
ucs2_5624_1 ucs2 360 8
utf8mb3_5624_5 utf8mb3 368 8
utf8mb3_5624_5_bad utf8mb3 369 8
utf8mb3_czech_test_w2 utf8mb3 370 4
utf8mb3_czech_test_nopad_w2 utf8mb3 371 4
utf8mb3_czech_test_bad_w2 utf8mb3 372 4
utf32_test_ci utf32 391 8
utf8mb3_maxuserid_ci utf8mb3 2047 8
show collation like '%test%';
Collation Charset Id Default Compiled Sortlen
latin1_test latin1 331 1
Expand Down
Loading

0 comments on commit 7fcaab7

Please sign in to comment.