From eb5f2169ca6dc5180eabc4c2a687db7145ea2b4f Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Mon, 14 Oct 2024 09:19:15 +0900 Subject: [PATCH 01/10] Add grapheme_levenshtein function. Measure levenshtein for grapheme cluster unit --- ext/intl/grapheme/grapheme_string.c | 181 +++++++++++++++++++++++ ext/intl/php_intl.stub.php | 2 + ext/intl/tests/grapheme_levenshtein.phpt | 104 +++++++++++++ 3 files changed, 287 insertions(+) create mode 100644 ext/intl/tests/grapheme_levenshtein.phpt diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 77bf4319928a8..a383489f8c453 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -918,4 +918,185 @@ PHP_FUNCTION(grapheme_str_split) ubrk_close(bi); } +PHP_FUNCTION(grapheme_levenshtein) +{ + zend_string *string1, *string2; + zend_long cost_ins = 1; + zend_long cost_rep = 1; + zend_long cost_del = 1; + + ZEND_PARSE_PARAMETERS_START(2, 5) + Z_PARAM_STR(string1) + Z_PARAM_STR(string2) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(cost_ins) + Z_PARAM_LONG(cost_rep) + Z_PARAM_LONG(cost_del) + ZEND_PARSE_PARAMETERS_END(); + + if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) { + zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) { + zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + if (cost_del <= 0 || cost_del > UINT_MAX / 4) { + zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4); + RETURN_THROWS(); + } + + zend_long *p1, *p2, *tmp; + zend_long c0, c1, c2; + zend_long retval; + size_t i2; + char *pstr1, *pstr2; + + UChar *ustring1 = NULL; + UChar *ustring2 = NULL; + + int32_t ustring1_len = 0; + int32_t ustring2_len = 0; + + UErrorCode ustatus1 = U_ZERO_ERROR; + UErrorCode ustatus2 = U_ZERO_ERROR; + + /* When all costs are equal, 
levenshtein fulfills the requirements of a metric, which means + * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time) + * by having shorter rows (p1 & p2). */ + if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) { + zend_string *tmp = string1; + string1 = string2; + string2 = tmp; + } + + pstr1 = ZSTR_VAL(string1); + pstr2 = ZSTR_VAL(string2); + + intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1); + + if ( U_FAILURE( ustatus1 ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, ustatus1 ); + + /* Set error messages. */ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); + if (ustring1) { + efree( ustring1 ); + } + RETURN_FALSE; + } + + intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2); + + if ( U_FAILURE( ustatus2 ) ) { + /* Set global error code. */ + intl_error_set_code( NULL, ustatus2 ); + + /* Set error messages. 
*/ + intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); + if (ustring2) { + efree( ustring2 ); + } + if (ustring1) { + efree( ustring1 ); + } + RETURN_FALSE; + } + + UText *ut1 = NULL; + UText *ut2 = NULL; + UBreakIterator *bi1, *bi2; + + int32_t strlen_1, strlen_2; + strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0 ); + strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0 ); + + if (strlen_1 == 0) { + efree(ustring1); + efree(ustring2); + RETURN_LONG(strlen_2 * cost_ins); + } + if (strlen_2 == 0) { + efree(ustring1); + efree(ustring2); + RETURN_LONG(strlen_1 * cost_del); + } + + unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE]; + unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE]; + bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1 ); + bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2 ); + + ut1 = utext_openUTF8(ut1, pstr1, ZSTR_LEN(string1), &ustatus1); + ubrk_setUText(bi1, ut1, &ustatus1); + ut2 = utext_openUTF8(ut2, pstr2, ZSTR_LEN(string2), &ustatus2); + ubrk_setUText(bi2, ut2, &ustatus2); + + p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); + p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); + + for (i2 = 0; i2 <= strlen_2; i2++) { + p1[i2] = i2 * cost_ins; + } + + int32_t current1 = 0; + int32_t current2 = 0; + int32_t pos1 = 0; + int32_t pos2 = 0; + int32_t usrch_pos = 0; + for ( ; pos1 != UBRK_DONE; ) { + current1 = ubrk_current(bi1); + pos1 = ubrk_next(bi1); + if (pos1 == UBRK_DONE) { + break; + } + p2[0] = p1[0] + cost_del; + for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) { + current2 = ubrk_current(bi2); + pos2 = ubrk_next(bi2); + if (pos2 == UBRK_DONE) { + break; + } + usrch_pos = grapheme_strpos_utf16(pstr1 + current1, pos1 - current1, pstr2 + current2, pos2 - current2, 0, NULL, 0, 0); + if (usrch_pos == 0) { + c0 = p1[i2]; + } else { + c0 = p1[i2] + cost_rep; + } + c1 = p1[i2 + 1] + cost_del; + 
if (c1 < c0) { + c0 = c1; + } + c2 = p2[i2] + cost_ins; + if (c2 < c0) { + c0 = c2; + } + p2[i2 + 1] = c0; + } + ubrk_first(bi2); + tmp = p1; + p1 = p2; + p2 = tmp; + } + + utext_close(ut1); + utext_close(ut2); + + ubrk_close(bi1); + ubrk_close(bi2); + + efree(ustring1); + efree(ustring2); + + retval = p1[strlen_2]; + + efree(p1); + efree(p2); + RETURN_LONG(retval); +} + /* }}} */ diff --git a/ext/intl/php_intl.stub.php b/ext/intl/php_intl.stub.php index 4469845483e8e..7d45dcb3601f3 100644 --- a/ext/intl/php_intl.stub.php +++ b/ext/intl/php_intl.stub.php @@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = function grapheme_str_split(string $string, int $length = 1): array|false {} +function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {} + /** @param int $next */ function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {} diff --git a/ext/intl/tests/grapheme_levenshtein.phpt b/ext/intl/tests/grapheme_levenshtein.phpt new file mode 100644 index 0000000000000..eec36ea2f9801 --- /dev/null +++ b/ext/intl/tests/grapheme_levenshtein.phpt @@ -0,0 +1,104 @@ +--TEST-- +grapheme_levenshtein() function test +--EXTENSIONS-- +intl +--FILE-- + +--EXPECT-- +--- Equal --- +int(0) +--- First string empty --- +int(3) +--- Second string empty --- +int(3) +--- Both empty --- +int(0) +int(0) +--- 1 character --- +int(1) +--- 2 character swapped --- +int(2) +--- Inexpensive deletion --- +int(2) +--- Expensive deletion --- +int(10) +--- Inexpensive insertion --- +int(2) +--- Expensive insertion --- +int(10) +--- Expensive replacement --- +int(3) +--- Very expensive replacement --- +int(4) +--- 128 codepoints --- +int(2) +--- 128 codepoints over --- +int(2) +int(256) +--- 128 codepoints over only $string1 --- +int(128) +--- 128 codepoints over only $string2 --- 
+int(130) +--- 128 codepoints over Hiragana --- +int(2) +--- Variable selector --- +int(1) +int(0) +int(0) From f22d76ec237af47697e6c58888a0fd8c0629e8fc Mon Sep 17 00:00:00 2001 From: Derick Rethans Date: Mon, 7 Apr 2025 23:59:24 +0100 Subject: [PATCH 02/10] Update for coding standards. --- ext/intl/grapheme/grapheme_string.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index a383489f8c453..c5ac6da7b347a 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -978,31 +978,31 @@ PHP_FUNCTION(grapheme_levenshtein) intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1); - if ( U_FAILURE( ustatus1 ) ) { + if (U_FAILURE(ustatus1)) { /* Set global error code. */ intl_error_set_code( NULL, ustatus1 ); /* Set error messages. */ intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); if (ustring1) { - efree( ustring1 ); + efree(ustring1); } RETURN_FALSE; } intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2); - if ( U_FAILURE( ustatus2 ) ) { + if (U_FAILURE(ustatus2)) { /* Set global error code. */ - intl_error_set_code( NULL, ustatus2 ); + intl_error_set_code(NULL, ustatus2); /* Set error messages. 
*/ - intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); + intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); if (ustring2) { - efree( ustring2 ); + efree(ustring2); } if (ustring1) { - efree( ustring1 ); + efree(ustring1); } RETURN_FALSE; } @@ -1012,8 +1012,8 @@ PHP_FUNCTION(grapheme_levenshtein) UBreakIterator *bi1, *bi2; int32_t strlen_1, strlen_2; - strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0 ); - strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0 ); + strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0); + strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0); if (strlen_1 == 0) { efree(ustring1); @@ -1028,8 +1028,8 @@ PHP_FUNCTION(grapheme_levenshtein) unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE]; unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE]; - bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1 ); - bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2 ); + bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1); + bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2); ut1 = utext_openUTF8(ut1, pstr1, ZSTR_LEN(string1), &ustatus1); ubrk_setUText(bi1, ut1, &ustatus1); @@ -1048,7 +1048,7 @@ PHP_FUNCTION(grapheme_levenshtein) int32_t pos1 = 0; int32_t pos2 = 0; int32_t usrch_pos = 0; - for ( ; pos1 != UBRK_DONE; ) { + for (; pos1 != UBRK_DONE;) { current1 = ubrk_current(bi1); pos1 = ubrk_next(bi1); if (pos1 == UBRK_DONE) { From c019d07807e56eb79bae9edabad780c7cf2a805d Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Thu, 17 Apr 2025 09:30:54 +0900 Subject: [PATCH 03/10] Fix some nits --- ext/intl/grapheme/grapheme_string.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index c5ac6da7b347a..9b056348688cf 100644 
--- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -965,7 +965,7 @@ PHP_FUNCTION(grapheme_levenshtein) UErrorCode ustatus2 = U_ZERO_ERROR; /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means - * that the distance is symmetric. If string1 is shorter than string 2 we can save memory (and CPU time) + * that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time) * by having shorter rows (p1 & p2). */ if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) { zend_string *tmp = string1; @@ -980,10 +980,10 @@ PHP_FUNCTION(grapheme_levenshtein) if (U_FAILURE(ustatus1)) { /* Set global error code. */ - intl_error_set_code( NULL, ustatus1 ); + intl_error_set_code(NULL, ustatus1); /* Set error messages. */ - intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 ); + intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); if (ustring1) { efree(ustring1); } @@ -1048,7 +1048,7 @@ PHP_FUNCTION(grapheme_levenshtein) int32_t pos1 = 0; int32_t pos2 = 0; int32_t usrch_pos = 0; - for (; pos1 != UBRK_DONE;) { + while (pos1 != UBRK_DONE) { current1 = ubrk_current(bi1); pos1 = ubrk_next(bi1); if (pos1 == UBRK_DONE) { From 2881b1cb4c8334447114c8eb9a00f39125da596b Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Thu, 17 Apr 2025 10:20:24 +0900 Subject: [PATCH 04/10] Add test for corner case --- ext/intl/tests/grapheme_levenshtein.phpt | 26 +++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/ext/intl/tests/grapheme_levenshtein.phpt b/ext/intl/tests/grapheme_levenshtein.phpt index eec36ea2f9801..322717e5a54ce 100644 --- a/ext/intl/tests/grapheme_levenshtein.phpt +++ b/ext/intl/tests/grapheme_levenshtein.phpt @@ -60,8 +60,28 @@ var_dump(grapheme_levenshtein($nabe, $nabe_E0100)); // combining character var_dump(grapheme_levenshtein("\u{0065}\u{0301}", "\u{00e9}")); 
+ +// Corner case +echo '--- Corner case ---' . PHP_EOL; +try { + grapheme_levenshtein($nabe, $nabe_E0100, 0, 1, 1); +} catch (ValueError $e) { + var_dump($e->getMessage()); +} + +try { + grapheme_levenshtein($nabe, $nabe_E0100, 1, 0, 1); +} catch (ValueError $e) { + var_dump($e->getMessage()); +} + +try { + grapheme_levenshtein($nabe, $nabe_E0100, 1, 1, 0); +} catch (ValueError $e) { + var_dump($e->getMessage()); +} ?> ---EXPECT-- +--EXPECTF-- --- Equal --- int(0) --- First string empty --- @@ -102,3 +122,7 @@ int(2) int(1) int(0) int(0) +--- Corner case --- +string(%d) "grapheme_levenshtein(): Argument #3 ($insertion_cost) must be greater than 0 and less than or equal to %d" +string(%d) "grapheme_levenshtein(): Argument #4 ($replacement_cost) must be greater than 0 and less than or equal to %d" +string(%d) "grapheme_levenshtein(): Argument #5 ($deletion_cost) must be greater than 0 and less than or equal to %d" From 8dc01caba2faddf61bab06f2064d2d51d4c31249 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Thu, 17 Apr 2025 15:45:33 +0900 Subject: [PATCH 05/10] Fix test for output of error message --- ext/intl/tests/grapheme_levenshtein.phpt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ext/intl/tests/grapheme_levenshtein.phpt b/ext/intl/tests/grapheme_levenshtein.phpt index 322717e5a54ce..4ff7dbb607bcd 100644 --- a/ext/intl/tests/grapheme_levenshtein.phpt +++ b/ext/intl/tests/grapheme_levenshtein.phpt @@ -66,19 +66,19 @@ echo '--- Corner case ---' . PHP_EOL; try { grapheme_levenshtein($nabe, $nabe_E0100, 0, 1, 1); } catch (ValueError $e) { - var_dump($e->getMessage()); + echo $e->getMessage() . PHP_EOL; } try { grapheme_levenshtein($nabe, $nabe_E0100, 1, 0, 1); } catch (ValueError $e) { - var_dump($e->getMessage()); + echo $e->getMessage() . PHP_EOL; } try { grapheme_levenshtein($nabe, $nabe_E0100, 1, 1, 0); } catch (ValueError $e) { - var_dump($e->getMessage()); + echo $e->getMessage() . 
PHP_EOL; } ?> --EXPECTF-- @@ -123,6 +123,6 @@ int(1) int(0) int(0) --- Corner case --- -string(%d) "grapheme_levenshtein(): Argument #3 ($insertion_cost) must be greater than 0 and less than or equal to %d" -string(%d) "grapheme_levenshtein(): Argument #4 ($replacement_cost) must be greater than 0 and less than or equal to %d" -string(%d) "grapheme_levenshtein(): Argument #5 ($deletion_cost) must be greater than 0 and less than or equal to %d" +grapheme_levenshtein(): Argument #3 ($insertion_cost) must be greater than 0 and less than or equal to %d +grapheme_levenshtein(): Argument #4 ($replacement_cost) must be greater than 0 and less than or equal to %d +grapheme_levenshtein(): Argument #5 ($deletion_cost) must be greater than 0 and less than or equal to %d From a37e54f3d5d798ebb6267ec25ee4a2e5b205e202 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 18 Apr 2025 22:09:40 +0900 Subject: [PATCH 06/10] grapheme_levenshtein unify internal character is UTF-16 --- ext/intl/grapheme/grapheme_string.c | 52 ++++++++++++++++++++--------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 9b056348688cf..e3d019c29c758 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -26,6 +26,7 @@ #include <unicode/ucol.h> #include <unicode/ustring.h> #include <unicode/ubrk.h> +#include <unicode/usearch.h> /* }}} */ @@ -979,10 +980,8 @@ PHP_FUNCTION(grapheme_levenshtein) intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1); if (U_FAILURE(ustatus1)) { - /* Set global error code. */ intl_error_set_code(NULL, ustatus1); - /* Set error messages. */ intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); if (ustring1) { efree(ustring1); @@ -993,10 +992,8 @@ PHP_FUNCTION(grapheme_levenshtein) intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2); if (U_FAILURE(ustatus2)) { - /* Set global error code. 
*/ intl_error_set_code(NULL, ustatus2); - /* Set error messages. */ intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); if (ustring2) { efree(ustring2); @@ -1007,8 +1004,6 @@ PHP_FUNCTION(grapheme_levenshtein) RETURN_FALSE; } - UText *ut1 = NULL; - UText *ut2 = NULL; UBreakIterator *bi1, *bi2; int32_t strlen_1, strlen_2; @@ -1031,10 +1026,28 @@ PHP_FUNCTION(grapheme_levenshtein) bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1); bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2); - ut1 = utext_openUTF8(ut1, pstr1, ZSTR_LEN(string1), &ustatus1); - ubrk_setUText(bi1, ut1, &ustatus1); - ut2 = utext_openUTF8(ut2, pstr2, ZSTR_LEN(string2), &ustatus2); - ubrk_setUText(bi2, ut2, &ustatus2); + ubrk_setText(bi1, ustring1, ustring1_len, &ustatus1); + + if (U_FAILURE(ustatus1)) { + intl_error_set_code(NULL, ustatus1); + + intl_error_set_custom_msg(NULL, "Error on ubrk_setText on ustring1", 0); + if (ustring1) { + efree(ustring1); + } + RETURN_FALSE; + } + + ubrk_setText(bi2, ustring2, ustring2_len, &ustatus2); + if (U_FAILURE(ustatus2)) { + intl_error_set_code(NULL, ustatus2); + + intl_error_set_custom_msg(NULL, "Error on ubrk_setText on ustring2", 0); + if (ustring2) { + efree(ustring2); + } + RETURN_FALSE; + } p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); @@ -1048,6 +1061,7 @@ PHP_FUNCTION(grapheme_levenshtein) int32_t pos1 = 0; int32_t pos2 = 0; int32_t usrch_pos = 0; + while (pos1 != UBRK_DONE) { current1 = ubrk_current(bi1); pos1 = ubrk_next(bi1); @@ -1061,8 +1075,19 @@ PHP_FUNCTION(grapheme_levenshtein) if (pos2 == UBRK_DONE) { break; } - usrch_pos = grapheme_strpos_utf16(pstr1 + current1, pos1 - current1, pstr2 + current2, pos2 - current2, 0, NULL, 0, 0); - if (usrch_pos == 0) { + UStringSearch *srch = usearch_open(ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2, "", NULL, &ustatus2); + if 
(U_FAILURE(ustatus2)) { + intl_error_set_code(NULL, ustatus2); + intl_error_set_custom_msg(NULL, "Error usearch_open", 0); + } + usrch_pos = usearch_first(srch, &ustatus2); + if (U_FAILURE(ustatus2)) { + intl_error_set_code(NULL, ustatus2); + intl_error_set_custom_msg(NULL, "Error usearch_first", 0); + } + usearch_close(srch); + + if (usrch_pos != USEARCH_DONE) { c0 = p1[i2]; } else { c0 = p1[i2] + cost_rep; @@ -1083,9 +1108,6 @@ PHP_FUNCTION(grapheme_levenshtein) p2 = tmp; } - utext_close(ut1); - utext_close(ut2); - ubrk_close(bi1); ubrk_close(bi2); From b88d99302a9bd3bafe91a4bd9e154108ae39dc47 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 18 Apr 2025 23:13:33 +0900 Subject: [PATCH 07/10] Add free if U_FAILED --- ext/intl/grapheme/grapheme_string.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index e3d019c29c758..ac91bf194b3f5 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -1032,6 +1032,9 @@ PHP_FUNCTION(grapheme_levenshtein) intl_error_set_code(NULL, ustatus1); intl_error_set_custom_msg(NULL, "Error on ubrk_setText on ustring1", 0); + if (ustring2) { + efree(ustring2); + } if (ustring1) { efree(ustring1); } @@ -1046,6 +1049,9 @@ PHP_FUNCTION(grapheme_levenshtein) if (ustring2) { efree(ustring2); } + if (ustring1) { + efree(ustring1); + } RETURN_FALSE; } @@ -1079,11 +1085,29 @@ PHP_FUNCTION(grapheme_levenshtein) if (U_FAILURE(ustatus2)) { intl_error_set_code(NULL, ustatus2); intl_error_set_custom_msg(NULL, "Error usearch_open", 0); + ubrk_close(bi1); + ubrk_close(bi2); + + efree(ustring1); + efree(ustring2); + + efree(p1); + efree(p2); + RETURN_FALSE; } usrch_pos = usearch_first(srch, &ustatus2); if (U_FAILURE(ustatus2)) { intl_error_set_code(NULL, ustatus2); intl_error_set_custom_msg(NULL, "Error usearch_first", 0); + ubrk_close(bi1); + ubrk_close(bi2); + + efree(ustring1); + efree(ustring2); + + 
efree(p1); + efree(p2); + RETURN_FALSE; } usearch_close(srch); From f709ba7ce7c3403d0618364daa4d74df02fce748 Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Fri, 18 Apr 2025 23:57:01 +0900 Subject: [PATCH 08/10] Remove null check --- ext/intl/grapheme/grapheme_string.c | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index ac91bf194b3f5..405ed3f25db46 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -983,9 +983,7 @@ PHP_FUNCTION(grapheme_levenshtein) intl_error_set_code(NULL, ustatus1); intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); - if (ustring1) { - efree(ustring1); - } + efree(ustring1); RETURN_FALSE; } @@ -995,12 +993,8 @@ PHP_FUNCTION(grapheme_levenshtein) intl_error_set_code(NULL, ustatus2); intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); - if (ustring2) { - efree(ustring2); - } - if (ustring1) { - efree(ustring1); - } + efree(ustring2); + efree(ustring1); RETURN_FALSE; } @@ -1032,12 +1026,8 @@ PHP_FUNCTION(grapheme_levenshtein) intl_error_set_code(NULL, ustatus1); intl_error_set_custom_msg(NULL, "Error on ubrk_setText on ustring1", 0); - if (ustring2) { - efree(ustring2); - } - if (ustring1) { - efree(ustring1); - } + efree(ustring2); + efree(ustring1); RETURN_FALSE; } @@ -1046,12 +1036,8 @@ PHP_FUNCTION(grapheme_levenshtein) intl_error_set_code(NULL, ustatus2); intl_error_set_custom_msg(NULL, "Error on ubrk_setText on ustring2", 0); - if (ustring2) { - efree(ustring2); - } - if (ustring1) { - efree(ustring1); - } + efree(ustring2); + efree(ustring1); RETURN_FALSE; } From e2912783fa95ee4213521c6f233d9b8a18f6f05a Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Sat, 19 Apr 2025 21:38:57 +0900 Subject: [PATCH 09/10] Fix while --- ext/intl/grapheme/grapheme_string.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index 405ed3f25db46..c4d0c3987a8fa 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -1054,7 +1054,7 @@ PHP_FUNCTION(grapheme_levenshtein) int32_t pos2 = 0; int32_t usrch_pos = 0; - while (pos1 != UBRK_DONE) { + while (true) { current1 = ubrk_current(bi1); pos1 = ubrk_next(bi1); if (pos1 == UBRK_DONE) { From 6f41f98782bfb31a9fd053f242d77fe7bfc437cc Mon Sep 17 00:00:00 2001 From: Yuya Hamada Date: Mon, 21 Apr 2025 12:03:22 +0900 Subject: [PATCH 10/10] Fix usearch to ucol --- ext/intl/grapheme/grapheme_string.c | 106 +++++++++++++++------------- ext/intl/php_intl_arginfo.h | 12 +++- 2 files changed, 66 insertions(+), 52 deletions(-) diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c index c4d0c3987a8fa..95563406e308a 100644 --- a/ext/intl/grapheme/grapheme_string.c +++ b/ext/intl/grapheme/grapheme_string.c @@ -950,7 +950,6 @@ PHP_FUNCTION(grapheme_levenshtein) RETURN_THROWS(); } - zend_long *p1, *p2, *tmp; zend_long c0, c1, c2; zend_long retval; size_t i2; @@ -962,8 +961,7 @@ PHP_FUNCTION(grapheme_levenshtein) int32_t ustring1_len = 0; int32_t ustring2_len = 0; - UErrorCode ustatus1 = U_ZERO_ERROR; - UErrorCode ustatus2 = U_ZERO_ERROR; + UErrorCode ustatus = U_ZERO_ERROR; /* When all costs are equal, levenshtein fulfills the requirements of a metric, which means * that the distance is symmetric. 
If string1 is shorter than string2 we can save memory (and CPU time) @@ -977,20 +975,20 @@ PHP_FUNCTION(grapheme_levenshtein) pstr1 = ZSTR_VAL(string1); pstr2 = ZSTR_VAL(string2); - intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus1); + intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus); - if (U_FAILURE(ustatus1)) { - intl_error_set_code(NULL, ustatus1); + if (U_FAILURE(ustatus)) { + intl_error_set_code(NULL, ustatus); intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); efree(ustring1); RETURN_FALSE; } - intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus2); + intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus); - if (U_FAILURE(ustatus2)) { - intl_error_set_code(NULL, ustatus2); + if (U_FAILURE(ustatus)) { + intl_error_set_code(NULL, ustatus); intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0); efree(ustring2); @@ -1017,30 +1015,64 @@ PHP_FUNCTION(grapheme_levenshtein) unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE]; unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE]; - bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus1); - bi2 = grapheme_get_break_iterator((void*)u_break_iterator_buffer2, &ustatus2); + bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus); + if (U_FAILURE(ustatus)) { + intl_error_set_code(NULL, ustatus); + intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #1 ($string1)", 0); + efree(ustring2); + efree(ustring1); + ubrk_close(bi1); + RETURN_FALSE; + } - ubrk_setText(bi1, ustring1, ustring1_len, &ustatus1); + bi2 = grapheme_get_break_iterator(u_break_iterator_buffer2, &ustatus); + if (U_FAILURE(ustatus)) { + intl_error_set_code(NULL, ustatus); + intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #2 
($string2)", 0); + efree(ustring2); + efree(ustring1); + ubrk_close(bi2); + ubrk_close(bi1); + RETURN_FALSE; + } + ubrk_setText(bi1, ustring1, ustring1_len, &ustatus); - if (U_FAILURE(ustatus1)) { - intl_error_set_code(NULL, ustatus1); + if (U_FAILURE(ustatus)) { + intl_error_set_code(NULL, ustatus); - intl_error_set_custom_msg(NULL, "Error on ubrk_setText on ustring1", 0); + intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #1 ($string1)", 0); efree(ustring2); efree(ustring1); + ubrk_close(bi2); + ubrk_close(bi1); RETURN_FALSE; } - ubrk_setText(bi2, ustring2, ustring2_len, &ustatus2); - if (U_FAILURE(ustatus2)) { - intl_error_set_code(NULL, ustatus2); + ubrk_setText(bi2, ustring2, ustring2_len, &ustatus); + if (U_FAILURE(ustatus)) { + intl_error_set_code(NULL, ustatus); - intl_error_set_custom_msg(NULL, "Error on ubrk_setText on ustring2", 0); + intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #2 ($string2)", 0); efree(ustring2); efree(ustring1); + ubrk_close(bi2); + ubrk_close(bi1); RETURN_FALSE; } + UCollator *collator = ucol_open("", &ustatus); + if (U_FAILURE(ustatus)) { + intl_error_set_code(NULL, ustatus); + intl_error_set_custom_msg(NULL, "Error on ucol_open", 0); + efree(ustring2); + efree(ustring1); + ubrk_close(bi2); + ubrk_close(bi1); + ucol_close(collator); + RETURN_FALSE; + } + + zend_long *p1, *p2, *tmp; p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0); @@ -1052,7 +1084,6 @@ PHP_FUNCTION(grapheme_levenshtein) int32_t current2 = 0; int32_t pos1 = 0; int32_t pos2 = 0; - int32_t usrch_pos = 0; while (true) { current1 = ubrk_current(bi1); @@ -1067,37 +1098,8 @@ PHP_FUNCTION(grapheme_levenshtein) if (pos2 == UBRK_DONE) { break; } - UStringSearch *srch = usearch_open(ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2, "", NULL, &ustatus2); - if (U_FAILURE(ustatus2)) { - intl_error_set_code(NULL, ustatus2); - 
intl_error_set_custom_msg(NULL, "Error usearch_open", 0); - ubrk_close(bi1); - ubrk_close(bi2); - - efree(ustring1); - efree(ustring2); - - efree(p1); - efree(p2); - RETURN_FALSE; - } - usrch_pos = usearch_first(srch, &ustatus2); - if (U_FAILURE(ustatus2)) { - intl_error_set_code(NULL, ustatus2); - intl_error_set_custom_msg(NULL, "Error usearch_first", 0); - ubrk_close(bi1); - ubrk_close(bi2); - - efree(ustring1); - efree(ustring2); - - efree(p1); - efree(p2); - RETURN_FALSE; - } - usearch_close(srch); - - if (usrch_pos != USEARCH_DONE) { + if (ucol_strcoll(collator, ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2) == UCOL_EQUAL) { + c0 = p1[i2]; c0 = p1[i2]; } else { c0 = p1[i2] + cost_rep; @@ -1118,6 +1120,8 @@ PHP_FUNCTION(grapheme_levenshtein) p2 = tmp; } + ucol_close(collator); + ubrk_close(bi1); ubrk_close(bi2); diff --git a/ext/intl/php_intl_arginfo.h b/ext/intl/php_intl_arginfo.h index bf016abf99dcb..ed4bdcded94be 100644 --- a/ext/intl/php_intl_arginfo.h +++ b/ext/intl/php_intl_arginfo.h @@ -1,5 +1,5 @@ /* This is a generated file, edit the .stub.php file instead. 
- * Stub hash: 4fb44fc170e74af2e9fb52c5a1029004f708fcda */ + * Stub hash: adcf3b6ef720a518087efedbe2b62b10ad4b2624 */ ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1) ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null") @@ -489,6 +489,14 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1") ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_levenshtein, 0, 2, MAY_BE_LONG|MAY_BE_FALSE) + ZEND_ARG_TYPE_INFO(0, string1, IS_STRING, 0) + ZEND_ARG_TYPE_INFO(0, string2, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, insertion_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, replacement_cost, IS_LONG, 0, "1") + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, deletion_cost, IS_LONG, 0, "1") +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE) ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0) @@ -903,6 +911,7 @@ ZEND_FUNCTION(grapheme_substr); ZEND_FUNCTION(grapheme_strstr); ZEND_FUNCTION(grapheme_stristr); ZEND_FUNCTION(grapheme_str_split); +ZEND_FUNCTION(grapheme_levenshtein); ZEND_FUNCTION(grapheme_extract); ZEND_FUNCTION(idn_to_ascii); ZEND_FUNCTION(idn_to_utf8); @@ -1091,6 +1100,7 @@ static const zend_function_entry ext_functions[] = { ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr) ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr) ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split) + ZEND_FE(grapheme_levenshtein, arginfo_grapheme_levenshtein) ZEND_FE(grapheme_extract, arginfo_grapheme_extract) ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii) ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8)