Skip to content

Add grapheme_levenshtein function. #18087

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
217 changes: 217 additions & 0 deletions ext/intl/grapheme/grapheme_string.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <unicode/ucol.h>
#include <unicode/ustring.h>
#include <unicode/ubrk.h>
#include <unicode/usearch.h>

/* }}} */

Expand Down Expand Up @@ -918,4 +919,220 @@ PHP_FUNCTION(grapheme_str_split)
ubrk_close(bi);
}

PHP_FUNCTION(grapheme_levenshtein)
{
zend_string *string1, *string2;
zend_long cost_ins = 1;
zend_long cost_rep = 1;
zend_long cost_del = 1;

ZEND_PARSE_PARAMETERS_START(2, 5)
Z_PARAM_STR(string1)
Z_PARAM_STR(string2)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(cost_ins)
Z_PARAM_LONG(cost_rep)
Z_PARAM_LONG(cost_del)
ZEND_PARSE_PARAMETERS_END();

if (cost_ins <= 0 || cost_ins > UINT_MAX / 4) {
zend_argument_value_error(3, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

if (cost_rep <= 0 || cost_rep > UINT_MAX / 4) {
zend_argument_value_error(4, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

if (cost_del <= 0 || cost_del > UINT_MAX / 4) {
zend_argument_value_error(5, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}

zend_long c0, c1, c2;
zend_long retval;
size_t i2;
char *pstr1, *pstr2;

UChar *ustring1 = NULL;
UChar *ustring2 = NULL;

int32_t ustring1_len = 0;
int32_t ustring2_len = 0;

UErrorCode ustatus = U_ZERO_ERROR;

/* When all costs are equal, levenshtein fulfills the requirements of a metric, which means
* that the distance is symmetric. If string1 is shorter than string2 we can save memory (and CPU time)
* by having shorter rows (p1 & p2). */
if (ZSTR_LEN(string1) < ZSTR_LEN(string2) && cost_ins == cost_rep && cost_rep == cost_del) {
zend_string *tmp = string1;
string1 = string2;
string2 = tmp;
}

pstr1 = ZSTR_VAL(string1);
pstr2 = ZSTR_VAL(string2);

intl_convert_utf8_to_utf16(&ustring1, &ustring1_len, pstr1, ZSTR_LEN(string1), &ustatus);

if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
efree(ustring1);
RETURN_FALSE;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason that false is being returned on failure, rather than throwing an exception?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, Userland can uses intl_get_error_message function after this function if this error. Therefore, this block is returns false.

}

intl_convert_utf8_to_utf16(&ustring2, &ustring2_len, pstr2, ZSTR_LEN(string2), &ustatus);

if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error converting input string to UTF-16", 0);
efree(ustring2);
efree(ustring1);
RETURN_FALSE;
}

UBreakIterator *bi1, *bi2;

int32_t strlen_1, strlen_2;
strlen_1 = grapheme_split_string(ustring1, ustring1_len, NULL, 0);
strlen_2 = grapheme_split_string(ustring2, ustring2_len, NULL, 0);

if (strlen_1 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_2 * cost_ins);
}
if (strlen_2 == 0) {
efree(ustring1);
efree(ustring2);
RETURN_LONG(strlen_1 * cost_del);
}

unsigned char u_break_iterator_buffer1[U_BRK_SAFECLONE_BUFFERSIZE];
unsigned char u_break_iterator_buffer2[U_BRK_SAFECLONE_BUFFERSIZE];
bi1 = grapheme_get_break_iterator((void*)u_break_iterator_buffer1, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #1 ($string1)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi1);
RETURN_FALSE;
}

bi2 = grapheme_get_break_iterator(u_break_iterator_buffer2, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);
intl_error_set_custom_msg(NULL, "Error on grapheme_get_break_iterator for argument #2 ($string2)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}
ubrk_setText(bi1, ustring1, ustring1_len, &ustatus);

if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #1 ($string1)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}

ubrk_setText(bi2, ustring2, ustring2_len, &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error on ubrk_setText for argument #2 ($string2)", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
RETURN_FALSE;
}
UCollator *collator = ucol_open("", &ustatus);
if (U_FAILURE(ustatus)) {
intl_error_set_code(NULL, ustatus);

intl_error_set_custom_msg(NULL, "Error on ucol_open", 0);
efree(ustring2);
efree(ustring1);
ubrk_close(bi2);
ubrk_close(bi1);
ucol_close(collator);
RETURN_FALSE;
}

zend_long *p1, *p2, *tmp;
p1 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);
p2 = safe_emalloc(strlen_2 + 1, sizeof(zend_long), 0);

for (i2 = 0; i2 <= strlen_2; i2++) {
p1[i2] = i2 * cost_ins;
}

int32_t current1 = 0;
int32_t current2 = 0;
int32_t pos1 = 0;
int32_t pos2 = 0;

while (true) {
current1 = ubrk_current(bi1);
pos1 = ubrk_next(bi1);
if (pos1 == UBRK_DONE) {
break;
}
p2[0] = p1[0] + cost_del;
for (i2 = 0, pos2 = 0; pos2 != UBRK_DONE; i2++) {
current2 = ubrk_current(bi2);
pos2 = ubrk_next(bi2);
if (pos2 == UBRK_DONE) {
break;
}
if (ucol_strcoll(collator, ustring1 + current1, pos1 - current1, ustring2 + current2, pos2 - current2) == UCOL_EQUAL) {
c0 = p1[i2];
c0 = p1[i2];
} else {
c0 = p1[i2] + cost_rep;
}
c1 = p1[i2 + 1] + cost_del;
if (c1 < c0) {
c0 = c1;
}
c2 = p2[i2] + cost_ins;
if (c2 < c0) {
c0 = c2;
}
p2[i2 + 1] = c0;
}
ubrk_first(bi2);
tmp = p1;
p1 = p2;
p2 = tmp;
}

ucol_close(collator);

ubrk_close(bi1);
ubrk_close(bi2);

efree(ustring1);
efree(ustring2);

retval = p1[strlen_2];

efree(p1);
efree(p2);
RETURN_LONG(retval);
}

/* }}} */
2 changes: 2 additions & 0 deletions ext/intl/php_intl.stub.php
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,8 @@ function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle =

function grapheme_str_split(string $string, int $length = 1): array|false {}

// Stub for grapheme_levenshtein(): takes two strings plus optional insertion,
// replacement and deletion costs (each defaulting to 1) and is declared to
// return either an int or false.
function grapheme_levenshtein(string $string1, string $string2, int $insertion_cost = 1, int $replacement_cost = 1, int $deletion_cost = 1): int|false {}

/** @param int $next */
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}

Expand Down
12 changes: 11 additions & 1 deletion ext/intl/php_intl_arginfo.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading