Add grapheme_str_split function

I noticed that PHP does not have a grapheme cluster based str_split function.
So I created the grapheme_str_split function.

This feature will allow you to correctly handle emoji
and variation selectors.

Co-authored-by: Ayesh Karunaratne <Ayesh@users.noreply.github.com>

Close GH-13580
This commit is contained in:
Yuya Hamada 2024-03-03 01:09:26 +09:00 committed by David Carlier
parent 78ccea4e40
commit 44e8301cf6
No known key found for this signature in database
GPG Key ID: CEF290BB40D2086B
6 changed files with 91 additions and 1 deletions

1
NEWS
View File

@ -83,6 +83,7 @@ PHP NEWS
- ValueError if the integer index does not fit in a signed 32 bit integer - ValueError if the integer index does not fit in a signed 32 bit integer
. ResourceBundle::get() now has a tentative return type of: . ResourceBundle::get() now has a tentative return type of:
ResourceBundle|array|string|int|null ResourceBundle|array|string|int|null
. Added the new Grapheme function grapheme_str_split. (youkidearitai)
- LDAP: - LDAP:
. Added LDAP_OPT_X_TLS_PROTOCOL_MAX/LDAP_OPT_X_TLS_PROTOCOL_TLS1_3 . Added LDAP_OPT_X_TLS_PROTOCOL_MAX/LDAP_OPT_X_TLS_PROTOCOL_TLS1_3

View File

@ -435,6 +435,8 @@ PHP 8.4 UPGRADE NOTES
- Intl: - Intl:
. Added IntlDateFormatter::getIanaID()/intltz_get_iana_id() to . Added IntlDateFormatter::getIanaID()/intltz_get_iana_id() to
the IANA identifier from a given timezone. the IANA identifier from a given timezone.
. Added grapheme_str_split, which splits a string by grapheme cluster and
so correctly supports emoji and Variation Selectors.
- MBString: - MBString:
. Added mb_trim, mb_ltrim and mb_rtrim functions. . Added mb_trim, mb_ltrim and mb_rtrim functions.

View File

@ -816,4 +816,82 @@ PHP_FUNCTION(grapheme_extract)
RETURN_STRINGL(((char *)pstr), ret_pos); RETURN_STRINGL(((char *)pstr), ret_pos);
} }
PHP_FUNCTION(grapheme_str_split)
{
char *pstr, *end;
zend_string *str;
zend_long split_len = 1;
unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
UErrorCode ustatus = U_ZERO_ERROR;
int32_t pos, current, i, end_len = 0;
UBreakIterator* bi;
UText *ut = NULL;
ZEND_PARSE_PARAMETERS_START(1, 2)
Z_PARAM_STR(str)
Z_PARAM_OPTIONAL
Z_PARAM_LONG(split_len)
ZEND_PARSE_PARAMETERS_END();
if (split_len <= 0 || split_len > UINT_MAX / 4) {
zend_argument_value_error(2, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
RETURN_THROWS();
}
if (ZSTR_LEN(str) == 0) {
RETURN_EMPTY_ARRAY();
}
pstr = ZSTR_VAL(str);
ut = utext_openUTF8(ut, pstr, ZSTR_LEN(str), &ustatus);
if ( U_FAILURE( ustatus ) ) {
/* Set global error code. */
intl_error_set_code( NULL, ustatus );
/* Set error messages. */
intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
RETURN_FALSE;
}
bi = NULL;
ustatus = U_ZERO_ERROR;
bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &ustatus );
if( U_FAILURE(ustatus) ) {
RETURN_FALSE;
}
ubrk_setUText(bi, ut, &ustatus);
pos = 0;
array_init(return_value);
for (end = pstr, i = 0, current = 0; pos != UBRK_DONE;) {
end_len = pos - current;
pos = ubrk_next(bi);
if (i == split_len - 1) {
if ( pos != UBRK_DONE ) {
add_next_index_stringl(return_value, pstr, pos - current);
end = pstr + pos - current;
i = 0;
}
pstr += pos - current;
current = pos;
} else {
i += 1;
}
}
if (i != 0 && end_len != 0) {
add_next_index_stringl(return_value, end, end_len);
}
utext_close(ut);
ubrk_close(bi);
}
/* }}} */ /* }}} */

View File

@ -445,6 +445,8 @@ function grapheme_strstr(string $haystack, string $needle, bool $beforeNeedle =
function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {} function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {}
function grapheme_str_split(string $string, int $length = 1): array|false {}
/** @param int $next */ /** @param int $next */
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {} function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}

View File

@ -1,5 +1,5 @@
/* This is a generated file, edit the .stub.php file instead. /* This is a generated file, edit the .stub.php file instead.
* Stub hash: 08c4caf706645f1afaa8d40c3887f01025ced930 */ * Stub hash: b45ef763d82e1ad9ab27336fd0ab95e2d2e79a90 */
ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1) ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1)
ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null") ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null")
@ -484,6 +484,11 @@ ZEND_END_ARG_INFO()
#define arginfo_grapheme_stristr arginfo_grapheme_strstr #define arginfo_grapheme_stristr arginfo_grapheme_strstr
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE_ARRAY|MAY_BE_FALSE)
ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1")
ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE) ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE)
ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0) ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0)
ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0) ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0)
@ -901,6 +906,7 @@ ZEND_FUNCTION(grapheme_strripos);
ZEND_FUNCTION(grapheme_substr); ZEND_FUNCTION(grapheme_substr);
ZEND_FUNCTION(grapheme_strstr); ZEND_FUNCTION(grapheme_strstr);
ZEND_FUNCTION(grapheme_stristr); ZEND_FUNCTION(grapheme_stristr);
ZEND_FUNCTION(grapheme_str_split);
ZEND_FUNCTION(grapheme_extract); ZEND_FUNCTION(grapheme_extract);
ZEND_FUNCTION(idn_to_ascii); ZEND_FUNCTION(idn_to_ascii);
ZEND_FUNCTION(idn_to_utf8); ZEND_FUNCTION(idn_to_utf8);
@ -1093,6 +1099,7 @@ static const zend_function_entry ext_functions[] = {
ZEND_FE(grapheme_substr, arginfo_grapheme_substr) ZEND_FE(grapheme_substr, arginfo_grapheme_substr)
ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr) ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr)
ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr) ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr)
ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split)
ZEND_FE(grapheme_extract, arginfo_grapheme_extract) ZEND_FE(grapheme_extract, arginfo_grapheme_extract)
ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii) ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii)
ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8) ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8)

Binary file not shown.