diff --git a/NEWS b/NEWS
index e95c2048b5f..0a3c75348ea 100644
--- a/NEWS
+++ b/NEWS
@@ -83,6 +83,7 @@ PHP NEWS
     - ValueError if the integer index does not fit in a signed 32 bit integer
   . ResourceBundle::get() now has a tentative return type of:
     ResourceBundle|array|string|int|null
+  . Added the new Grapheme function grapheme_str_split. (youkidearitai)
 
 - LDAP:
   . Added LDAP_OPT_X_TLS_PROTOCOL_MAX/LDAP_OPT_X_TLS_PROTOCOL_TLS1_3
diff --git a/UPGRADING b/UPGRADING
index 138de74811e..02c2ebafd76 100644
--- a/UPGRADING
+++ b/UPGRADING
@@ -435,6 +435,8 @@ PHP 8.4 UPGRADE NOTES
 - Intl:
   . Added IntlDateFormatter::getIanaID()/intltz_get_iana_id() to
     the IANA identifier from a given timezone.
+  . Added grapheme_str_split, which splits a string by grapheme clusters and
+    supports emoji and Variation Selectors.
 
 - MBString:
   . Added mb_trim, mb_ltrim and mb_rtrim functions.
diff --git a/ext/intl/grapheme/grapheme_string.c b/ext/intl/grapheme/grapheme_string.c
index a9cfd3d2ea6..e75b7ca3b1c 100644
--- a/ext/intl/grapheme/grapheme_string.c
+++ b/ext/intl/grapheme/grapheme_string.c
@@ -816,4 +816,92 @@ PHP_FUNCTION(grapheme_extract)
 	RETURN_STRINGL(((char *)pstr), ret_pos);
 }
 
+/* Split a string into chunks of at most $length grapheme clusters each.
+ *
+ * Returns an array of chunks (an empty array for an empty string), or false
+ * when ICU cannot open the text or set up the grapheme break iterator.
+ * Throws a ValueError when $length is not within 1..UINT_MAX / 4. */
+PHP_FUNCTION(grapheme_str_split)
+{
+	char *pstr;
+	zend_string *str;
+	zend_long split_len = 1;
+
+	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
+	UErrorCode ustatus = U_ZERO_ERROR;
+	int32_t boundary, chunk_start = 0, last_boundary = 0, clusters = 0;
+	UBreakIterator *bi;
+	UText *ut = NULL;
+
+	ZEND_PARSE_PARAMETERS_START(1, 2)
+		Z_PARAM_STR(str)
+		Z_PARAM_OPTIONAL
+		Z_PARAM_LONG(split_len)
+	ZEND_PARSE_PARAMETERS_END();
+
+	if (split_len <= 0 || split_len > UINT_MAX / 4) {
+		zend_argument_value_error(2, "must be greater than 0 and less than or equal to %u", UINT_MAX / 4);
+		RETURN_THROWS();
+	}
+
+	if (ZSTR_LEN(str) == 0) {
+		RETURN_EMPTY_ARRAY();
+	}
+
+	pstr = ZSTR_VAL(str);
+	ut = utext_openUTF8(ut, pstr, ZSTR_LEN(str), &ustatus);
+
+	if (U_FAILURE(ustatus)) {
+		/* Set global error code. */
+		intl_error_set_code(NULL, ustatus);
+
+		/* Set error messages. */
+		intl_error_set_custom_msg(NULL, "Error opening UTF-8 text", 0);
+
+		RETURN_FALSE;
+	}
+
+	ustatus = U_ZERO_ERROR;
+	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &ustatus);
+
+	if (U_FAILURE(ustatus)) {
+		/* The UText was opened successfully above; release it before bailing out. */
+		utext_close(ut);
+		RETURN_FALSE;
+	}
+
+	ubrk_setUText(bi, ut, &ustatus);
+
+	if (U_FAILURE(ustatus)) {
+		intl_error_set_code(NULL, ustatus);
+		intl_error_set_custom_msg(NULL, "Error setting text on break iterator", 0);
+
+		utext_close(ut);
+		ubrk_close(bi);
+		RETURN_FALSE;
+	}
+
+	array_init(return_value);
+
+	/* Walk the grapheme boundaries (offsets into pstr) and emit a chunk
+	 * each time split_len clusters have been collected. */
+	while ((boundary = ubrk_next(bi)) != UBRK_DONE) {
+		last_boundary = boundary;
+
+		if (++clusters == split_len) {
+			add_next_index_stringl(return_value, pstr + chunk_start, boundary - chunk_start);
+			chunk_start = boundary;
+			clusters = 0;
+		}
+	}
+
+	/* Emit the trailing chunk that holds fewer than split_len clusters. */
+	if (clusters != 0) {
+		add_next_index_stringl(return_value, pstr + chunk_start, last_boundary - chunk_start);
+	}
+
+	utext_close(ut);
+	ubrk_close(bi);
+}
+
 /* }}} */
diff --git a/ext/intl/php_intl.stub.php b/ext/intl/php_intl.stub.php
index 07168112410..c03ee482123 100644
--- a/ext/intl/php_intl.stub.php
+++ b/ext/intl/php_intl.stub.php
@@ -445,6 +445,8 @@ function grapheme_strstr(string $haystack, string $needle, bool $beforeNeedle =
 
 function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {}
 
+function grapheme_str_split(string $string, int $length = 1): array|false {}
+
 /** @param int $next */
 function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
 
diff --git a/ext/intl/php_intl_arginfo.h b/ext/intl/php_intl_arginfo.h
index 1f83bf90265..8e632029bf0 100644
--- a/ext/intl/php_intl_arginfo.h
+++ b/ext/intl/php_intl_arginfo.h
@@ -1,5 +1,5 @@
 /* This is a generated file, edit the .stub.php file instead.
- * Stub hash: 08c4caf706645f1afaa8d40c3887f01025ced930 */
+ * Stub hash: b45ef763d82e1ad9ab27336fd0ab95e2d2e79a90 */
 
 ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1)
 	ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null")
@@ -484,6 +484,11 @@ ZEND_END_ARG_INFO()
 
 #define arginfo_grapheme_stristr arginfo_grapheme_strstr
 
+ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE_ARRAY|MAY_BE_FALSE)
+	ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
+	ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1")
+ZEND_END_ARG_INFO()
+
 ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE)
 	ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0)
 	ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0)
@@ -901,6 +906,7 @@ ZEND_FUNCTION(grapheme_strripos);
 ZEND_FUNCTION(grapheme_substr);
 ZEND_FUNCTION(grapheme_strstr);
 ZEND_FUNCTION(grapheme_stristr);
+ZEND_FUNCTION(grapheme_str_split);
 ZEND_FUNCTION(grapheme_extract);
 ZEND_FUNCTION(idn_to_ascii);
 ZEND_FUNCTION(idn_to_utf8);
@@ -1093,6 +1099,7 @@ static const zend_function_entry ext_functions[] = {
 	ZEND_FE(grapheme_substr, arginfo_grapheme_substr)
 	ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr)
 	ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr)
+	ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split)
 	ZEND_FE(grapheme_extract, arginfo_grapheme_extract)
 	ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii)
 	ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8)
diff --git a/ext/intl/tests/grapheme_str_split.phpt b/ext/intl/tests/grapheme_str_split.phpt
new file mode 100644
index 00000000000..a5a3efac18f
Binary files /dev/null and b/ext/intl/tests/grapheme_str_split.phpt differ