mirror of
https://github.com/php/php-src.git
synced 2024-09-21 09:57:23 +00:00
Add grapheme_str_split function
I noticed that PHP does not have a grapheme-cluster-based str_split function, so I created the grapheme_str_split function. This feature allows strings containing emoji and variation selectors to be split correctly. Co-authored-by: Ayesh Karunaratne <Ayesh@users.noreply.github.com> Close GH-13580
This commit is contained in:
parent
78ccea4e40
commit
44e8301cf6
1
NEWS
1
NEWS
@ -83,6 +83,7 @@ PHP NEWS
|
|||||||
- ValueError if the integer index does not fit in a signed 32 bit integer
|
- ValueError if the integer index does not fit in a signed 32 bit integer
|
||||||
. ResourceBundle::get() now has a tentative return type of:
|
. ResourceBundle::get() now has a tentative return type of:
|
||||||
ResourceBundle|array|string|int|null
|
ResourceBundle|array|string|int|null
|
||||||
|
. Added the new Grapheme function grapheme_str_split. (youkidearitai)
|
||||||
|
|
||||||
- LDAP:
|
- LDAP:
|
||||||
. Added LDAP_OPT_X_TLS_PROTOCOL_MAX/LDAP_OPT_X_TLS_PROTOCOL_TLS1_3
|
. Added LDAP_OPT_X_TLS_PROTOCOL_MAX/LDAP_OPT_X_TLS_PROTOCOL_TLS1_3
|
||||||
|
@ -435,6 +435,8 @@ PHP 8.4 UPGRADE NOTES
|
|||||||
- Intl:
|
- Intl:
|
||||||
. Added IntlDateFormatter::getIanaID()/intltz_get_iana_id() to
|
. Added IntlDateFormatter::getIanaID()/intltz_get_iana_id() to
|
||||||
the IANA identifier from a given timezone.
|
the IANA identifier from a given timezone.
|
||||||
|
. Added grapheme_str_split, which correctly splits strings containing emoji and Variation
|
||||||
|
Selectors.
|
||||||
|
|
||||||
- MBString:
|
- MBString:
|
||||||
. Added mb_trim, mb_ltrim and mb_rtrim functions.
|
. Added mb_trim, mb_ltrim and mb_rtrim functions.
|
||||||
|
@ -816,4 +816,82 @@ PHP_FUNCTION(grapheme_extract)
|
|||||||
RETURN_STRINGL(((char *)pstr), ret_pos);
|
RETURN_STRINGL(((char *)pstr), ret_pos);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PHP_FUNCTION(grapheme_str_split)
|
||||||
|
{
|
||||||
|
char *pstr, *end;
|
||||||
|
zend_string *str;
|
||||||
|
zend_long split_len = 1;
|
||||||
|
|
||||||
|
unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
|
||||||
|
UErrorCode ustatus = U_ZERO_ERROR;
|
||||||
|
int32_t pos, current, i, end_len = 0;
|
||||||
|
UBreakIterator* bi;
|
||||||
|
UText *ut = NULL;
|
||||||
|
|
||||||
|
ZEND_PARSE_PARAMETERS_START(1, 2)
|
||||||
|
Z_PARAM_STR(str)
|
||||||
|
Z_PARAM_OPTIONAL
|
||||||
|
Z_PARAM_LONG(split_len)
|
||||||
|
ZEND_PARSE_PARAMETERS_END();
|
||||||
|
|
||||||
|
if (split_len <= 0 || split_len > UINT_MAX / 4) {
|
||||||
|
zend_argument_value_error(2, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
|
||||||
|
RETURN_THROWS();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ZSTR_LEN(str) == 0) {
|
||||||
|
RETURN_EMPTY_ARRAY();
|
||||||
|
}
|
||||||
|
|
||||||
|
pstr = ZSTR_VAL(str);
|
||||||
|
ut = utext_openUTF8(ut, pstr, ZSTR_LEN(str), &ustatus);
|
||||||
|
|
||||||
|
if ( U_FAILURE( ustatus ) ) {
|
||||||
|
/* Set global error code. */
|
||||||
|
intl_error_set_code( NULL, ustatus );
|
||||||
|
|
||||||
|
/* Set error messages. */
|
||||||
|
intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
|
||||||
|
|
||||||
|
RETURN_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
bi = NULL;
|
||||||
|
ustatus = U_ZERO_ERROR;
|
||||||
|
bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &ustatus );
|
||||||
|
|
||||||
|
if( U_FAILURE(ustatus) ) {
|
||||||
|
RETURN_FALSE;
|
||||||
|
}
|
||||||
|
|
||||||
|
ubrk_setUText(bi, ut, &ustatus);
|
||||||
|
|
||||||
|
pos = 0;
|
||||||
|
array_init(return_value);
|
||||||
|
|
||||||
|
for (end = pstr, i = 0, current = 0; pos != UBRK_DONE;) {
|
||||||
|
end_len = pos - current;
|
||||||
|
pos = ubrk_next(bi);
|
||||||
|
|
||||||
|
if (i == split_len - 1) {
|
||||||
|
if ( pos != UBRK_DONE ) {
|
||||||
|
add_next_index_stringl(return_value, pstr, pos - current);
|
||||||
|
end = pstr + pos - current;
|
||||||
|
i = 0;
|
||||||
|
}
|
||||||
|
pstr += pos - current;
|
||||||
|
current = pos;
|
||||||
|
} else {
|
||||||
|
i += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i != 0 && end_len != 0) {
|
||||||
|
add_next_index_stringl(return_value, end, end_len);
|
||||||
|
}
|
||||||
|
|
||||||
|
utext_close(ut);
|
||||||
|
ubrk_close(bi);
|
||||||
|
}
|
||||||
|
|
||||||
/* }}} */
|
/* }}} */
|
||||||
|
@ -445,6 +445,8 @@ function grapheme_strstr(string $haystack, string $needle, bool $beforeNeedle =
|
|||||||
|
|
||||||
function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {}
|
function grapheme_stristr(string $haystack, string $needle, bool $beforeNeedle = false): string|false {}
|
||||||
|
|
||||||
|
function grapheme_str_split(string $string, int $length = 1): array|false {}
|
||||||
|
|
||||||
/** @param int $next */
|
/** @param int $next */
|
||||||
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
|
function grapheme_extract(string $haystack, int $size, int $type = GRAPHEME_EXTR_COUNT, int $offset = 0, &$next = null): string|false {}
|
||||||
|
|
||||||
|
9
ext/intl/php_intl_arginfo.h
generated
9
ext/intl/php_intl_arginfo.h
generated
@ -1,5 +1,5 @@
|
|||||||
/* This is a generated file, edit the .stub.php file instead.
|
/* This is a generated file, edit the .stub.php file instead.
|
||||||
* Stub hash: 08c4caf706645f1afaa8d40c3887f01025ced930 */
|
* Stub hash: b45ef763d82e1ad9ab27336fd0ab95e2d2e79a90 */
|
||||||
|
|
||||||
ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1)
|
ZEND_BEGIN_ARG_WITH_RETURN_OBJ_INFO_EX(arginfo_intlcal_create_instance, 0, 0, IntlCalendar, 1)
|
||||||
ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null")
|
ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, timezone, "null")
|
||||||
@ -484,6 +484,11 @@ ZEND_END_ARG_INFO()
|
|||||||
|
|
||||||
#define arginfo_grapheme_stristr arginfo_grapheme_strstr
|
#define arginfo_grapheme_stristr arginfo_grapheme_strstr
|
||||||
|
|
||||||
|
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_str_split, 0, 1, MAY_BE_ARRAY|MAY_BE_FALSE)
|
||||||
|
ZEND_ARG_TYPE_INFO(0, string, IS_STRING, 0)
|
||||||
|
ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, length, IS_LONG, 0, "1")
|
||||||
|
ZEND_END_ARG_INFO()
|
||||||
|
|
||||||
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE)
|
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_grapheme_extract, 0, 2, MAY_BE_STRING|MAY_BE_FALSE)
|
||||||
ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0)
|
ZEND_ARG_TYPE_INFO(0, haystack, IS_STRING, 0)
|
||||||
ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0)
|
ZEND_ARG_TYPE_INFO(0, size, IS_LONG, 0)
|
||||||
@ -901,6 +906,7 @@ ZEND_FUNCTION(grapheme_strripos);
|
|||||||
ZEND_FUNCTION(grapheme_substr);
|
ZEND_FUNCTION(grapheme_substr);
|
||||||
ZEND_FUNCTION(grapheme_strstr);
|
ZEND_FUNCTION(grapheme_strstr);
|
||||||
ZEND_FUNCTION(grapheme_stristr);
|
ZEND_FUNCTION(grapheme_stristr);
|
||||||
|
ZEND_FUNCTION(grapheme_str_split);
|
||||||
ZEND_FUNCTION(grapheme_extract);
|
ZEND_FUNCTION(grapheme_extract);
|
||||||
ZEND_FUNCTION(idn_to_ascii);
|
ZEND_FUNCTION(idn_to_ascii);
|
||||||
ZEND_FUNCTION(idn_to_utf8);
|
ZEND_FUNCTION(idn_to_utf8);
|
||||||
@ -1093,6 +1099,7 @@ static const zend_function_entry ext_functions[] = {
|
|||||||
ZEND_FE(grapheme_substr, arginfo_grapheme_substr)
|
ZEND_FE(grapheme_substr, arginfo_grapheme_substr)
|
||||||
ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr)
|
ZEND_FE(grapheme_strstr, arginfo_grapheme_strstr)
|
||||||
ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr)
|
ZEND_FE(grapheme_stristr, arginfo_grapheme_stristr)
|
||||||
|
ZEND_FE(grapheme_str_split, arginfo_grapheme_str_split)
|
||||||
ZEND_FE(grapheme_extract, arginfo_grapheme_extract)
|
ZEND_FE(grapheme_extract, arginfo_grapheme_extract)
|
||||||
ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii)
|
ZEND_FE(idn_to_ascii, arginfo_idn_to_ascii)
|
||||||
ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8)
|
ZEND_FE(idn_to_utf8, arginfo_idn_to_utf8)
|
||||||
|
BIN
ext/intl/tests/grapheme_str_split.phpt
Normal file
BIN
ext/intl/tests/grapheme_str_split.phpt
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user