From 5ca019aad3fb39d2aba9c5782cabf784f9fad674 Mon Sep 17 00:00:00 2001 From: Rolland Santimano Date: Thu, 20 Oct 2005 19:25:54 +0000 Subject: [PATCH] - Unicode impl of stristr() --- ext/standard/php_string.h | 1 + ext/standard/string.c | 146 +++++++++++++++++++++++++++++--------- 2 files changed, 113 insertions(+), 34 deletions(-) diff --git a/ext/standard/php_string.h b/ext/standard/php_string.h index 2fbaaf518ed..02e724037a2 100644 --- a/ext/standard/php_string.h +++ b/ext/standard/php_string.h @@ -130,6 +130,7 @@ PHPAPI void php_u_stripslashes(UChar *str, int32_t *len TSRMLS_DC); PHPAPI void php_stripcslashes(char *str, int *len); PHPAPI void php_basename(char *s, size_t len, char *suffix, size_t sufflen, char **p_ret, size_t *p_len TSRMLS_DC); PHPAPI size_t php_dirname(char *str, size_t len); +PHPAPI UChar *php_u_stristr(UChar *s, UChar *t, int32_t s_len, int32_t t_len); PHPAPI char *php_stristr(unsigned char *s, unsigned char *t, size_t s_len, size_t t_len); PHPAPI char *php_str_to_str_ex(char *haystack, int length, char *needle, int needle_len, char *str, int str_len, int *_new_length, int case_sensitivity, int *replace_count); diff --git a/ext/standard/string.c b/ext/standard/string.c index f279cf559a0..cc0133cda02 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -1915,6 +1915,42 @@ PHP_FUNCTION(pathinfo) } /* }}} */ +/* {{{ php_u_stristr + Unicode version of case insensitve strstr */ +PHPAPI UChar *php_u_stristr(UChar *s, UChar *t, int32_t s_len, int32_t t_len) +{ + int32_t i,j, last; + UChar32 ch1, ch2; + + /* Have to do this by hand since lower-casing can change lengths + by changing codepoints, and an offset within the lower-case & + upper-case strings might be different codepoints + */ + i = 0; + while (i <= (s_len-t_len)) { + last = i; + U16_NEXT(s, i, s_len, ch1); + U16_GET(t, 0, 0, t_len, ch2); + if (u_tolower(ch1) == u_tolower(ch2)) { + j = 0; + U16_FWD_1(t, j, t_len); + while (j < t_len) { + U16_NEXT(s, i, s_len, ch1); + U16_NEXT(t, j, t_len, ch2); + if (u_tolower(ch1) != u_tolower(ch2)) { + U16_BACK_1(s, 0, i); + break; + } + } + if (u_tolower(ch1) == u_tolower(ch2)) { + return s+last; + } + } + } + return NULL; +} +/* }}} */ + /* {{{ php_stristr case insensitve strstr */ PHPAPI char *php_stristr(unsigned char *s, unsigned char *t, size_t s_len, size_t t_len) @@ -2005,61 +2041,103 @@ PHPAPI size_t php_strcspn(char *s1, char *s2, char *s1_end, char *s2_end) Finds first occurrence of a string within another, case insensitive */ PHP_FUNCTION(stristr) { - char *haystack; - long haystack_len; - zval *needle; + zval *haystack, *needle; zend_bool part = 0; - char *found = NULL; - int found_offset; - char *haystack_orig; + zend_uchar str_type; char needle_char[2]; - - if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sz|b", &haystack, &haystack_len, &needle, &part) == FAILURE) { + UChar u_needle_char[3]; + int32_t needle_len; + char *haystack_copy; + void *target; + void *found = NULL; + int found_offset; + void *start, *end; + + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "zz|b", &haystack, &needle, &part) == FAILURE) { return; } - + SEPARATE_ZVAL(&haystack); SEPARATE_ZVAL(&needle); + if (Z_TYPE_P(haystack) != IS_UNICODE && Z_TYPE_P(haystack) != IS_BINARY && Z_TYPE_P(haystack) != IS_STRING) { + convert_to_text(haystack); + } - haystack_orig = estrndup(haystack, haystack_len); - - if (Z_TYPE_P(needle) == IS_STRING) { - if (!Z_STRLEN_P(needle)) { + if (Z_TYPE_P(needle) == IS_UNICODE || Z_TYPE_P(needle) == IS_BINARY || Z_TYPE_P(needle) == IS_STRING) { + if (!Z_UNILEN_P(needle)) { php_error_docref(NULL TSRMLS_CC, E_WARNING, "Empty delimiter."); - efree(haystack_orig); RETURN_FALSE; } - - found = php_stristr(haystack, - Z_STRVAL_P(needle), - haystack_len, - Z_STRLEN_P(needle)); + if (Z_TYPE_P(haystack) != Z_TYPE_P(needle)) { + str_type = zend_get_unified_string_type(2 TSRMLS_CC, Z_TYPE_P(haystack), Z_TYPE_P(needle)); + if (str_type == (zend_uchar)-1) { + convert_to_explicit_type(haystack, IS_BINARY); + convert_to_explicit_type(needle, IS_BINARY); + } else { + convert_to_explicit_type(haystack, str_type); + convert_to_explicit_type(needle, str_type); + } + } + target = Z_UNIVAL_P(needle); + needle_len = Z_UNILEN_P(needle); } else { convert_to_long_ex(&needle); - needle_char[0] = (char) Z_LVAL_P(needle); - needle_char[1] = 0; + needle_len = 0; + if (Z_TYPE_P(haystack) == IS_UNICODE) { + if (Z_LVAL_P(needle) < 0 || Z_LVAL_P(needle) > 0x10FFFF) { + php_error(E_WARNING, "Needle argument codepoint value out of range (0 - 0x10FFFF)"); + RETURN_FALSE; + } + if (U_IS_BMP(Z_LVAL_P(needle))) { + u_needle_char[needle_len++] = (UChar)Z_LVAL_P(needle); + u_needle_char[needle_len] = 0; + } else { + u_needle_char[needle_len++] = (UChar)U16_LEAD(Z_LVAL_P(needle)); + u_needle_char[needle_len++] = (UChar)U16_TRAIL(Z_LVAL_P(needle)); + u_needle_char[needle_len] = 0; + } + target = u_needle_char; + } else { + needle_char[needle_len++] = (char)Z_LVAL_P(needle); + needle_char[needle_len] = 0; + target = needle_char; + } + } - found = php_stristr(haystack, - needle_char, - haystack_len, - 1); + if (needle_len > Z_UNILEN_P(haystack)) { + RETURN_FALSE; + } + + if (Z_TYPE_P(haystack) == IS_UNICODE) { + found = php_u_stristr(Z_USTRVAL_P(haystack), (UChar *)target, + Z_USTRLEN_P(haystack), needle_len); + } else { + haystack_copy = estrndup(Z_STRVAL_P(haystack), Z_STRLEN_P(haystack)); + found = php_stristr(Z_STRVAL_P(haystack), (char *)target, + Z_STRLEN_P(haystack), needle_len); } if (found) { - found_offset = found - haystack; - if (part) { - char *ret; - ret = emalloc(found_offset + 1); - strncpy(ret, haystack_orig, found_offset); - ret[found_offset] = '\0'; - RETVAL_STRINGL(ret , found_offset, 0); + if (Z_TYPE_P(haystack) == IS_UNICODE) { + start = part ? Z_USTRVAL_P(haystack) : found; + end = part ? found : (Z_USTRVAL_P(haystack) + Z_USTRLEN_P(haystack)); + RETVAL_UNICODEL((UChar *)start, (UChar *)end-(UChar *)start, 1); } else { - RETVAL_STRINGL(haystack_orig + found_offset, haystack_len - found_offset, 1); + found_offset = (char *)found - Z_STRVAL_P(haystack); + start = part ? haystack_copy : haystack_copy + found_offset; + end = part ? haystack_copy + found_offset : (haystack_copy + Z_STRLEN_P(haystack)); + if (Z_TYPE_P(haystack) == IS_BINARY) { + RETVAL_BINARYL((char *)start, (char *)end-(char *)start, 1); + } else { + RETVAL_STRINGL((char *)start, (char *)end-(char *)start, 1); + } } } else { RETVAL_FALSE; } - efree(haystack_orig); + if (Z_TYPE_P(haystack) != IS_UNICODE) { + efree(haystack_copy); + } } /* }}} */