mirror of
https://github.com/php/php-src.git
synced 2024-09-21 18:07:23 +00:00
Add fast SSE2-based implementation of mb_strlen for known-valid UTF-8 strings
One small piece of this was obtained from Stack Overflow. According to Stack Overflow's Terms of Service, all user-contributed code on SO is provided under a Creative Commons license. I believe this license is compatible with the code being included in PHP. Benchmarking results (UTF-8 only, for strings which have already been checked using mb_check_encoding): For very short (0-5 byte) strings, mb_strlen is 12% faster. The speedup gets greater and greater on longer input strings; for strings around 100KB, mb_strlen is 23 times faster. Currently the 'fast' code is gated behind a GC flag check which ensures it is only used on strings which have already been checked for UTF-8 validity. This is because the accelerated code will return different results on some invalid UTF-8 strings.
This commit is contained in:
parent
60102c3228
commit
b4cbaabd9b
@ -1715,13 +1715,85 @@ PHP_FUNCTION(mb_str_split)
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __SSE2__
|
||||
/* Horizontal byte sum: treats the 128-bit XMM register `v` as sixteen unsigned 8-bit
 * integers and returns their total in an ordinary scalar register
 *
 * Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r)
 * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector */
static inline uint32_t _mm_sum_epu8(const __m128i v)
{
	/* There is no dedicated instruction for summing the bytes of a single register, but
	 * _mm_sad_epu8 sums the absolute differences between corresponding bytes of two
	 * registers; measured against an all-zero register, those "differences" are simply
	 * the bytes of `v`
	 * The result holds two partial sums, one for each 8-byte half of `v`; each is a 16-bit
	 * value sitting at the bottom of the matching 64-bit half of the result register */
	const __m128i zero = _mm_setzero_si128();
	const __m128i half_sums = _mm_sad_epu8(v, zero);

	/* Each partial sum is at most 8 * 255 = 2040, so an `int` holds either one and their total */
	const int low_sum = _mm_cvtsi128_si32(half_sums);     /* Bottom 32 bits: sum of bytes 0-7 */
	const int high_sum = _mm_extract_epi16(half_sums, 4); /* 16-bit lane 4: sum of bytes 8-15 */

	return low_sum + high_sum;
}
|
||||
#endif
|
||||
|
||||
/* Count the codepoints in the `len` bytes starting at `p`, assuming they are valid UTF-8
 *
 * In UTF-8, the only bytes which do not start a new codepoint are 0x80-0xBF (continuation bytes)
 * Interpreted as signed integers, those are all byte values less than -64
 * A fast way to get the length of a UTF-8 string is to start with its byte length,
 * then subtract off the number of continuation bytes
 *
 * The return value is therefore `len` minus the number of bytes in the range 0x80-0xBF;
 * on invalid UTF-8 this may differ from what a validating decoder would count */
static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len)
{
	unsigned char *e = p + len;

#ifdef __SSE2__
	if (len >= sizeof(__m128i)) {
		const __m128i threshold = _mm_set1_epi8(-64);
		const __m128i delta = _mm_set1_epi8(1);
		__m128i counter = _mm_setzero_si128(); /* Vector of 16 continuation-byte counters */

		int reset_counter = 255;
		do {
			__m128i operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes (unaligned) */
			__m128i lt = _mm_cmplt_epi8(operand, threshold); /* 0xFF in each lane holding a continuation byte */
			counter = _mm_add_epi8(counter, _mm_and_si128(lt, delta)); /* Add 1 to the counters of those lanes */

			/* The counters can only go up to 255, so every 255 iterations, fold them into `len`
			 * and reset them to zero */
			if (--reset_counter == 0) {
				len -= _mm_sum_epu8(counter);
				counter = _mm_setzero_si128();
				reset_counter = 255;
			}

			p += sizeof(__m128i);
			/* Compare the number of bytes left rather than computing `p + 16`; that pointer
			 * could land more than one past the end of the string, which C does not allow
			 * Here `p <= e` always holds, so the subtraction cannot be negative */
		} while ((size_t)(e - p) >= sizeof(__m128i));

		len -= _mm_sum_epu8(counter); /* Fold in any remaining non-zero values in the 16 counters */
	}
#endif

	/* Check for continuation bytes in the 0-15 remaining bytes at the end of the string
	 * (or in the whole string, if the vectorized loop above was not used)
	 * The bit test matches exactly the bytes 0x80-0xBF without relying on how
	 * out-of-range values convert to `signed char` */
	while (p < e) {
		if ((*p++ & 0xC0) == 0x80) {
			len--;
		}
	}

	return len;
}
|
||||
|
||||
static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
|
||||
{
|
||||
unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4);
|
||||
if (char_len) {
|
||||
return ZSTR_LEN(string) / char_len;
|
||||
} else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string) & IS_STR_VALID_UTF8) {
|
||||
return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
|
||||
}
|
||||
|
||||
|
||||
uint32_t wchar_buf[128];
|
||||
unsigned char *in = (unsigned char*)ZSTR_VAL(string);
|
||||
size_t in_len = ZSTR_LEN(string);
|
||||
@ -1789,14 +1861,7 @@ static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char *
|
||||
}
|
||||
|
||||
/* Convert a byte position inside a UTF-8 string to a codepoint offset: returns the number
 * of codepoints which start in the byte range [start, pos), i.e. the number of bytes in
 * that range which are not continuation bytes (0x80-0xBF)
 *
 * `start` and `pos` must point into the same string */
static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) {
	/* An empty or reversed range contains no codepoints; without this check, a reversed
	 * range would turn into an enormous unsigned length below */
	if (pos <= start) {
		return 0;
	}

	/* Counting the bytes which are not continuation bytes is exactly what
	 * mb_fast_strlen_utf8 does, so delegate to it instead of walking the range byte by byte */
	return mb_fast_strlen_utf8(start, (size_t)(pos - start));
}
|
||||
|
||||
static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse)
|
||||
|
@ -62,10 +62,26 @@ mb_internal_encoding('JIS') or print("mb_internal_encoding() failed\n");
|
||||
print strlen($jis) . "\n";
|
||||
|
||||
echo "== UTF-8 ==\n";
|
||||
$utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP');
|
||||
print mb_strlen($utf8,'UTF-8') . "\n";
|
||||
mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
|
||||
print strlen($utf8) . "\n";
|
||||
$utf8 = mb_convert_encoding($euc_jp, 'UTF-8', 'EUC-JP');
|
||||
print mb_strlen($utf8,'UTF-8') . " codepoints\n";
|
||||
mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n");
|
||||
print strlen($utf8) . " bytes\n";
|
||||
|
||||
$utf8 = "abcde あいうえお 汉字 ελληνικά";
|
||||
$long_utf8 = str_repeat($utf8, 100);
|
||||
print mb_strlen($utf8, 'UTF-8') . "\n";
|
||||
print mb_strlen($long_utf8, 'UTF-8') . "\n";
|
||||
|
||||
echo "== UTF-8 with performance optimizations ==\n";
|
||||
// Optimized mb_strlen can be used on UTF-8 strings after they are checked for validity
|
||||
mb_check_encoding($utf8);
|
||||
mb_check_encoding($long_utf8);
|
||||
print mb_strlen($utf8, 'UTF-8') . "\n";
|
||||
print mb_strlen($long_utf8, 'UTF-8') . "\n";
|
||||
|
||||
$str = str_repeat('Σ', 2048); // 2-byte UTF-8 character
|
||||
mb_check_encoding($str, 'UTF-8');
|
||||
print mb_strlen($str, 'UTF-8') . "\n";
|
||||
|
||||
// Wrong Parameters
|
||||
echo "== WRONG PARAMETERS ==\n";
|
||||
@ -110,7 +126,13 @@ try {
|
||||
43
|
||||
90
|
||||
== UTF-8 ==
|
||||
43
|
||||
101
|
||||
43 codepoints
|
||||
101 bytes
|
||||
23
|
||||
2300
|
||||
== UTF-8 with performance optimizations ==
|
||||
23
|
||||
2300
|
||||
2048
|
||||
== WRONG PARAMETERS ==
|
||||
mb_strlen(): Argument #2 ($encoding) must be a valid encoding, "BAD_NAME" given
|
||||
|
Loading…
Reference in New Issue
Block a user