Use fast conversion filters to implement php_mb_ord

Even for single-character strings, this is about 50% faster for
ASCII, UTF-8, and UTF-16. For long strings, the performance gain is
enormous, since the old code would convert the ENTIRE string, just
to pick out the first codepoint.
This commit is contained in:
Alex Dowad 2022-05-13 22:03:44 +02:00
parent 9468fa7ff2
commit 880803a21e
2 changed files with 13 additions and 21 deletions

View File

@ -143,6 +143,10 @@ typedef struct {
/* Function pointer type for routines which convert encoded text to wchars (Unicode codepoints).
 * `in` and `in_len` are passed by pointer, `out` is a wchar buffer with room for `out_len` entries,
 * and `state` points to an unsigned int owned by the caller.
 * NOTE(review): the return value looks like the number of wchars written to `out`, and `in`/`in_len`
 * are presumably updated to reflect how much input was consumed; confirm against an implementation. */
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
/* Function pointer type which receives `in_len` wchars in `in` plus an `mb_convert_buf` for output.
 * NOTE(review): judging by the name, this is the reverse direction (wchars to encoded text), and `end`
 * presumably marks the final batch of input; confirm against an implementation. */
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
* the buffer must be at least this size (to work with all supported text encodings).
* The size is counted in wchar (uint32_t) entries, not bytes. */
#define MBSTRING_MIN_WCHAR_BUFSIZE 5
static inline void mb_convert_buf_init(mb_convert_buf *buf, size_t initsize, uint32_t repl_char, unsigned int err_mode)
{
buf->state = buf->errors = 0;

View File

@ -3993,29 +3993,17 @@ static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string
return -2;
}
{
mbfl_wchar_device dev;
mbfl_convert_filter *filter;
zend_long cp;
/* Some legacy text encodings have a minimum required wchar buffer size;
* the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
unsigned int state = 0;
size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);
mbfl_wchar_device_init(&dev);
filter = mbfl_convert_filter_new(enc, &mbfl_encoding_wchar, mbfl_wchar_device_output, 0, &dev);
/* If this assertion fails this means some memory allocation failure which is a bug */
ZEND_ASSERT(filter != NULL);
mbfl_convert_filter_feed_string(filter, (unsigned char*)str, str_len);
mbfl_convert_filter_flush(filter);
if (dev.pos < 1 || filter->num_illegalchar || dev.buffer[0] == MBFL_BAD_INPUT) {
cp = -1;
} else {
cp = dev.buffer[0];
}
mbfl_convert_filter_delete(filter);
mbfl_wchar_device_clear(&dev);
return cp;
if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
return -1;
}
return wchar_buf[0];
}