Optimize mb_str{,im}width for performance

Rather than doing a linear search of the table of fullwidth codepoint
ranges for every input character, the width lookup now works as follows:

1) Short-cut the search if the codepoint is below the first such range
2) Otherwise, do a binary (rather than linear) search
This commit is contained in:
Alex Dowad 2021-09-22 21:13:12 +02:00
parent f4365d2c26
commit 0b32a15eb0
3 changed files with 28 additions and 19 deletions

View File

@ -14,6 +14,8 @@
* which should be displayed as double-width.
*/
#define FIRST_DOUBLEWIDTH_CODEPOINT 0x1100
static const struct {
int begin;
int end;

View File

@ -1203,31 +1203,33 @@ mbfl_strcut(
return result;
}
/*
* strwidth
*/
/* NOTE(review): this hunk looks like a diff rendered without +/- markers, so
* lines of the old is_fullwidth() (linear scan, returning 0 or 1) appear
* interleaved with lines of the new character_width() (binary search,
* returning 1 or 2). Confirm against the original commit before treating
* this text as compilable code. */
static size_t is_fullwidth(int c)
/* Some East Asian characters, when printed at a terminal (or the like), require double
* the usual amount of horizontal space. We call these "fullwidth" characters.
*
* character_width() yields the display width of codepoint `c` in columns:
* 2 if `c` lies inside one of the [begin, end] ranges of mbfl_eaw_table,
* 1 otherwise. */
static size_t character_width(int c)
{
int i;
if (c < mbfl_eaw_table[0].begin) {
return 0;
/* Fast path: anything below FIRST_DOUBLEWIDTH_CODEPOINT is reported as
* single-width without consulting the table at all. */
if (c < FIRST_DOUBLEWIDTH_CODEPOINT) {
return 1;
}
for (i = 0; i < sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]); i++) {
if (mbfl_eaw_table[i].begin <= c && c <= mbfl_eaw_table[i].end) {
return 1;
/* Do a binary search to see if we fall in any of the fullwidth ranges.
* The search window is the half-open index interval [lo, hi).
* NOTE(review): this presumes the table entries are sorted by ascending
* codepoint and do not overlap -- confirm against the generated table. */
int lo = 0, hi = sizeof(mbfl_eaw_table) / sizeof(mbfl_eaw_table[0]);
while (lo < hi) {
int probe = (lo + hi) / 2;
if (c < mbfl_eaw_table[probe].begin) {
/* c precedes the probed range: keep searching the lower half */
hi = probe;
} else if (c > mbfl_eaw_table[probe].end) {
/* c follows the probed range: keep searching the upper half */
lo = probe + 1;
} else {
/* begin <= c <= end: c is inside a fullwidth range */
return 2;
}
}
return 0;
/* The window emptied without a hit: no range contains c */
return 1;
}
/* Adds the display width of codepoint `c` to the size_t accumulator that
* `data` points to, and always returns 0.
*
* NOTE(review): the old and new forms of the signature and of the
* accumulation statement both appear below because this is a diff rendered
* without +/- markers; only one of each pair exists in the real file. */
static int
filter_count_width(int c, void* data)
static int filter_count_width(int c, void* data)
{
/* Old form: map the is_fullwidth() flag to a width of 2 or 1 */
(*(size_t *)data) += (is_fullwidth(c) ? 2: 1);
/* New form: character_width() is added to the accumulator directly */
(*(size_t *)data) += character_width(c);
return 0;
}
@ -1289,7 +1291,7 @@ collector_strimwidth(int c, void* data)
break;
default:
if (pc->outchar >= pc->from) {
pc->outwidth += (is_fullwidth(c) ? 2: 1);
pc->outwidth += character_width(c);
if (pc->outwidth > pc->width) {
if (pc->status == 0) {

View File

@ -700,7 +700,7 @@ function generateMPH(array $map, bool $fast) {
}
function generateEastAsianWidthData(array $wideRanges) {
$result = <<<'HEADER'
$result = <<<'HEADER'
/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
*
* DO NOT EDIT THIS FILE!
@ -717,12 +717,17 @@ function generateEastAsianWidthData(array $wideRanges) {
* which should be displayed as double-width.
*/
HEADER;
$result .= "\n#define FIRST_DOUBLEWIDTH_CODEPOINT 0x" . dechex($wideRanges[0]->start) . "\n\n";
$result .= <<<'TABLESTART'
static const struct {
int begin;
int end;
} mbfl_eaw_table[] = {
HEADER;
TABLESTART;
foreach ($wideRanges as $range) {
$startCode = dechex($range->start);