From 3cf432798e4e793c954fcfbb2bb1685d115b644d Mon Sep 17 00:00:00 2001
From: Alex Dowad
Date: Sat, 25 Jun 2022 09:06:39 +0200
Subject: [PATCH] Fix new conversion filter for CP50220 (multi-codepoint kana at end of buffer)

If two codepoints which needed to be collapsed into a single kuten code
were separated, with one at the end of one buffer and the other at the
beginning of the next buffer, they were not converted correctly.
This was discovered while fuzzing the new implementation of
mb_decode_numericentity.
---
 .../libmbfl/filters/mbfilter_cp5022x.c   | 20 +++++++++++++++++--
 ext/mbstring/tests/cp5022x_encoding.phpt |  8 ++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
index ff5cfb43086..f9e64c32589 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
@@ -847,11 +847,27 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
 	MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
 	bool consumed = false;
+	uint32_t w;
+
+	if (buf->state & 0xFFFF00) {
+		/* Reprocess cached codepoint */
+		w = buf->state >> 8;
+		buf->state &= 0xFF;
+		goto reprocess_codepoint;
+	}
 
 	while (len--) {
-		uint32_t w = *in++;
+		w = *in++;
+reprocess_codepoint:
 
-		w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
+		if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) {
+			/* This codepoint may need to combine with the next one,
+			 * but the 'next one' will come in a separate buffer */
+			buf->state |= w << 8;
+			break;
+		} else {
+			w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
+		}
 
 		if (consumed) {
 			/* Two successive codepoints were converted into one */
diff --git a/ext/mbstring/tests/cp5022x_encoding.phpt b/ext/mbstring/tests/cp5022x_encoding.phpt
index 04a9c015536..f8e5831ebe2 100644
--- a/ext/mbstring/tests/cp5022x_encoding.phpt
+++ b/ext/mbstring/tests/cp5022x_encoding.phpt
@@ -382,6 +382,14 @@ $converted = mb_convert_encoding("\xff\xff\x00&", 'CP50220', 'UTF-16BE');
 if ($converted !== '?&')
   die("Bad handling of erroneous codepoint followed by good one (got " . bin2hex($converted) . ")");
 
+// In CP50220, two codepoints can be collapsed into a single kuten code in some cases
+// This should work even on a boundary between separately processed buffers
+$shouldCollapse = "\xFF\x76\xFF\x9E";
+$expected = "\x1B\$B%,\x1B(B";
+for ($i = 0; $i < 256; $i++) {
+  convertValidString(str_repeat("\x00a", $i) . $shouldCollapse, str_repeat('a', $i) . $expected, 'UTF-16BE', 'CP50220', false);
+}
+
 ?>
 --EXPECT--
 ASCII support OK