Fix new conversion filter for CP50220 (multi-codepoint kana at end of buffer)

If two codepoints that needed to be collapsed into a single kuten code
were split across buffers, with one at the end of one buffer and the
other at the beginning of the next, they were not converted correctly.
This was discovered while fuzzing the new implementation of
mb_decode_numericentity.
This commit is contained in:
Alex Dowad 2022-06-25 09:06:39 +02:00
parent 7559bf77d2
commit 3cf432798e
2 changed files with 26 additions and 2 deletions

View File

@ -847,11 +847,27 @@ static void mb_wchar_to_cp50220(uint32_t *in, size_t len, mb_convert_buf *buf, b
MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
bool consumed = false;
uint32_t w;
if (buf->state & 0xFFFF00) {
/* Reprocess cached codepoint */
w = buf->state >> 8;
buf->state &= 0xFF;
goto reprocess_codepoint;
}
while (len--) {
uint32_t w = *in++;
w = *in++;
reprocess_codepoint:
w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
if (w >= 0xFF61 && w <= 0xFF9F && !len && !end) {
/* This codepoint may need to combine with the next one,
* but the 'next one' will come in a separate buffer */
buf->state |= w << 8;
break;
} else {
w = mbfl_convert_kana(w, len ? *in : 0, &consumed, NULL, MBFL_HAN2ZEN_KATAKANA | MBFL_HAN2ZEN_GLUE);
}
if (consumed) {
/* Two successive codepoints were converted into one */

View File

@ -382,6 +382,14 @@ $converted = mb_convert_encoding("\xff\xff\x00&", 'CP50220', 'UTF-16BE');
if ($converted !== '?&')
die("Bad handling of erroneous codepoint followed by good one (got " . bin2hex($converted) . ")");
// In CP50220, two codepoints can be collapsed into a single kuten code in some cases
// This should work even on a boundary between separately processed buffers
// Input is UTF-16BE: U+FF76 followed by U+FF9E (both fall in the 0xFF61-0xFF9F range)
$shouldCollapse = "\xFF\x76\xFF\x9E";
// Expected output: ESC $ B, then the two bytes "%,", then ESC ( B
$expected = "\x1B\$B%,\x1B(B";
// Prefix the pair with 0..255 'a' characters so that it is tried at every one of those offsets
// NOTE(review): presumably at least one offset puts the pair across the converter's
// internal buffer boundary -- confirm against the actual buffer size
for ($i = 0; $i < 256; $i++) {
convertValidString(str_repeat("\x00a", $i) . $shouldCollapse, str_repeat('a', $i) . $expected, 'UTF-16BE', 'CP50220', false);
}
?>
--EXPECT--
ASCII support OK