mirror of
https://github.com/php/php-src.git
synced 2024-09-22 10:27:25 +00:00
776296e12f
Previously, mbstring had a special mode whereby it would convert erroneous input byte sequences to output like "BAD+XXXX", where "XXXX" would be the erroneous bytes expressed in hexadecimal. This mode could be enabled by calling `mb_substitute_character("long")`. However, accurately reproducing input byte sequences from the cached state of a conversion filter is often tricky, and this significantly complicates the implementation. Further, the means used for passing the erroneous bytes through to where the "BAD+XXXX" text is generated only allows for up to 3 bytes to be passed, meaning that some erroneous byte sequences are truncated anyway. More to the point, a search of publicly available PHP code indicates that nobody is really using this feature anyway. Incidentally, this feature also provided error output like "JIS+XXXX" if the input 'should have' represented a JISX 0208 codepoint, but decoded to a codepoint which does not exist in the JISX 0208 charset. Similarly, specific error output was provided for non-existent JISX 0212 codepoints, and likewise for JISX 0213, CP932, and a few other charsets. All of that is now consigned to the flames. However, "long" error markers also include a somewhat more useful "U+XXXX" marker for Unicode codepoints which were successfully decoded from the input text, but cannot be represented in the output encoding. Those are still supported. With this change, there is no need to use a variety of special values in the high bits of a wchar to represent different types of error values. We can (and will) just use a single error value. This will be equal to -1. One complicating factor: Text conversion functions return an integer to indicate whether the conversion operation should be immediately aborted, and the magic 'abort' marker is -1. Also, almost all of these functions would return the received byte/codepoint to indicate success. 
That doesn't work with the new error value; if an input filter detects an error and passes -1 to the output filter, and the output filter returns it back, that would be taken to mean 'abort'. Therefore, amend all these functions to return 0 for success.
228 lines
7.9 KiB
PHP
228 lines
7.9 KiB
PHP
--TEST--
|
|
Exhaustive test of ISO-2022-JP-KDDI text encoding
|
|
--EXTENSIONS--
|
|
mbstring
|
|
--SKIPIF--
|
|
<?php
|
|
/* This is an exhaustive (and therefore slow) test; allow it to be skipped
 * when the SKIP_SLOW_TESTS environment variable is set */
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
|
|
?>
|
|
--FILE--
|
|
<?php
|
|
/* Fixed seed, so the randomized parts of this test are reproducible */
srand(390);

/* Shared test helpers live in this include file
 * (presumably identifyValidString(), convertValidString(), etc. -- they are
 * not defined anywhere in this file) */
include 'encoding_tests.inc';

/* Use '%' (U+0025) as the marker for input which cannot be converted */
mb_substitute_character(0x25);
|
|
|
|
/**
 * Convert CP932's default 2-byte Shift-JIS representation to kuten code.
 *
 * $bytes is the 2-byte Shift-JIS value as one integer (lead byte in bits
 * 8-15, trail byte in bits 0-7). The result packs the row into bits 8-15 and
 * the cell into bits 0-7, each offset by 0x21.
 */
function shiftJISDecode($bytes) {
    $lead  = ($bytes >> 8) & 0xFF;
    $trail = $bytes & 0xFF;

    /* Lead bytes above 0x9F are rebased so that 0xE0 becomes 31;
     * all others are rebased so that 0x81 becomes 0 */
    if ($lead > 0x9F) {
        $pairIndex = $lead - (0xE0 - 31);
    } else {
        $pairIndex = $lead - 0x81;
    }

    /* Every lead byte covers two consecutive rows; the trail byte tells us
     * which of the two rows we are in and the position within that row */
    $row = ($pairIndex << 1) + 0x21;
    if ($trail > 0x9E) {
        /* Second row of the pair */
        $row += 1;
        $cell = $trail - 0x9F + 0x21;
    } elseif ($trail > 0x7F) {
        /* First row, after the gap at 0x7F: 63 cells (0x40-0x7E) came before */
        $cell = $trail - 0x80 + 63 + 0x21;
    } else {
        /* First row, before the gap at 0x7F */
        $cell = $trail - 0x40 + 0x21;
    }

    return ($row << 8) + $cell;
}
|
|
|
|
/* Read in the table of all characters in CP932 */
$cp932Chars = array(); /* 2-byte kuten code (decoded from the CP932 bytes) -> UTF-32BE string */

/* The data file is only ever read, so open it read-only; 'r+' would also
 * demand write permission and fail on a read-only source tree */
$fp = fopen(realpath(__DIR__ . '/data/CP932.txt'), 'r');
/* Compare against false explicitly, so only EOF/error ends the loop */
while (($line = fgets($fp, 256)) !== false) {
    /* Skip comment lines */
    if ($line[0] === '#') {
        continue;
    }

    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) === 2) {
        /* Single-byte characters are not part of the 2-byte table */
        if ($bytes < 256) {
            continue;
        }
        /* For ISO-2022-JP-KDDI, we only accept the first range of MicroSoft
         * vendor extensions, in ku 13 */
        if ($bytes > 0xEAA4) {
            continue;
        }
        $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
    }
}
fclose($fp);
|
|
|
|
/* Add KDDI-specific emoji to the CP932 characters
 * They are mapped in 22 ku (or 'rows') above the places where they are mapped
 * in the Shift-JIS representation of KDDI emoji */

/* The data file is only ever read, so open it read-only; 'r+' would also
 * demand write permission and fail on a read-only source tree */
$fp = fopen(realpath(__DIR__ . '/data/EmojiSources.txt'), 'r');
/* Compare against false explicitly, so only EOF/error ends the loop */
while (($line = fgets($fp, 256)) !== false) {
    /* Skip comment lines */
    if ($line[0] === '#') {
        continue;
    }

    $fields = explode(';', rtrim($line));
    if (count($fields) >= 4) {
        /* The first field holds either one or two hexadecimal codepoints */
        if (sscanf($fields[0], "%x %x", $cp1, $cp2) === 2) {
            $utf32 = pack('N', $cp1) . pack('N', $cp2);
        } else {
            $utf32 = pack('N', hexdec($fields[0]));
        }

        /* Third field: presumably the KDDI Shift-JIS code (empty when the
         * emoji has no KDDI mapping) -- confirm against the data file header */
        if ($fields[2]) {
            $kuten = shiftJISDecode(hexdec($fields[2]));
            $ku = $kuten >> 8;
            /* Only rows in this range are shifted down by 22 rows and added */
            if ($ku >= 106 && $ku <= 112) {
                $cp932Chars[pack('n', $kuten - (22 * 0x100))] = $utf32;
            }
        }
    }
}
fclose($fp);
|
|
|
|
/* These CP932 characters are duplicates of characters found elsewhere in the
 * table, so they are set aside here (keyed by kuten bytes) and later tested
 * in one direction only */
$nonInvertible = array();
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C] as $sjis) {
    $kutenBytes = pack('n', shiftJISDecode($sjis));
    $nonInvertible[$kutenBytes] = $cp932Chars[$kutenBytes];
}
|
|
|
|
/* Read in table of all characters in JISX-0201 charset */
$jisx0201Chars = array(); /* JISX0201 byte -> UTF-32BE string */

/* The data file is only ever read, so open it read-only; 'r+' would also
 * demand write permission and fail on a read-only source tree */
$fp = fopen(realpath(__DIR__ . '/data/JISX0201.txt'), 'r');
/* Compare against false explicitly, so only EOF/error ends the loop */
while (($line = fgets($fp, 256)) !== false) {
    /* Skip comment lines */
    if ($line[0] === '#') {
        continue;
    }

    if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) === 2) {
        $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
    }
}
fclose($fp);
|
|
|
|
/**
 * Check that $from is identified as valid text in $encoding and that it
 * converts to the UTF-32BE string $to.
 *
 * Unless $bothWays is false, also check the reverse conversion from $to back
 * to a normalized form of $from.
 */
function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-32BE', false);

    if (!$bothWays) {
        return;
    }

    /* ASCII mode is the default, so a leading ESC ( B is redundant and is
     * dropped from the expected result */
    if (strncmp($from, "\x1B(B", 3) === 0) {
        $from = substr($from, 3);
    }

    /* A string which switches to a different charset is expected to switch
     * back to ASCII at the end */
    foreach (["\x1B\$B", "\x1B(I", "\x1B\$@", "\x1B\$(B"] as $escape) {
        if (strpos($from, $escape) !== false) {
            $from .= "\x1B(B";
            break;
        }
    }

    convertValidString($to, $from, 'UTF-32BE', $encoding, false);
}
|
|
|
|
/**
 * Shorthand for testInvalidString() which always uses UTF-32BE as the
 * encoding to convert to.
 */
function testInvalid($from, $to, $encoding) {
    $targetEncoding = 'UTF-32BE';
    testInvalidString($from, $to, $encoding, $targetEncoding);
}
|
|
|
|
/* Every 7-bit byte other than ESC is valid, both in the default mode and
 * after an explicit ESC ( B or ESC ( J escape sequence */
for ($i = 0; $i < 0x80; $i++) {
    if ($i != 0x1B) {
        $ch = chr($i);
        $expected = "\x00\x00\x00" . $ch;
        testValid($ch, $expected, 'ISO-2022-JP-KDDI');
        testValid("\x1B(B" . $ch, $expected, 'ISO-2022-JP-KDDI', false);
        testValid("\x1B(J" . $ch, $expected, 'ISO-2022-JP-KDDI', false);
    }
}

/* Bytes with the high bit set are invalid in those same modes... */
for ($i = 0x80; $i < 256; $i++) {
    /* ...except 0xA1-0xDF, since we convert those single bytes as JIS X 0201 kana */
    $isKana = ($i >= 0xA1 && $i <= 0xDF);
    if (!$isKana) {
        foreach (['', "\x1B(B", "\x1B(J"] as $escapePrefix) {
            testInvalid($escapePrefix . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
        }
    }
}

echo "ASCII support OK\n";
|
|
|
|
/* All valid JIS X 0201 characters
 * Only those with a 1 in the high bit (the JIS X 0201 kana) are tested here */
foreach ($jisx0201Chars as $jisx0201 => $utf32BE) {
    if (ord($jisx0201) < 128) {
        continue;
    }

    /* After ESC ( I, the kana is written with the high bit cleared */
    $kana = chr(ord($jisx0201) - 128);
    testValid("\x1B(I" . $kana, $utf32BE, 'ISO-2022-JP-KDDI', false);
    /* The raw 8-bit byte is accepted as well */
    testValid($jisx0201, $utf32BE, 'ISO-2022-JP-KDDI', false);
}

/* 8-bit bytes outside of the kana range 0xA1-0xDF are rejected after
 * either ESC ( I or ESC ( J */
for ($i = 0x80; $i < 256; $i++) {
    if ($i < 0xA1 || $i > 0xDF) {
        foreach (["\x1B(I", "\x1B(J"] as $escapePrefix) {
            testInvalid($escapePrefix . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
        }
    }
}

echo "JIS X 0201 support OK\n";
|
|
|
|
/* Start from all the 2-byte characters which were loaded above */
$validChars = $cp932Chars;

/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
foreach (range(0, 0x7F) as $i) {
    $validChars[chr($i)] = chr($i);
}
foreach (range(0xA1, 0xDF) as $i) {
    $validChars[chr($i)] = $jisx0201Chars[chr($i)];
}

/* Table passed to findInvalidChars(): bytes 0xE0-0xFC and 0x81-0x9F each
 * map to the value 2 (presumably the sequence length for that first byte) */
$lenTable = array_fill_keys(array_merge(range(0xE0, 0xFC), range(0x81, 0x9F)), 2);

findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);
|
|
|
|
/* Characters with duplicate mappings cannot be tested in both directions,
 * so take them out of the main table; they are tested separately */
$cp932Chars = array_diff_key($cp932Chars, $nonInvertible);

/* Test all remaining characters, in random order, in strings of up to
 * 5-10 characters each */
$good = array_keys($cp932Chars);
shuffle($good);
while (count($good) > 0) {
    $remainingInString = min(rand(5,10), count($good));
    $from = '';
    $to = '';
    for (; $remainingInString > 0; $remainingInString--) {
        $goodChar = array_pop($good);
        $from .= $goodChar;
        $to .= $cp932Chars[$goodChar];
    }
    testValid("\x1B\$B" . $from, $to, 'ISO-2022-JP-KDDI');
}
|
|
|
|
/* Now the characters with duplicate mappings, again in random order and in
 * strings of up to 5-10 characters; these are only converted in one
 * direction (to UTF-32BE) */
$good = array_keys($nonInvertible);
shuffle($good);
while (count($good) > 0) {
    $remainingInString = min(rand(5,10), count($good));
    $from = '';
    $to = '';
    for (; $remainingInString > 0; $remainingInString--) {
        $goodChar = array_pop($good);
        $from .= $goodChar;
        $to .= $nonInvertible[$goodChar];
    }
    testValid("\x1B\$B" . $from, $to, 'ISO-2022-JP-KDDI', false);
}
|
|
|
|
foreach (array_keys($invalidChars) as $invalid) {
    $firstByte = ord($invalid[0]);
    $firstByteRejected = ($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0;

    /* When the first byte of a 2-byte character is rejected, it results in %
     * being sent to the output, and then the second byte will do something
     * else. It is easier to write the test if we only check with the 1st
     * byte in that case. */
    $input = $firstByteRejected ? $invalid[0] : $invalid;
    testInvalidString("\x1B\$B" . $input, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
}

/* Truncated characters should likewise produce a single error marker */
foreach (array_keys($truncatedChars) as $truncated) {
    testInvalidString("\x1B\$B" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
}

echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";
|
|
|
|
/* Additional one-way mappings from Unicode (given as UTF-16BE) to
 * JIS X 0208 characters: U+00A5, U+203E and U+FF5E */
$extraMappings = [
    "\x00\xA5" => "!o",
    "\x20\x3E" => "!1",
    "\xFF\x5E" => "!A",
];
foreach ($extraMappings as $utf16 => $jis) {
    /* The expected output switches to JIS X 0208, then back to ASCII */
    testValidString($utf16, "\x1B\$B" . $jis . "\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);
}

echo "Other mappings from Unicode -> ISO-2022-JP-KDDI OK\n";
|
|
|
|
// Test "long" illegal character markers
mb_substitute_character("long");

// Each of these erroneous inputs is expected to come out as a single '%'
$erroneousInputs = [
    "\xE0",
    // Invalid escapes:
    "\x1B",
    "\x1B.",
    "\x1B\$",
    "\x1B\$.",
    "\x1B\$(X",
    // 0x9F does not start any 2-byte character
    "\x1B\$B\x9F",
];
foreach ($erroneousInputs as $erroneous) {
    convertInvalidString($erroneous, "%", "ISO-2022-JP-KDDI", "UTF-8");
}

// A valid Unicode codepoint which ISO-2022-JP-KDDI cannot represent is
// expected to come out as a "U+XXXX" marker
convertInvalidString("\xE0\x00", "U+E000", "UTF-16BE", "ISO-2022-JP-KDDI");

echo "Done!\n";
|
|
?>
|
|
--EXPECT--
|
|
ASCII support OK
|
|
JIS X 0201 support OK
|
|
JIS X 0208 (with MS extensions) and KDDI emoji support OK
|
|
Other mappings from Unicode -> ISO-2022-JP-KDDI OK
|
|
Done!
|