php-src/ext/mbstring/tests/cp51932_encoding.phpt

--TEST--
Exhaustive test of CP51932 encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
/* Bail out early when the environment asks for slow tests to be skipped */
if (getenv("SKIP_SLOW_TESTS")) {
  die("skip slow test");
}
?>
--FILE--
<?php
srand(2020); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in CP51932 */
$validChars = array();  /* CP51932 string -> UTF-16BE string */
$fromUnicode = array(); /* UTF-16BE string -> CP51932 string */

/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * demand write permission and fail on a read-only source tree */
$tablePath = __DIR__ . '/data/CP51932.txt';
$fp = fopen($tablePath, 'r');
if ($fp === false)
  die("Could not open mapping table: $tablePath\n");

/* Compare against false explicitly, so that a line which happens to be falsy
 * (e.g. a final "0" with no trailing newline) cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
  /* Lines starting with '#' are comments */
  if ($line[0] === '#')
    continue;

  /* sscanf() leaves $byte2 untouched for single-byte entries, so it must be
   * reset on every iteration */
  $byte2 = null;
  if (sscanf($line, '<U%x> \x%x\x%x', $codepoint, $byte1, $byte2) >= 2) {
    /* The table we are using tries to map as many Unicode codepoints into
     * CP51932 as possible, including by mapping latin characters with accents
     * to the equivalent without accents; but since CP51932 is based on the
     * CP932 character set, we don't need to handle codepoints which are not
     * mapped from any character in CP932 */
    if (($codepoint >= 0xC0 && $codepoint <= 0xD6) ||
        ($codepoint >= 0xD8 && $codepoint <= 0xF6) ||
        ($codepoint >= 0xF8 && $codepoint <= 0xFF))
      continue;
    $cp51932 = ($byte2 ? (chr($byte1) . chr($byte2)) : chr($byte1));
    $utf16 = pack('n', $codepoint);
    /* Later table entries overwrite earlier ones in both directions */
    $validChars[$cp51932] = $utf16;
    $fromUnicode[$utf16] = $cp51932;
  }
}
fclose($fp);

/* Alternate Unicode codepoints which are accepted when converting *to*
 * CP51932, although the same CP51932 character decodes to something else:
 *  - JIS X 0208 FULLWIDTH TILDE decodes to U+FF5E (FULLWIDTH TILDE)
 *  - JIS X 0208 MINUS SIGN decodes to U+FF0D (FULLWIDTH HYPHEN-MINUS)
 *  - JIS X 0208 PARALLEL TO decodes to U+2225 (PARALLEL TO) */
$acceptedAliases = array(
  "\x30\x1C" => "\xA1\xC1", // U+301C WAVE DASH
  "\x22\x12" => "\xA1\xDD", // U+2212 MINUS SIGN
  "\x20\x16" => "\xA1\xC2", // U+2016 DOUBLE VERTICAL LINE
);
foreach ($acceptedAliases as $codepointBytes => $encodedBytes)
  $fromUnicode[$codepointBytes] = $encodedBytes;

/* The CP51932 table contains a number of duplicate, irreversible mappings.
 * Usually the one which we primarily use comes last in the table and so wins
 * when the table is loaded; where it comes first instead, it gets overwritten
 * and has to be restored here.
 *
 * These "collisions" occur in both directions, partly because the table tries
 * to map as many Unicode codepoints as possible to CP932 characters */
$preferredMappings = array(
  "\x22\x20" => "\xA2\xDC",
  "\x22\x29" => "\xA2\xC1",
  "\x22\x2B" => "\xA2\xE9",
  "\x22\x35" => "\xA2\xE8",
  "\x22\x1A" => "\xA2\xE5",
  "\x22\x2A" => "\xA2\xC0",
  "\x22\x61" => "\xA2\xE1",
  "\x22\xA5" => "\xA2\xDD",
  "\x22\x52" => "\xA2\xE2",
  "\xFF\xE2" => "\xA2\xCC",
);
foreach ($preferredMappings as $codepointBytes => $encodedBytes)
  $fromUnicode[$codepointBytes] = $encodedBytes;

/* Table entries for these codepoints are discarded */
$discardedCodepoints = array(
  "\x00\xA1", // upside-down ! (would become ordinary !)
  "\x00\xA6", // broken bar (would become ordinary pipe character)
  "\x00\xA9", // © (would become c)
  "\x00\xAA", // feminine ordinal indicator
  "\x00\xAB", // left double angled quote mark (would become "much less than")
  "\x00\xAD", // soft hyphen (would become ordinary hyphen)
  "\x00\xAE", // ® (would become R)
  "\x00\xAF", // macron; table entry dropped, explicit mapping is added below
  "\x00\xB2", // ² (would become ordinary 2)
  "\x00\xB3", // ³ (would become ordinary 3)
  "\x00\xB5", // micro sign (would become Greek mu)
  "\x00\xB7", // middle dot (would become katakana middle dot)
  "\x00\xB8", // cedilla (would become fullwidth comma)
  "\x00\xB9", // ¹ (would become ordinary 1)
  "\x00\xBA", // masculine ordinal indicator
  "\x00\xBB", // right double angled quote mark (would become "much greater than")
  "\x30\x94", // hiragana vu (would become katakana vu)
);
foreach ($discardedCodepoints as $codepointBytes)
  unset($fromUnicode[$codepointBytes]);

/* Bytes 0x00-0x7F each decode to the codepoint with the same value */
foreach (range(0x00, 0x7F) as $asciiByte)
  $validChars[chr($asciiByte)] = "\x00" . chr($asciiByte);

/* Codepoints which are converted to a fullwidth CP51932 character */
$fullwidthSubstitutes = array(
  "\x00\xA5" => "\xA1\xEF", // U+00A5 YEN SIGN -> FULLWIDTH YEN SIGN
  "\x20\x3E" => "\xA1\xB1", // U+203E OVERLINE -> FULLWIDTH MACRON
  "\x00\xAF" => "\xA1\xB1", // U+00AF MACRON   -> FULLWIDTH MACRON
);
foreach ($fullwidthSubstitutes as $codepointBytes => $encodedBytes)
  $fromUnicode[$codepointBytes] = $encodedBytes;

/* Round-trip every known character in both directions */
testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
echo "CP51932 verification and conversion works on all valid characters\n";

/* NOTE(review): presumably maps a leading byte to the expected length of the
 * character it starts; confirm against findInvalidChars in encoding_tests.inc */
$cp51932ByteHints = array_fill_keys(range(0xA9, 0xAF), 2)
                  + array_fill_keys(range(0xF5, 0xF8), 2)
                  + array(0xFD => 2, 0xFE => 2);
findInvalidChars($validChars, $invalidChars, $truncated, $cp51932ByteHints);

testAllInvalidChars($invalidChars, $validChars, 'CP51932', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'CP51932', 'UTF-16BE', "\x00%");
echo "CP51932 verification and conversion works on all invalid characters\n";

$utf16ByteHints = array_fill_keys(range(0, 0xFF), 2);
findInvalidChars($fromUnicode, $invalidCodepoints, $unused, $utf16ByteHints);
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";

/* Switch to "long" illegal character markers and convert two byte sequences
 * which are not valid CP51932 */
mb_substitute_character("long");
foreach (array("\x80", "\xFE\xFF") as $invalidInput)
  convertInvalidString($invalidInput, "%", "CP51932", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
CP51932 verification and conversion works on all valid characters
CP51932 verification and conversion works on all invalid characters
Unicode -> CP51932 conversion works on all invalid codepoints
Done!
Enhance handling of CP51932 encoding - Don't pass 'control' characters through in the middle of a multi-byte char - Treat truncated multi-byte characters as an error 2020-10-18 12:27:21 +00:00			`--TEST--`
			`Exhaustive test of CP51932 encoding verification and conversion`
Migrate more SKIPIF -> EXTENSIONS (#7139) This is a mix of more automated and manual migration. It should remove all applicable extension_loaded() checks outside of skipif.inc files. 2021-06-11 10:58:44 +00:00			`--EXTENSIONS--`
			`mbstring`
Mark CP932 and CP51932 encoding tests as 'slow tests' 2021-06-02 19:30:16 +00:00			`--SKIPIF--`
			`<?php`
			`if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");`
			`?>`
Enhance handling of CP51932 encoding - Don't pass 'control' characters through in the middle of a multi-byte char - Treat truncated multi-byte characters as an error 2020-10-18 12:27:21 +00:00			`--FILE--`
			`<?php`
			`srand(2020); /* Make results consistent */`
			`include('encoding_tests.inc');`
			`mb_substitute_character(0x25); // '%'`

			`/* Read in the table of all characters in CP51932 */`
			`$validChars = array(); /* CP51932 string -> UTF-16BE string */`
			`$fromUnicode = array();`

			`$fp = fopen(realpath(__DIR__ . '/data/CP51932.txt'), 'r+');`
			`while ($line = fgets($fp, 256)) {`
			`if ($line[0] == '#')`
			`continue;`

			`$byte2 = null;`
			`if (sscanf($line, '<U%x> \x%x\x%x', $codepoint, $byte1, $byte2) >= 2) {`
			`/* The table we are using tries to map as many Unicode codepoints into`
			`* CP51932 as possible, including by mapping latin characters with accents`
			`* to the equivalent without accents; but since CP51932 is based on the`
			`* CP932 character set, we don't need to handle codepoints which are not`
			`* mapped from any character in CP932 */`
			`if (($codepoint >= 0xC0 && $codepoint <= 0xD6) \|\|`
			`($codepoint >= 0xD8 && $codepoint <= 0xF6) \|\|`
			`($codepoint >= 0xF8 && $codepoint <= 0xFF))`
			`continue;`
			`$cp51932 = ($byte2 ? (chr($byte1) . chr($byte2)) : chr($byte1));`
			`$utf16 = pack('n', $codepoint);`
			`$validChars[$cp51932] = $utf16;`
			`$fromUnicode[$utf16] = $cp51932;`
			`}`
			`}`

			`/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE)`
			`* But when converting Unicode to CP51932, we also accept U+301C (WAVE DASH) */`
			`$fromUnicode["\x30\x1C"] = "\xA1\xC1";`
			`/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS SIGN),`
			`* but when converting Unicode to CP51932, we also accept U+2212 (MINUS SIGN) */`
			`$fromUnicode["\x22\x12"] = "\xA1\xDD";`
			`/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),`
			`* but when converting Unicode to CP51932, we also accept U+2016`
			`* (DOUBLE VERTICAL LINE) */`
			`$fromUnicode["\x20\x16"] = "\xA1\xC2";`

			`/* There are a number of duplicate, irreversible mappings in the CP51932 table`
			`* In most cases, the one which we primarily use appears last in the table,`
			`* but in some cases, it is first and will be overwritten in the above loop`
			`*`
			`* Interestingly, the "collisions" happen in both directions! Part of this is`
			`* because the table we are using attempts to map as many Unicode codepoints`
			`* as possible to CP932 characters */`
			`$fromUnicode["\x22\x20"] = "\xA2\xDC";`
			`$fromUnicode["\x22\x29"] = "\xA2\xC1";`
			`$fromUnicode["\x22\x2B"] = "\xA2\xE9";`
			`$fromUnicode["\x22\x35"] = "\xA2\xE8";`
			`$fromUnicode["\x22\x1A"] = "\xA2\xE5";`
			`$fromUnicode["\x22\x2A"] = "\xA2\xC0";`
			`$fromUnicode["\x22\x61"] = "\xA2\xE1";`
			`$fromUnicode["\x22\xA5"] = "\xA2\xDD";`
			`$fromUnicode["\x22\x52"] = "\xA2\xE2";`
			`$fromUnicode["\xFF\xE2"] = "\xA2\xCC";`
			`unset($fromUnicode["\x00\xA1"]); // Don't map upside-down ! to ordinary !`
			`unset($fromUnicode["\x00\xA6"]); // Don't map broken bar to ordinary pipe character`
			`unset($fromUnicode["\x00\xA9"]); // Don't map © to c`
			`unset($fromUnicode["\x00\xAA"]); // Don't map feminine ordinal indicator`
			`unset($fromUnicode["\x00\xAB"]); // Don't map left double angled quote mark to "much less than"`
			`unset($fromUnicode["\x00\xAD"]); // Don't map soft hyphen to ordinary hyphen`
			`unset($fromUnicode["\x00\xAE"]); // Don't map ® to R`
			`unset($fromUnicode["\x00\xAF"]); // Don't map Unicode halfwidth macron to CP932 fullwidth macron`
			`unset($fromUnicode["\x00\xB2"]); // Don't map ² to ordinary 2`
			`unset($fromUnicode["\x00\xB3"]); // Don't map ³ to ordinary 3`
			`unset($fromUnicode["\x00\xB5"]); // Don't map micro sign to Greek mu`
			`unset($fromUnicode["\x00\xB7"]); // Don't map middle dot to katakana middle dot`
			`unset($fromUnicode["\x00\xB8"]); // Don't map cedilla to fullwidth comma`
			`unset($fromUnicode["\x00\xB9"]); // Don't map ¹ to ordinary 1`
			`unset($fromUnicode["\x00\xBA"]); // Don't map "masculine ordinal indicator"`
			`unset($fromUnicode["\x00\xBB"]); // Don't map right double angled quote mark to "much greater than"`
			`unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu`

			`for ($i = 0; $i <= 0x7F; $i++)`
			`$validChars[chr($i)] = "\x00" . chr($i);`

0x5C is not a Yen sign in CP932 (or CP51932) When Microsoft created CP932 (their version of Shift-JIS), they explicitly used bytes 0-0x7F to represent ASCII characters rather than JIS X 0201 characters. So when converting Unicode to CP932, it is not correct to convert U+00A5 to CP932 0x5C. Fortunately, CP932 does have a multi-byte FULLWIDTH YEN SIGN character which we can use instead. CP51932 uses the same extended character set as CP932; while CP932 is Microsoft's extended version of Shift-JIS, CP51932 is their extended version of EUC-JP. So the same reasoning applies to CP51932. 2020-11-14 19:15:11 +00:00			`/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */`
			`$fromUnicode["\x00\xA5"] = "\xA1\xEF";`
Convert U+203E (OVERLINE) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants Converting U+203E to 0x7E was especially wrong for CP932, where 0x7E represents a tilde. For vanilla Shift-JIS and Shift-JIS-2004, converting to 0x7E is acceptable, since 0x7E does represent an overline/macron in those encodings. Follow the same principle in CP51932, which is closely related to CP932. 2020-11-14 21:03:03 +00:00			`/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */`
			`$fromUnicode["\x20\x3E"] = "\xA1\xB1";`
Convert U+00AF (MACRON) to 0x8150 (FULLWIDTH MACRON) in some SJIS variants Except for vanilla Shift-JIS, where 0x7E is a halfwidth overline/macron. As for Shift-JIS-2004, it has an added character (byte sequence 0x854A) which was defined as a halfwidth macron in JIS X 0213:2000, so we use that. 2020-11-14 21:43:28 +00:00			`/* U+00AF is MACRON; convert to FULLWIDTH MACRON */`
			`$fromUnicode["\x00\xAF"] = "\xA1\xB1";`
0x5C is not a Yen sign in CP932 (or CP51932) When Microsoft created CP932 (their version of Shift-JIS), they explicitly used bytes 0-0x7F to represent ASCII characters rather than JIS X 0201 characters. So when converting Unicode to CP932, it is not correct to convert U+00A5 to CP932 0x5C. Fortunately, CP932 does have a multi-byte FULLWIDTH YEN SIGN character which we can use instead. CP51932 uses the same extended character set as CP932; while CP932 is Microsoft's extended version of Shift-JIS, CP51932 is their extended version of EUC-JP. So the same reasoning applies to CP51932. 2020-11-14 19:15:11 +00:00
Enhance handling of CP51932 encoding - Don't pass 'control' characters through in the middle of a multi-byte char - Treat truncated multi-byte characters as an error 2020-10-18 12:27:21 +00:00			`testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);`
			`testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);`
			`echo "CP51932 verification and conversion works on all valid characters\n";`

			`findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA9, 0xAF), 2) + array_fill_keys(range(0xF5, 0xF8), 2) + array(0xFD => 2, 0xFE => 2));`

			`testAllInvalidChars($invalidChars, $validChars, 'CP51932', 'UTF-16BE', "\x00%");`
			`testTruncatedChars($truncated, 'CP51932', 'UTF-16BE', "\x00%");`
			`echo "CP51932 verification and conversion works on all invalid characters\n";`

			`findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));`
			`convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');`
			`echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";`
Test behavior of 'long' illegal character markers After mb_substitute_character("long"), mbstring will respond to erroneous input by inserting 'long' error markers into the output. Depending on the situation, these error markers will either look like BAD+XXXX (for general bad input), U+XXXX (when the input is OK, but it converts to Unicode codepoints which cannot be represented in the output encoding), or an encoding-specific marker like JISX+XXXX or W932+XXXX. We have almost no tests for this feature. Add a bunch of tests to ensure that all our legacy encoding handlers work in a reasonable way when 'long' error markers are enabled. 2021-07-27 11:21:48 +00:00
			`// Test "long" illegal character markers`
			`mb_substitute_character("long");`
mbstring no longer provides 'long' substitutions for erroneous input bytes Previously, mbstring had a special mode whereby it would convert erroneous input byte sequences to output like "BAD+XXXX", where "XXXX" would be the erroneous bytes expressed in hexadecimal. This mode could be enabled by calling `mb_substitute_character("long")`. However, accurately reproducing input byte sequences from the cached state of a conversion filter is often tricky, and this significantly complicates the implementation. Further, the means used for passing the erroneous bytes through to where the "BAD+XXXX" text is generated only allows for up to 3 bytes to be passed, meaning that some erroneous byte sequences are truncated anyway. More to the point, a search of publicly available PHP code indicates that nobody is really using this feature anyway. Incidentally, this feature also provided error output like "JIS+XXXX" if the input 'should have' represented a JISX 0208 codepoint, but it decodes to a codepoint which does not exist in the JISX 0208 charset. Similarly, specific error output was provided for non-existent JISX 0212 codepoints, and likewise for JISX 0213, CP932, and a few other charsets. All of that is now consigned to the flames. However, "long" error markers also include a somewhat more useful "U+XXXX" marker for Unicode codepoints which were successfully decoded from the input text, but cannot be represented in the output encoding. Those are still supported. With this change, there is no need to use a variety of special values in the high bits of a wchar to represent different types of error values. We can (and will) just use a single error value. This will be equal to -1. One complicating factor: Text conversion functions return an integer to indicate whether the conversion operation should be immediately aborted, and the magic 'abort' marker is -1. Also, almost all of these functions would return the received byte/codepoint to indicate success. 
That doesn't work with the new error value; if an input filter detects an error and passes -1 to the output filter, and the output filter returns it back, that would be taken to mean 'abort'. Therefore, amend all these functions to return 0 for success. 2021-08-30 17:35:05 +00:00			`convertInvalidString("\x80", "%", "CP51932", "UTF-8");`
			`convertInvalidString("\xFE\xFF", "%", "CP51932", "UTF-8");`
Test behavior of 'long' illegal character markers After mb_substitute_character("long"), mbstring will respond to erroneous input by inserting 'long' error markers into the output. Depending on the situation, these error markers will either look like BAD+XXXX (for general bad input), U+XXXX (when the input is OK, but it converts to Unicode codepoints which cannot be represented in the output encoding), or an encoding-specific marker like JISX+XXXX or W932+XXXX. We have almost no tests for this feature. Add a bunch of tests to ensure that all our legacy encoding handlers work in a reasonable way when 'long' error markers are enabled. 2021-07-27 11:21:48 +00:00
			`echo "Done!\n";`
Enhance handling of CP51932 encoding - Don't pass 'control' characters through in the middle of a multi-byte char - Treat truncated multi-byte characters as an error 2020-10-18 12:27:21 +00:00			`?>`
			`--EXPECT--`
			`CP51932 verification and conversion works on all valid characters`
			`CP51932 verification and conversion works on all invalid characters`
			`Unicode -> CP51932 conversion works on all invalid codepoints`
Test behavior of 'long' illegal character markers After mb_substitute_character("long"), mbstring will respond to erroneous input by inserting 'long' error markers into the output. Depending on the situation, these error markers will either look like BAD+XXXX (for general bad input), U+XXXX (when the input is OK, but it converts to Unicode codepoints which cannot be represented in the output encoding), or an encoding-specific marker like JISX+XXXX or W932+XXXX. We have almost no tests for this feature. Add a bunch of tests to ensure that all our legacy encoding handlers work in a reasonable way when 'long' error markers are enabled. 2021-07-27 11:21:48 +00:00			`Done!`