Merge branch 'PHP-8.2'

* PHP-8.2:
  Support Microsoft's "Best Fit" mappings for Windows-1252 text encoding
This commit is contained in:
Alex Dowad 2022-12-09 15:41:07 +02:00
commit 14110bff7f
3 changed files with 11 additions and 16 deletions

View File

@@ -485,10 +485,10 @@ DEF_SB_TBL(cp1251, "Windows-1251", "Windows-1251", cp1251_aliases, 0x80, cp1251_
static const char *cp1252_aliases[] = {"cp1252", NULL}; static const char *cp1252_aliases[] = {"cp1252", NULL};
static const unsigned short cp1252_ucs_table[] = { static const unsigned short cp1252_ucs_table[] = {
0x20AC, 0x0000, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x0000, 0x017D, 0x0000, 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
0x0000, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x0000, 0x017E, 0x0178 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
}; };
DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases); DEF_SB(cp1252, "Windows-1252", "Windows-1252", cp1252_aliases);
@@ -504,7 +504,7 @@ static int mbfl_filt_conv_wchar_cp1252(int c, mbfl_convert_filter *filter)
} }
} }
CK(mbfl_filt_conv_illegal_output(c, filter)); CK(mbfl_filt_conv_illegal_output(c, filter));
} else if (c <= 0x7F || c >= 0xA0) { } else if (c <= 0x7F || c >= 0xA0 || c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D) {
CK((*filter->output_function)(c, filter->data)); CK((*filter->output_function)(c, filter->data));
} else { } else {
CK(mbfl_filt_conv_illegal_output(c, filter)); CK(mbfl_filt_conv_illegal_output(c, filter));
@@ -562,7 +562,7 @@ static void mb_wchar_to_cp1252(uint32_t *in, size_t len, mb_convert_buf *buf, bo
} }
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp1252); MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp1252);
MB_CONVERT_BUF_ENSURE(buf, out, limit, len); MB_CONVERT_BUF_ENSURE(buf, out, limit, len);
} else if (w <= 0x7F || w >= 0xA0) { } else if (w <= 0x7F || w >= 0xA0 || w == 0x81 || w == 0x8D || w == 0x8F || w == 0x90 || w == 0x9D) {
out = mb_convert_buf_add(out, w); out = mb_convert_buf_add(out, w);
} else { } else {
MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp1252); MB_CONVERT_ERROR(buf, out, limit, w, mb_wchar_to_cp1252);

View File

@@ -11,11 +11,6 @@ if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
include('encoding_tests.inc'); include('encoding_tests.inc');
testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252'); testEncodingFromUTF16ConversionTable(__DIR__ . '/data/CP1252.txt', 'CP1252');
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81", "%", "CP1252", "UTF-8");
convertInvalidString("\x9D", "%", "CP1252", "UTF-8");
// Test replacement character which cannot be encoded in CP1252 // Test replacement character which cannot be encoded in CP1252
mb_substitute_character(0x1234); mb_substitute_character(0x1234);
convertInvalidString("\x23\x45", '?', 'UTF-16BE', 'CP1252'); convertInvalidString("\x23\x45", '?', 'UTF-16BE', 'CP1252');

View File

@@ -145,7 +145,7 @@
0x7E 0x007E #TILDE 0x7E 0x007E #TILDE
0x7F 0x007F #DELETE 0x7F 0x007F #DELETE
0x80 0x20AC #EURO SIGN 0x80 0x20AC #EURO SIGN
0x81 #UNDEFINED 0x81 0x0081 #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
0x82 0x201A #SINGLE LOW-9 QUOTATION MARK 0x82 0x201A #SINGLE LOW-9 QUOTATION MARK
0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK 0x83 0x0192 #LATIN SMALL LETTER F WITH HOOK
0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK 0x84 0x201E #DOUBLE LOW-9 QUOTATION MARK
@@ -157,10 +157,10 @@
0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON 0x8A 0x0160 #LATIN CAPITAL LETTER S WITH CARON
0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK 0x8B 0x2039 #SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x8C 0x0152 #LATIN CAPITAL LIGATURE OE 0x8C 0x0152 #LATIN CAPITAL LIGATURE OE
0x8D #UNDEFINED 0x8D 0x008D #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON 0x8E 0x017D #LATIN CAPITAL LETTER Z WITH CARON
0x8F #UNDEFINED 0x8F 0x008F #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
0x90 #UNDEFINED 0x90 0x0090 #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
0x91 0x2018 #LEFT SINGLE QUOTATION MARK 0x91 0x2018 #LEFT SINGLE QUOTATION MARK
0x92 0x2019 #RIGHT SINGLE QUOTATION MARK 0x92 0x2019 #RIGHT SINGLE QUOTATION MARK
0x93 0x201C #LEFT DOUBLE QUOTATION MARK 0x93 0x201C #LEFT DOUBLE QUOTATION MARK
@@ -173,7 +173,7 @@
0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON 0x9A 0x0161 #LATIN SMALL LETTER S WITH CARON
0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK 0x9B 0x203A #SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x9C 0x0153 #LATIN SMALL LIGATURE OE 0x9C 0x0153 #LATIN SMALL LIGATURE OE
0x9D #UNDEFINED 0x9D 0x009D #*** MODIFIED TO FOLLOW WINDOWS "BEST FIT" MAPPINGS
0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON 0x9E 0x017E #LATIN SMALL LETTER Z WITH CARON
0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS 0x9F 0x0178 #LATIN CAPITAL LETTER Y WITH DIAERESIS
0xA0 0x00A0 #NO-BREAK SPACE 0xA0 0x00A0 #NO-BREAK SPACE