Bug #81390: mb_detect_encoding should not prematurely stop processing input

As a performance optimization, mb_detect_encoding tries to stop
processing the input string early once only one 'candidate' encoding
remains in which the input string is still valid. However, the code
that keeps count of how many candidate encodings have already been
rejected was buggy, which caused mb_detect_encoding to stop processing
the input prematurely when it should have continued.

As a result, in the test case provided by Alec, it did not notice that
the input string was not valid in UTF-16.
This commit is contained in:
Alex Dowad 2021-09-06 21:47:30 +02:00
parent ca33ab59ad
commit c25a1ef8d0
2 changed files with 13 additions and 4 deletions

View File

@ -352,9 +352,10 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str
while (n--) {
for (int i = 0; i < num; i++) {
mbfl_convert_filter *filter = identd->filter_list[i];
if (!filter->num_illegalchar) {
mbfl_encoding_detector_data *data = &identd->filter_data[i];
if (!data->num_illegalchars) {
(*filter->filter_function)(*p, filter);
if (identd->filter_data[i].num_illegalchars) {
if (data->num_illegalchars) {
bad++;
}
}

View File

@ -34,8 +34,6 @@ $s = $euc_jp;
$s = mb_detect_encoding($s, 'JIS,EUC-JP');
print("EUC-JP: $s\n");
// Using Encoding List Array
echo "== ARRAY ENCODING LIST ==\n";
@ -53,6 +51,15 @@ $s = $sjis;
$s = mb_detect_encoding($s, $a);
print("SJIS: $s\n");
$test = "CHARSET=windows-1252:Do\xeb;John";
$encodings = ['UTF-8', 'SJIS', 'GB2312',
'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R', 'BIG-5',
'ISO-2022-KR', 'ISO-2022-JP', 'UTF-16'
];
echo mb_detect_encoding($test, $encodings), "\n";
// Using Detect Order
echo "== DETECT ORDER ==\n";
@ -100,6 +107,7 @@ EUC-JP: EUC-JP
JIS: JIS
EUC-JP: EUC-JP
SJIS: SJIS
ISO-8859-1
== DETECT ORDER ==
JIS: JIS
EUC-JP: EUC-JP