Use fast text conversion filters to implement mb_convert_variables

This commit is contained in:
Alex Dowad 2022-11-15 10:55:22 +02:00
parent adfdfb2e1e
commit b1954f5fd6
2 changed files with 79 additions and 63 deletions

View File

@ -3180,7 +3180,7 @@ next_option:
RETVAL_STR(jp_kana_convert(str, enc, opt));
}
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, int *recursion_error) /* {{{ */
static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zval *var, bool *recursion_error) /* {{{ */
{
mbfl_string string;
HashTable *ht;
@ -3196,7 +3196,7 @@ static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zv
} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
if (Z_REFCOUNTED_P(var)) {
if (Z_IS_RECURSIVE_P(var)) {
*recursion_error = 1;
*recursion_error = true;
return 0;
}
Z_PROTECT_RECURSION_P(var);
@ -3226,31 +3226,25 @@ static int mb_recursive_encoder_detector_feed(mbfl_encoding_detector *identd, zv
return 0;
} /* }}} */
static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var) /* {{{ */
static bool mb_recursive_convert_variable(zval *var, const mbfl_encoding* from_encoding, const mbfl_encoding* to_encoding)
{
mbfl_string string, result, *ret;
HashTable *ht;
zval *entry, *orig_var;
orig_var = var;
ZVAL_DEREF(var);
if (Z_TYPE_P(var) == IS_STRING) {
string.val = (unsigned char *)Z_STRVAL_P(var);
string.len = Z_STRLEN_P(var);
ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
if (ret != NULL) {
zval_ptr_dtor(orig_var);
// TODO: avoid reallocation ???
ZVAL_STRINGL(orig_var, (char *)ret->val, ret->len);
efree(ret->val);
}
zend_string *ret = php_mb_convert_encoding_ex(Z_STRVAL_P(var), Z_STRLEN_P(var), to_encoding, from_encoding);
zval_ptr_dtor(orig_var);
ZVAL_STR(orig_var, ret);
} else if (Z_TYPE_P(var) == IS_ARRAY || Z_TYPE_P(var) == IS_OBJECT) {
if (Z_TYPE_P(var) == IS_ARRAY) {
SEPARATE_ARRAY(var);
}
if (Z_REFCOUNTED_P(var)) {
if (Z_IS_RECURSIVE_P(var)) {
return 1;
return true;
}
Z_PROTECT_RECURSION_P(var);
}
@ -3258,11 +3252,11 @@ static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var
ht = HASH_OF(var);
if (ht != NULL) {
ZEND_HASH_FOREACH_VAL_IND(ht, entry) {
if (mb_recursive_convert_variable(convd, entry)) {
if (mb_recursive_convert_variable(entry, from_encoding, to_encoding)) {
if (Z_REFCOUNTED_P(var)) {
Z_UNPROTECT_RECURSION_P(var);
}
return 1;
return true;
}
} ZEND_HASH_FOREACH_END();
}
@ -3271,8 +3265,9 @@ static int mb_recursive_convert_variable(mbfl_buffer_converter *convd, zval *var
Z_UNPROTECT_RECURSION_P(var);
}
}
return 0;
} /* }}} */
return false;
}
/* {{{ Converts the string resource in variables to desired encoding */
PHP_FUNCTION(mb_convert_variables)
@ -3281,14 +3276,12 @@ PHP_FUNCTION(mb_convert_variables)
zend_string *to_enc_str;
zend_string *from_enc_str;
HashTable *from_enc_ht;
mbfl_string string, result;
const mbfl_encoding *from_encoding, *to_encoding;
mbfl_encoding_detector *identd;
mbfl_buffer_converter *convd;
int n, argc;
size_t elistsz;
const mbfl_encoding **elist;
int recursion_error = 0;
bool recursion_error = false;
ZEND_PARSE_PARAMETERS_START(3, -1)
Z_PARAM_STR(to_enc_str)
@ -3302,10 +3295,7 @@ PHP_FUNCTION(mb_convert_variables)
RETURN_THROWS();
}
/* initialize string */
from_encoding = MBSTRG(current_internal_encoding);
mbfl_string_init_set(&string, from_encoding);
mbfl_string_init(&result);
/* pre-conversion encoding */
if (from_enc_ht) {
@ -3356,29 +3346,18 @@ PHP_FUNCTION(mb_convert_variables)
efree(ZEND_VOIDP(elist));
convd = mbfl_buffer_converter_new(from_encoding, to_encoding, 0);
/* If this assertion fails this means some memory allocation failure which is a bug */
ZEND_ASSERT(convd != NULL);
mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
/* convert */
n = 0;
while (n < argc) {
zval *zv = &args[n];
ZVAL_DEREF(zv);
recursion_error = mb_recursive_convert_variable(convd, zv);
recursion_error = mb_recursive_convert_variable(zv, from_encoding, to_encoding);
if (recursion_error) {
break;
}
n++;
}
MBSTRG(illegalchars) += mbfl_buffer_illegalchars(convd);
mbfl_buffer_converter_delete(convd);
if (recursion_error) {
php_error_docref(NULL, E_WARNING, "Cannot handle recursive references");
RETURN_FALSE;

View File

@ -17,19 +17,19 @@ $sjis = base64_decode('k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==');
// JIS string (BASE64 encoded)
$jis = base64_decode('GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==');
// EUC-JP string
$euc_jp = '日本語テキストです。01234。';
$euc_jp = mb_convert_encoding("日本語テキストです。01234", 'EUC-JP', 'UTF-8');
// Test for single scalar
echo "== SCALAR TEST ==\n";
$s = $sjis;
$encoding = mb_convert_variables('EUC-JP', 'SJIS', $s);
print("$encoding\n"); // SJIS
print("$s\n"); // Converted to EUC-JP
echo bin2hex($s), "\n"; // Converted to EUC-JP
$s = $jis;
$encoding = mb_convert_variables('EUC-JP', 'JIS', $s);
print("$encoding\n"); // JIS
print("$s\n"); // Converted to EUC-JP
echo bin2hex($s), "\n"; // Converted to EUC-JP
$s = $euc_jp;
$encoding = mb_convert_variables('SJIS', 'EUC-JP', $s);
@ -47,9 +47,7 @@ $s2 = $euc_jp;
$s3 = $euc_jp;
$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3);
print("$encoding\n"); // EUC-JP
print("$s1$s2$s3\n"); // Converted to EUC-JP
echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP
// Note: Mixing encoding in array/object is not supported?
// Test for array
@ -58,15 +56,13 @@ $a = array($s3, $s2, $s1);
$aa = $a;
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa);
print("$encoding\n"); // EUC-JP
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP
$a = array($s1, $s2, $s3);
$aa = $a;
$encoding = mb_convert_variables('EUC-JP', 'auto', $aa);
print("$encoding\n"); // EUC-JP
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP
// Test for object
echo "== OBJECT TEST ==\n";
@ -102,19 +98,17 @@ class bar
}
}
$o = new foo;
$oo = $o;
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo);
print("$encoding\n"); // EUC-JP
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP
$o = new bar;
$oo = $o;
$encoding = mb_convert_variables('EUC-JP', 'auto', $oo);
print("$encoding\n"); // EUC-JP
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP
// Test for scalar, array and object
echo "== SCALAR, ARRAY AND OBJECT TEST ==\n";
@ -127,36 +121,79 @@ $oo = $o;
$encoding = mb_convert_variables('EUC-JP', 'auto', $s1, $s2, $s3, $aa, $oo);
print("$encoding\n"); // EUC-JP
print("$s1$s2$s3\n"); // Converted to EUC-JP
print("{$aa[0]}{$aa[1]}{$aa[2]}\n"); // Converted to EUC-JP
print("{$oo->s1}{$oo->s2}{$oo->s3}\n"); // Converted to EUC-JP
echo bin2hex("$s1$s2$s3"), "\n"; // Converted to EUC-JP
echo bin2hex("{$aa[0]}{$aa[1]}{$aa[2]}"), "\n"; // Converted to EUC-JP
echo bin2hex("{$oo->s1}{$oo->s2}{$oo->s3}"), "\n"; // Converted to EUC-JP
echo "== DEEPLY NESTED OBJECT/ARRAY TEST ==\n";
class Nested
{
public $inner;
function __construct($value)
{
$this->inner = $value;
}
}
$deeplyNested = array(new Nested(array(new Nested(array(new Nested("BLAH"))))));
$encoding = mb_convert_variables('UTF-16LE', 'UTF-8', $deeplyNested);
echo $encoding, "\n";
echo bin2hex($deeplyNested[0]->inner[0]->inner[0]->inner), "\n";
echo "== INVALID STRING ENCODING TEST ==\n";
// Make sure both that the correct invalid encoding marker is used,
// and that the count of illegal characters is incremented
$illegalCount = mb_get_info('illegal_chars');
$nested = array(new Nested("\xFF"));
mb_substitute_character(0x25);
mb_convert_variables('UTF-16LE', 'UTF-8', $nested);
echo bin2hex($nested[0]->inner), "\n";
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n";
$illegalCount = mb_get_info('illegal_chars');
$nested = array(new Nested("\xFF"));
mb_substitute_character(0x26);
mb_convert_variables('UTF-16LE', 'UTF-8', $nested);
echo bin2hex($nested[0]->inner), "\n";
echo "# of illegal characters detected: ", mb_get_info('illegal_chars') - $illegalCount, "\n";
?>
--EXPECT--
== SCALAR TEST ==
SJIS
日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
JIS
日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
EUC-JP
k/qWe4zqg2WDTINYg2eCxYK3gUIwMTIzNIJUglWCVoJXgliBQg==
EUC-JP
GyRCRnxLXDhsJUYlLSU5JUgkRyQ5ISMbKEIwMTIzNBskQiM1IzYjNyM4IzkhIxsoQg==
EUC-JP
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== ARRAY TEST ==
EUC-JP
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
EUC-JP
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== OBJECT TEST ==
EUC-JP
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
EUC-JP
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== SCALAR, ARRAY AND OBJECT TEST ==
EUC-JP
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
日本語テキストです。01234。日本語テキストです。01234。日本語テキストです。01234
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3c6fccbdcb8eca5c6a5ada5b9a5c8a4c7a4b9a1a33031323334a3b5a3b6a3b7a3b8a3b9a1a3
== DEEPLY NESTED OBJECT/ARRAY TEST ==
UTF-8
42004c0041004800
== INVALID STRING ENCODING TEST ==
2500
# of illegal characters detected: 1
2600
# of illegal characters detected: 1