Clean up determine_charset() implementation

Also drop the code related to locale-based charset guessing,
which is no longer in use.
This commit is contained in:
Nikita Popov 2020-05-07 14:58:24 +02:00
parent 481b7421f3
commit d6ac8b236f

View File

@ -370,90 +370,41 @@ static inline unsigned int get_next_char(
static enum entity_charset determine_charset(char *charset_hint) static enum entity_charset determine_charset(char *charset_hint)
{ {
size_t i; size_t i;
enum entity_charset charset = cs_utf_8;
size_t len = 0;
const zend_encoding *zenc; const zend_encoding *zenc;
/* Default is now UTF-8 */ if (charset_hint && *charset_hint) {
if (charset_hint == NULL) /* Explicitly passed charset */
return cs_utf_8; goto det_charset;
}
if ((len = strlen(charset_hint)) != 0) { charset_hint = get_default_charset();
if (charset_hint && *charset_hint) {
/* default_charset or internal_encoding */
goto det_charset; goto det_charset;
} }
zenc = zend_multibyte_get_internal_encoding(); zenc = zend_multibyte_get_internal_encoding();
if (zenc != NULL) { if (zenc != NULL) {
/* mbstring.internal_encoding or mb_internal_encoding() */
// TODO: We *shouldn't* be taking this into account anymore.
charset_hint = (char *)zend_multibyte_get_encoding_name(zenc); charset_hint = (char *)zend_multibyte_get_encoding_name(zenc);
if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
if (len == sizeof("auto")-1 && !memcmp("auto", charset_hint, sizeof("auto")-1)) {
charset_hint = NULL;
len = 0;
} else {
goto det_charset;
}
}
}
charset_hint = SG(default_charset);
if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
goto det_charset;
}
/* try to detect the charset for the locale */
#if HAVE_NL_LANGINFO && defined(CODESET)
charset_hint = nl_langinfo(CODESET);
if (charset_hint != NULL && (len=strlen(charset_hint)) != 0) {
goto det_charset;
}
#endif
/* try to figure out the charset from the locale */
{
char *localename;
char *dot, *at;
/* lang[_territory][.codeset][@modifier] */
localename = setlocale(LC_CTYPE, NULL);
dot = strchr(localename, '.');
if (dot) {
dot++;
/* locale specifies a codeset */
at = strchr(dot, '@');
if (at)
len = at - dot;
else
len = strlen(dot);
charset_hint = dot;
} else {
/* no explicit name; see if the name itself
* is the charset */
charset_hint = localename;
len = strlen(charset_hint);
}
} }
det_charset: det_charset:
if (charset_hint) { if (charset_hint) {
int found = 0; size_t len = strlen(charset_hint);
/* now walk the charset map and look for the codeset */ /* now walk the charset map and look for the codeset */
for (i = 0; i < sizeof(charset_map)/sizeof(charset_map[0]); i++) { for (i = 0; i < sizeof(charset_map)/sizeof(charset_map[0]); i++) {
if (len == charset_map[i].codeset_len && if (len == charset_map[i].codeset_len &&
zend_binary_strcasecmp(charset_hint, len, charset_map[i].codeset, len) == 0) { zend_binary_strcasecmp(charset_hint, len, charset_map[i].codeset, len) == 0) {
charset = charset_map[i].charset; return charset_map[i].charset;
found = 1;
break;
} }
} }
if (!found) {
php_error_docref(NULL, E_WARNING, "Charset `%s' not supported, assuming utf-8", php_error_docref(NULL, E_WARNING, "Charset `%s' not supported, assuming utf-8",
charset_hint); charset_hint);
}
} }
return charset; return cs_utf_8;
} }
/* }}} */ /* }}} */
@ -1384,7 +1335,6 @@ encode_amp:
static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all) static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
{ {
zend_string *str, *hint_charset = NULL; zend_string *str, *hint_charset = NULL;
char *default_charset;
zend_long flags = ENT_COMPAT; zend_long flags = ENT_COMPAT;
zend_string *replaced; zend_string *replaced;
zend_bool double_encode = 1; zend_bool double_encode = 1;
@ -1397,10 +1347,9 @@ static void php_html_entities(INTERNAL_FUNCTION_PARAMETERS, int all)
Z_PARAM_BOOL(double_encode); Z_PARAM_BOOL(double_encode);
ZEND_PARSE_PARAMETERS_END(); ZEND_PARSE_PARAMETERS_END();
if (!hint_charset) { replaced = php_escape_html_entities_ex(
default_charset = get_default_charset(); (unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), all, (int) flags,
} hint_charset ? ZSTR_VAL(hint_charset) : NULL, double_encode);
replaced = php_escape_html_entities_ex((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), all, (int) flags, (hint_charset ? ZSTR_VAL(hint_charset) : default_charset), double_encode);
RETVAL_STR(replaced); RETVAL_STR(replaced);
} }
/* }}} */ /* }}} */
@ -1462,7 +1411,6 @@ PHP_FUNCTION(htmlspecialchars_decode)
PHP_FUNCTION(html_entity_decode) PHP_FUNCTION(html_entity_decode)
{ {
zend_string *str, *hint_charset = NULL; zend_string *str, *hint_charset = NULL;
char *default_charset;
zend_long quote_style = ENT_COMPAT; zend_long quote_style = ENT_COMPAT;
zend_string *replaced; zend_string *replaced;
@ -1473,10 +1421,8 @@ PHP_FUNCTION(html_entity_decode)
Z_PARAM_STR(hint_charset) Z_PARAM_STR(hint_charset)
ZEND_PARSE_PARAMETERS_END(); ZEND_PARSE_PARAMETERS_END();
if (!hint_charset) { replaced = php_unescape_html_entities(
default_charset = get_default_charset(); str, 1 /*all*/, (int)quote_style, hint_charset ? ZSTR_VAL(hint_charset) : NULL);
}
replaced = php_unescape_html_entities(str, 1 /*all*/, (int)quote_style, (hint_charset ? ZSTR_VAL(hint_charset) : default_charset));
if (replaced) { if (replaced) {
RETURN_STR(replaced); RETURN_STR(replaced);