mirror of
https://github.com/php/php-src.git
synced 2024-09-22 18:37:25 +00:00
3ab10da758
The documentation for mb_detect_encoding says that this function "Detects the most likely character encoding for string `string` from an ordered list of candidates". Prior to 28b346bc06, mb_detect_encoding did not really attempt to determine the "most likely" text encoding for the input string. It would just return the first candidate encoding for which the string was valid. In 28b346bc06, I amended this function so that it uses heuristics to try to guess which candidate encoding is "most likely". However, the caller did not have any way to indicate which candidate text encoding(s) they consider to be more likely, in case the heuristics applied are inconclusive. In the language of Bayesian probability, there was no way for the caller to indicate their 'prior' assignment of probabilities. Further, the documentation for mb_detect_encoding also says that the second parameter `encodings` is "a list of character encodings to try, in order". The documentation clearly implies that the order of the `encodings` argument should be significant. Therefore, amend mb_detect_encoding so that while it still uses heuristics to guess the most likely text encoding for the input string, it favors those which are earlier in the list of candidate encodings. One complication is that many callers of mb_detect_encoding use it in this way: mb_detect_encoding($string, mb_list_encodings()); In a majority of cases, this is bad code; mb_detect_encoding will both be much slower and the results will be less reliable than if a smaller list of candidates is used. However, since such code already exists and people are using it in production, we should not unnecessarily break it. The order of candidate encodings obviously does not express any prior belief of which candidates are more likely in this case, and treating it as if it did will degrade the accuracy of the result. Since mb_list_encodings now returns a single, immutable array on each call, we can avoid that problem by turning off the new behavior when we receive the array of encodings returned by mb_list_encodings. This implementation means that if the user does this: $a = mb_list_encodings(); mb_detect_encoding($string, $a); ...then the order of candidate encodings will not be considered. However, if the user explicitly initializes their own array of all supported legacy text encodings, then the order *will* be considered.
The other functions which also follow this new behavior are:
• mb_convert_variables
• mb_convert_encoding (when multiple candidate input encodings are listed)
Other places where "detection" (or really "guessing") of text encoding may be performed include:
• mb_send_mail
• Zend engine, when determining the encoding of a PHP script
• mbstring processing of HTTP request contents, when http_input INI parameter is set to a list
In these cases, the new logic based on order of candidate encodings is *not* enabled. It *might* be logical to consider the order of candidate encodings in some or all of these cases, but I'm not sure if that is true, so it seems wiser to avoid more behavior changes than is necessary. Further, ever since the new encoding detection heuristics were implemented in 28b346bc06, we have not received any complaints of user code being broken in these areas. So I am reluctant to "fix what isn't broken". Well, some might say that applying the new detection heuristics to mb_send_mail, etc. in 28b346bc06 was "fixing what wasn't broken", but (cough cough) I don't have any comment on that...
320 lines
9.0 KiB
C
320 lines
9.0 KiB
C
/*
|
|
+----------------------------------------------------------------------+
|
|
| Copyright (c) The PHP Group |
|
|
+----------------------------------------------------------------------+
|
|
| This source file is subject to version 3.01 of the PHP license, |
|
|
| that is bundled with this package in the file LICENSE, and is |
|
|
| available through the world-wide-web at the following url: |
|
|
| https://www.php.net/license/3_01.txt |
|
|
| If you did not receive a copy of the PHP license and are unable to |
|
|
| obtain it through the world-wide-web, please send a note to |
|
|
| license@php.net so we can mail you a copy immediately. |
|
|
+----------------------------------------------------------------------+
|
|
| Author: Rui Hirokawa <hirokawa@php.net> |
|
|
| Moriyoshi Koizumi <moriyoshi@php.net> |
|
|
+----------------------------------------------------------------------+
|
|
*/
|
|
|
|
/* {{{ includes */
|
|
#include "php.h"
|
|
#include "php_ini.h"
|
|
#include "php_variables.h"
|
|
#include "libmbfl/mbfl/mbfilter_pass.h"
|
|
#include "mbstring.h"
|
|
#include "ext/standard/php_string.h"
|
|
#include "ext/standard/php_mail.h"
|
|
#include "ext/standard/url.h"
|
|
#include "main/php_output.h"
|
|
#include "ext/standard/info.h"
|
|
|
|
#include "php_globals.h"
|
|
#include "rfc1867.h"
|
|
#include "php_content_types.h"
|
|
#include "SAPI.h"
|
|
#include "TSRM.h"
|
|
|
|
#include "mb_gpc.h"
|
|
/* }}} */
|
|
|
|
ZEND_EXTERN_MODULE_GLOBALS(mbstring)
|
|
|
|
/* {{{ MBSTRING_API SAPI_TREAT_DATA_FUNC(mbstr_treat_data)
|
|
* http input processing */
|
|
MBSTRING_API SAPI_TREAT_DATA_FUNC(mbstr_treat_data)
|
|
{
|
|
char *res = NULL, *separator=NULL;
|
|
const char *c_var;
|
|
zval v_array;
|
|
int free_buffer=0;
|
|
const mbfl_encoding *detected;
|
|
php_mb_encoding_handler_info_t info;
|
|
|
|
if (!MBSTRG(encoding_translation)) {
|
|
php_default_treat_data(arg, str, destArray);
|
|
return;
|
|
}
|
|
|
|
switch (arg) {
|
|
case PARSE_POST:
|
|
case PARSE_GET:
|
|
case PARSE_COOKIE:
|
|
array_init(&v_array);
|
|
switch (arg) {
|
|
case PARSE_POST:
|
|
ZVAL_COPY_VALUE(&PG(http_globals)[TRACK_VARS_POST], &v_array);
|
|
break;
|
|
case PARSE_GET:
|
|
ZVAL_COPY_VALUE(&PG(http_globals)[TRACK_VARS_GET], &v_array);
|
|
break;
|
|
case PARSE_COOKIE:
|
|
ZVAL_COPY_VALUE(&PG(http_globals)[TRACK_VARS_COOKIE], &v_array);
|
|
break;
|
|
}
|
|
break;
|
|
default:
|
|
ZVAL_COPY_VALUE(&v_array, destArray);
|
|
break;
|
|
}
|
|
|
|
switch (arg) {
|
|
case PARSE_POST:
|
|
sapi_handle_post(&v_array);
|
|
return;
|
|
case PARSE_GET: /* GET data */
|
|
c_var = SG(request_info).query_string;
|
|
if (c_var && *c_var) {
|
|
res = (char *) estrdup(c_var);
|
|
free_buffer = 1;
|
|
}
|
|
break;
|
|
case PARSE_COOKIE: /* Cookie data */
|
|
c_var = SG(request_info).cookie_data;
|
|
if (c_var && *c_var) {
|
|
res = (char *) estrdup(c_var);
|
|
free_buffer = 1;
|
|
}
|
|
break;
|
|
case PARSE_STRING: /* String data */
|
|
res = str;
|
|
free_buffer = 1;
|
|
break;
|
|
}
|
|
|
|
if (!res) {
|
|
return;
|
|
}
|
|
|
|
switch (arg) {
|
|
case PARSE_POST:
|
|
case PARSE_GET:
|
|
case PARSE_STRING:
|
|
separator = (char *) estrdup(PG(arg_separator).input);
|
|
break;
|
|
case PARSE_COOKIE:
|
|
separator = ";\0";
|
|
break;
|
|
}
|
|
|
|
switch (arg) {
|
|
case PARSE_POST:
|
|
MBSTRG(http_input_identify_post) = NULL;
|
|
break;
|
|
case PARSE_GET:
|
|
MBSTRG(http_input_identify_get) = NULL;
|
|
break;
|
|
case PARSE_COOKIE:
|
|
MBSTRG(http_input_identify_cookie) = NULL;
|
|
break;
|
|
case PARSE_STRING:
|
|
MBSTRG(http_input_identify_string) = NULL;
|
|
break;
|
|
}
|
|
|
|
info.data_type = arg;
|
|
info.separator = separator;
|
|
info.report_errors = false;
|
|
info.to_encoding = MBSTRG(internal_encoding);
|
|
info.from_encodings = MBSTRG(http_input_list);
|
|
info.num_from_encodings = MBSTRG(http_input_list_size);
|
|
|
|
MBSTRG(illegalchars) = 0;
|
|
|
|
detected = _php_mb_encoding_handler_ex(&info, &v_array, res);
|
|
MBSTRG(http_input_identify) = detected;
|
|
|
|
if (detected) {
|
|
switch(arg){
|
|
case PARSE_POST:
|
|
MBSTRG(http_input_identify_post) = detected;
|
|
break;
|
|
case PARSE_GET:
|
|
MBSTRG(http_input_identify_get) = detected;
|
|
break;
|
|
case PARSE_COOKIE:
|
|
MBSTRG(http_input_identify_cookie) = detected;
|
|
break;
|
|
case PARSE_STRING:
|
|
MBSTRG(http_input_identify_string) = detected;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (arg != PARSE_COOKIE) {
|
|
efree(separator);
|
|
}
|
|
|
|
if (free_buffer) {
|
|
efree(res);
|
|
}
|
|
}
|
|
/* }}} */
|
|
|
|
/* {{{ mbfl_no_encoding _php_mb_encoding_handler_ex() */
|
|
/* Splits `res` into name/value pairs, URL-decodes them, converts them from
 * the selected source encoding to info->to_encoding and registers each pair in
 * `array_ptr` (subject to the SAPI input filter).
 *
 * info      - separator characters, candidate source encodings, target
 *             encoding, data type passed to the input filter, and whether a
 *             failed detection emits a warning.
 * array_ptr - array handed to php_register_variable_safe().
 * res       - NUL-terminated buffer; it is tokenized and decoded IN PLACE.
 *
 * Returns the source encoding that was used (&mbfl_encoding_pass when no
 * candidates are given or detection fails), or NULL when `res` is NULL/empty
 * or the number of variables exceeds max_input_vars. */
const mbfl_encoding *_php_mb_encoding_handler_ex(const php_mb_encoding_handler_info_t *info, zval *array_ptr, char *res)
{
	const mbfl_encoding *from_encoding = NULL;
	char **val_list;
	size_t *len_list;
	char *strtok_buf = NULL;
	size_t capacity = 1;
	size_t used = 0;
	size_t new_val_len;
	bool needs_conversion;

	if (res == NULL || res[0] == '\0') {
		return NULL;
	}

	/* Upper bound on the number of variables: one more than the number of
	 * separator hits (every character of the separator string delimits). */
	for (const char *p = res; *p != '\0'; p++) {
		const char *sep = info->separator;
		while (*sep != '\0') {
			if (*sep == *p) {
				capacity++;
			}
			sep++;
		}
	}
	capacity *= 2; /* two slots per variable: name, then value */

	val_list = (char **)ecalloc(capacity, sizeof(char *));
	len_list = (size_t *)ecalloc(capacity, sizeof(size_t));

	/* Tokenize and URL-decode; slot `used` is the name, `used + 1` the value. */
	for (char *token = php_strtok_r(res, info->separator, &strtok_buf);
			token != NULL;
			token = php_strtok_r(NULL, info->separator, &strtok_buf)) {
		char *eq = strchr(token, '=');

		val_list[used] = token;
		if (eq != NULL) {
			/* Decode the name first, then cut the string at '='. */
			len_list[used] = php_url_decode(token, eq - token);
			*eq = '\0';
			val_list[used + 1] = eq + 1;
			len_list[used + 1] = php_url_decode(eq + 1, strlen(eq + 1));
		} else {
			/* No '=': the variable gets an empty value. */
			len_list[used] = php_url_decode(token, strlen(token));
			val_list[used + 1] = "";
			len_list[used + 1] = 0;
		}
		used += 2;
	}

	if (ZEND_SIZE_T_GT_ZEND_LONG(used, (PG(max_input_vars) * 2))) {
		php_error_docref(NULL, E_WARNING, "Input variables exceeded " ZEND_LONG_FMT ". To increase the limit change max_input_vars in php.ini.", PG(max_input_vars));
		goto done;
	}

	/* Select the source encoding; only the filled slots take part in guessing. */
	switch (info->num_from_encodings) {
		case 0:
			from_encoding = &mbfl_encoding_pass;
			break;
		case 1:
			from_encoding = info->from_encodings[0];
			break;
		default:
			from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, used, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection), false);
			if (!from_encoding) {
				if (info->report_errors) {
					php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
				}
				from_encoding = &mbfl_encoding_pass;
			}
			break;
	}

	/* Conversion is skipped entirely if either side is the pass-through encoding. */
	needs_conversion = !(from_encoding == &mbfl_encoding_pass || info->to_encoding == &mbfl_encoding_pass);

	for (size_t i = 0; i < used; i += 2) {
		zend_string *conv_name = NULL;
		zend_string *conv_value = NULL;
		char *name;
		char *value_src;
		size_t value_len;
		char *filtered;

		if (needs_conversion) {
			unsigned int name_errors = 0;
			unsigned int value_errors = 0;

			conv_name = mb_fast_convert((unsigned char*)val_list[i], len_list[i], from_encoding, info->to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &name_errors);
			MBSTRG(illegalchars) += name_errors;

			conv_value = mb_fast_convert((unsigned char*)val_list[i + 1], len_list[i + 1], from_encoding, info->to_encoding, MBSTRG(current_filter_illegal_substchar), MBSTRG(current_filter_illegal_mode), &value_errors);
			MBSTRG(illegalchars) += value_errors;

			name = ZSTR_VAL(conv_name);
			value_src = ZSTR_VAL(conv_value);
			value_len = ZSTR_LEN(conv_value);
		} else {
			name = val_list[i];
			value_src = val_list[i + 1];
			value_len = len_list[i + 1];
		}

		/* The input filter requires a pointer returned by `emalloc`. */
		filtered = estrndup(value_src, value_len);
		if (sapi_module.input_filter(info->data_type, name, &filtered, value_len, &new_val_len)) {
			/* add variable to symbol table */
			php_register_variable_safe(name, filtered, new_val_len, array_ptr);
		}

		if (needs_conversion) {
			zend_string_free(conv_name);
			zend_string_free(conv_value);
		}
		efree(filtered);
	}

done:
	efree((void *)val_list);
	efree((void *)len_list);

	return from_encoding;
}
|
|
/* }}} */
|
|
|
|
/* {{{ SAPI_POST_HANDLER_FUNC(php_mb_post_handler) */
|
|
SAPI_POST_HANDLER_FUNC(php_mb_post_handler)
|
|
{
|
|
const mbfl_encoding *detected;
|
|
php_mb_encoding_handler_info_t info;
|
|
zend_string *post_data_str = NULL;
|
|
|
|
MBSTRG(http_input_identify_post) = NULL;
|
|
|
|
info.data_type = PARSE_POST;
|
|
info.separator = "&";
|
|
info.report_errors = false;
|
|
info.to_encoding = MBSTRG(internal_encoding);
|
|
info.from_encodings = MBSTRG(http_input_list);
|
|
info.num_from_encodings = MBSTRG(http_input_list_size);
|
|
|
|
php_stream_rewind(SG(request_info).request_body);
|
|
post_data_str = php_stream_copy_to_mem(SG(request_info).request_body, PHP_STREAM_COPY_ALL, 0);
|
|
detected = _php_mb_encoding_handler_ex(&info, arg, post_data_str ? ZSTR_VAL(post_data_str) : NULL);
|
|
if (post_data_str) {
|
|
zend_string_release_ex(post_data_str, 0);
|
|
}
|
|
|
|
MBSTRG(http_input_identify) = detected;
|
|
if (detected) {
|
|
MBSTRG(http_input_identify_post) = detected;
|
|
}
|
|
}
|
|
/* }}} */
|