ext/mbstring: Update to Unicode 16

Updates UCD to Unicode 16.0 (released September 2024). Previously: 0fdffc18, #7502, #14680. Unicode 16 adds several new character sets and case folding rules. However, the existing ucgendat script can still parse them. This also adds a couple of test cases to make sure the new rules for East Asian Wide characters and case folding work correctly. These tests fail on Unicode 15.1 and older because those versions do not contain those rules.
2024-09-21 01:47:25 +00:00 · 2024-09-16 00:43:53 +07:00 · 2024-09-16 00:43:53 +07:00 · 3afb96184e
commit 3afb96184e
parent 71edc05139
5 changed files with 3902 additions and 3768 deletions
--- a/1
+++ b/1
@ -10,6 +10,7 @@ PHP                                                                        NEWS
 - MBString:
  . Fixed bug GH-15824 (mb_detect_encoding(): Argument $encodings contains
    invalid encoding "UTF8"). (Yuya Hamada)
+  . Updated Unicode data tables to Unicode 16.0. (Ayesh Karunaratne)

 - Opcache:
  . Fixed bug GH-15657 (Segmentation fault in dasm_x86.h). (nielsdos)
--- a/2
+++ b/2
@ -966,7 +966,7 @@ PHP 8.4 UPGRADE NOTES
  . The libxml extension now requires at least libxml2 2.9.4.

 - MBString:
-  . Unicode data tables have been updated to Unicode 15.1.
+  . Unicode data tables have been updated to Unicode 16.0.

 - Mysqli:
  . The unused and undocumented constant MYSQLI_SET_CHARSET_DIR
--- a/ext/mbstring/libmbfl/mbfl/eaw_table.h
+++ b/ext/mbstring/libmbfl/mbfl/eaw_table.h
@ -28,8 +28,10 @@ static const struct {
 	{ 0x23f3, 0x23f3 },
 	{ 0x25fd, 0x25fe },
 	{ 0x2614, 0x2615 },
+	{ 0x2630, 0x2637 },
 	{ 0x2648, 0x2653 },
 	{ 0x267f, 0x267f },
+	{ 0x268a, 0x268f },
 	{ 0x2693, 0x2693 },
 	{ 0x26a1, 0x26a1 },
 	{ 0x26aa, 0x26ab },
@ -63,11 +65,10 @@ static const struct {
 	{ 0x3099, 0x30ff },
 	{ 0x3105, 0x312f },
 	{ 0x3131, 0x318e },
-	{ 0x3190, 0x31e3 },
+	{ 0x3190, 0x31e5 },
 	{ 0x31ef, 0x321e },
 	{ 0x3220, 0x3247 },
-	{ 0x3250, 0x4dbf },
-	{ 0x4e00, 0xa48c },
+	{ 0x3250, 0xa48c },
 	{ 0xa490, 0xa4c6 },
 	{ 0xa960, 0xa97c },
 	{ 0xac00, 0xd7a3 },
@ -82,7 +83,7 @@ static const struct {
 	{ 0x16ff0, 0x16ff1 },
 	{ 0x17000, 0x187f7 },
 	{ 0x18800, 0x18cd5 },
-	{ 0x18d00, 0x18d08 },
+	{ 0x18cff, 0x18d08 },
 	{ 0x1aff0, 0x1aff3 },
 	{ 0x1aff5, 0x1affb },
 	{ 0x1affd, 0x1affe },
@ -92,6 +93,8 @@ static const struct {
 	{ 0x1b155, 0x1b155 },
 	{ 0x1b164, 0x1b167 },
 	{ 0x1b170, 0x1b2fb },
+	{ 0x1d300, 0x1d356 },
+	{ 0x1d360, 0x1d376 },
 	{ 0x1f004, 0x1f004 },
 	{ 0x1f0cf, 0x1f0cf },
 	{ 0x1f18e, 0x1f18e },
@ -132,11 +135,10 @@ static const struct {
 	{ 0x1f93c, 0x1f945 },
 	{ 0x1f947, 0x1f9ff },
 	{ 0x1fa70, 0x1fa7c },
-	{ 0x1fa80, 0x1fa88 },
-	{ 0x1fa90, 0x1fabd },
-	{ 0x1fabf, 0x1fac5 },
-	{ 0x1face, 0x1fadb },
-	{ 0x1fae0, 0x1fae8 },
+	{ 0x1fa80, 0x1fa89 },
+	{ 0x1fa8f, 0x1fac6 },
+	{ 0x1face, 0x1fadc },
+	{ 0x1fadf, 0x1fae9 },
 	{ 0x1faf0, 0x1faf8 },
 	{ 0x20000, 0x2fffd },
 	{ 0x30000, 0x3fffd },
--- a/ext/mbstring/tests/unicode_versions.phpt
+++ b/ext/mbstring/tests/unicode_versions.phpt
@ -5,6 +5,8 @@ mbstring
 --FILE--
 <?php

+echo "Char widths:\n";
+
 print "ASCII (PHP): " .      mb_strwidth('PHP',    'UTF-8')    . "\n";

 print "Vietnamese (Xin chào): " . mb_strwidth('Xin chào',    'UTF-8')    . "\n";
@ -18,11 +20,22 @@ print "Emoji (\u{1F418}): " . mb_strwidth("\u{1F418}", 'UTF-8') . "\n";
 // New in Unicode 15.0, width=2
 print "Emoji (\u{1F6DC}): " . mb_strwidth("\u{1F6DC}", 'UTF-8') . "\n";

+// Changed in Unicode 16.0, U+2630...U+2637 are wide
+print "Emoji (\u{2630}): " . mb_strwidth("\u{2630}", 'UTF-8') . "\n";
+
+echo "Char case changes:\n";
+
+print "Upper(\u{019b}) = \u{a7dc} : ";
+var_dump(mb_strtoupper("\u{019b}", 'UTF-8') === "\u{a7dc}");
 ?>
 --EXPECT--
+Char widths:
 ASCII (PHP): 3
 Vietnamese (Xin chào): 8
 Traditional Chinese (你好): 4
 Sinhalese (අයේෂ්): 5
 Emoji (🐘): 2
 Emoji (🛜): 2
+Emoji (☰): 2
+Char case changes:
+Upper(ƛ) = Ƛ : bool(true)
--- a/ext/mbstring/unicode_data.h
+++ b/ext/mbstring/unicode_data.h