From af55f69558bafb5c9b88d727daf5844b55294258 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Fri, 24 Mar 2017 01:31:00 +0000 Subject: [PATCH] ICU-13058 Break iteration tests & rules update for new prop data. Tests are now passing, but changes are not yet propagated into all rule tailorings. X-SVN-Rev: 39922 --- icu4c/source/data/brkitr/rules/char.txt | 5 +- icu4c/source/data/brkitr/rules/word.txt | 3 +- icu4c/source/test/intltest/rbbitst.cpp | 81 +++++++++---------- .../test/testdata/break_rules/grapheme.txt | 6 +- .../source/test/testdata/break_rules/word.txt | 6 +- 5 files changed, 48 insertions(+), 53 deletions(-) diff --git a/icu4c/source/data/brkitr/rules/char.txt b/icu4c/source/data/brkitr/rules/char.txt index 24c8b5f272e..77572f5cd68 100644 --- a/icu4c/source/data/brkitr/rules/char.txt +++ b/icu4c/source/data/brkitr/rules/char.txt @@ -9,7 +9,8 @@ # ICU Character Break Rules, also known as Grapheme Cluster Boundaries # See Unicode Standard Annex #29. # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 -# plus proposed updates for Emoji 4.0 from https://goo.gl/cluFCn +# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088 +# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html # # Character Class Definitions. 
@@ -35,7 +36,7 @@ $LVT = [\p{Grapheme_Cluster_Break = LVT}]; # Emoji defintions -$E_Base = [[\p{Grapheme_Cluster_Break = EB}] \U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC]; +$E_Base = [\p{Grapheme_Cluster_Break = EB}]; $E_Modifier = [\p{Grapheme_Cluster_Break = EM}]; # Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267 diff --git a/icu4c/source/data/brkitr/rules/word.txt b/icu4c/source/data/brkitr/rules/word.txt index f2c98e054bc..742d8f8fe31 100644 --- a/icu4c/source/data/brkitr/rules/word.txt +++ b/icu4c/source/data/brkitr/rules/word.txt @@ -10,6 +10,7 @@ # See Unicode Standard Annex #29. # These rules are based on UAX #29 Revision 29 for Unicode Version 9.0 # with additions for Emoji Sequences from https://goo.gl/cluFCn +# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html # # Note: Updates to word.txt will usually need to be merged into # word_POSIX.txt also. @@ -44,7 +45,7 @@ $MidLetter = [\p{Word_Break = MidLetter}]; $MidNum = [\p{Word_Break = MidNum}]; $Numeric = [\p{Word_Break = Numeric}]; $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; -$E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC]; +$E_Base = [\p{Word_Break = EB}]; $E_Modifier = [\p{Word_Break = EM}]; # Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267 diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 6bbeeebdef2..f13481a82c3 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1683,25 +1683,30 @@ void RBBITest::TestUnicodeFiles() { // See ticket #7270. UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) { - static const UChar *badTestCases[] = { // Line Numbers from Unicode 7.0.0 file. 
- u"\u200B\u0020}", // Line 5198 - u"\u200B\u0020)", // Line 5202 - u"\u200B\u0020!", // Line 5214 - u"\u200B\u0020,", // Line 5246 - u"\u200B\u0020/", // Line 5298 - u"\u200B\u0020\u2060" // Line 5302 - }; - if (strcmp(fileName, "LineBreakTest.txt") != 0) { - return FALSE; - } + static struct TestCase { + const char *fFileName; + const UChar *fString; + } badTestCases[] = { // Line Numbers from Unicode 7.0.0 file. + {"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198 + {"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202 + {"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214 + {"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246 + {"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298 + {"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302 + // Line Numbers from pre-release version of GraphemeBreakTest-10.0.0.txt + {"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ + {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG + {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier + + // Line Numbers from pre-release version of WordBreakTest-10.0.0.txt + {"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK + {"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK + }; -#if ((U_PLATFORM == U_PF_OS390) || (U_PLATFORM == U_PF_AIX)) && (U_CPLUSPLUS_VERSION < 11) for (int n=0; naddAll(*fLVSet); fHangulSet->addAll(*fLVTSet); - fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status); + fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}]"), status); fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status); fExtendedPictSet = new UnicodeSet(gExtended_Pict, status); fEBGSet = new 
UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status); @@ -2325,8 +2330,7 @@ RBBIWordMonkey::RBBIWordMonkey() fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status); fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status); - fEBaseSet = new UnicodeSet( - u"[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]", status); + fEBaseSet = new UnicodeSet(u"[\\p{Word_Break = EB}]", status); fEBGSet = new UnicodeSet(u"[\\p{Word_Break = EBG}]", status); fEModifierSet = new UnicodeSet(u"[\\p{Word_Break = EM}]", status); fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status); @@ -4757,32 +4761,21 @@ void RBBITest::TestEmoji() { break; } } - if (testString.length() > 1) { - charBreaks->setText(testString); - charBreaks->first(); - int32_t firstBreak = charBreaks->next(); - if (testString.length() != firstBreak) { - if (logKnownIssue("13058", "%s:%d", __FILE__, __LINE__)) { - continue; + // Local function check() + auto check = [=](const char *breakType, BreakIterator *bi) -> void { + if (testString.length() > 1) { + bi->setText(testString); + bi->first(); + int32_t firstBreak = bi->next(); + if (testString.length() != firstBreak) { + errln("%s:%d checking %s. 
emoji-test.txt:%d Error, uexpected break at offset %d", + __FILE__, __LINE__, breakType, lineNumber, firstBreak); } - errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", - __FILE__, __LINE__, lineNumber, firstBreak); } - wordBreaks->setText(testString); - wordBreaks->first(); - firstBreak = wordBreaks->next(); - if (testString.length() != firstBreak) { - errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", - __FILE__, __LINE__, lineNumber, firstBreak); - } - lineBreaks->setText(testString); - lineBreaks->first(); - firstBreak = lineBreaks->next(); - if (testString.length() != firstBreak) { - errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d", - __FILE__, __LINE__, lineNumber, firstBreak); - } - } + }; + check("charBreaks", charBreaks.getAlias()); + check("wordBreaks", wordBreaks.getAlias()); + check("lineBreaks", lineBreaks.getAlias()); } } diff --git a/icu4c/source/test/testdata/break_rules/grapheme.txt b/icu4c/source/test/testdata/break_rules/grapheme.txt index 3e9b9a4f05e..0b551ba1b3b 100644 --- a/icu4c/source/test/testdata/break_rules/grapheme.txt +++ b/icu4c/source/test/testdata/break_rules/grapheme.txt @@ -36,12 +36,12 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}]; # Emoji defintions EmojiNRK = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]]; -E_Base = [\p{Grapheme_Cluster_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC]; +E_Base = [\p{Grapheme_Cluster_Break = EB}]; E_Modifier = [\p{Grapheme_Cluster_Break = EM}]; E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}]; -# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773 -Extended_Pict = 
[\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF]; +# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267 
+Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF]; GB3: CR LF; diff --git a/icu4c/source/test/testdata/break_rules/word.txt b/icu4c/source/test/testdata/break_rules/word.txt index 2f24d6e20e5..783dfc9201a 100644 --- a/icu4c/source/test/testdata/break_rules/word.txt +++ b/icu4c/source/test/testdata/break_rules/word.txt @@ -32,11 +32,11 @@ MidLetter = [\p{Word_Break = MidLetter}]; MidNum = [\p{Word_Break = MidNum}]; Numeric = [\p{Word_Break = Numeric}]; ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; -E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC]; +E_Base = [\p{Word_Break = EB}]; E_Modifier = [\p{Word_Break = EM}]; EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]]; -# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773 -Extended_Pict = 
[\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF]; +# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267 
+Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F9
32\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF]; EBG = [\p{Word_Break = EBG}]; #define dicitionary, with the effect being that those characters don't appear in test data.