mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-13058 Break iteration tests & rules update for new prop data. Tests are now passing, but changes are not yet propagated into all rule tailorings.
X-SVN-Rev: 39922
This commit is contained in:
parent
1982037316
commit
af55f69558
5 changed files with 48 additions and 53 deletions
icu4c/source
|
@ -9,7 +9,8 @@
|
|||
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# plus proposed updates for Emoji 4.0 from https://goo.gl/cluFCn
|
||||
# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
|
||||
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
|
@ -35,7 +36,7 @@ $LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
|||
|
||||
# Emoji definitions
|
||||
|
||||
$E_Base = [[\p{Grapheme_Cluster_Break = EB}] \U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$E_Base = [\p{Grapheme_Cluster_Break = EB}];
|
||||
$E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# with additions for Emoji Sequences from https://goo.gl/cluFCn
|
||||
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
@ -44,7 +45,7 @@ $MidLetter = [\p{Word_Break = MidLetter}];
|
|||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$E_Base = [\p{Word_Break = EB}];
|
||||
$E_Modifier = [\p{Word_Break = EM}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
|
|
|
@ -1683,25 +1683,30 @@ void RBBITest::TestUnicodeFiles() {
|
|||
// See ticket #7270.
|
||||
|
||||
UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
|
||||
static const UChar *badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
|
||||
u"\u200B\u0020}", // Line 5198
|
||||
u"\u200B\u0020)", // Line 5202
|
||||
u"\u200B\u0020!", // Line 5214
|
||||
u"\u200B\u0020,", // Line 5246
|
||||
u"\u200B\u0020/", // Line 5298
|
||||
u"\u200B\u0020\u2060" // Line 5302
|
||||
};
|
||||
if (strcmp(fileName, "LineBreakTest.txt") != 0) {
|
||||
return FALSE;
|
||||
}
|
||||
static struct TestCase {
|
||||
const char *fFileName;
|
||||
const UChar *fString;
|
||||
} badTestCases[] = { // Line Numbers from Unicode 7.0.0 file.
|
||||
{"LineBreakTest.txt", u"\u200B\u0020}"}, // Line 5198
|
||||
{"LineBreakTest.txt", u"\u200B\u0020)"}, // Line 5202
|
||||
{"LineBreakTest.txt", u"\u200B\u0020!"}, // Line 5214
|
||||
{"LineBreakTest.txt", u"\u200B\u0020,"}, // Line 5246
|
||||
{"LineBreakTest.txt", u"\u200B\u0020/"}, // Line 5298
|
||||
{"LineBreakTest.txt", u"\u200B\u0020\u2060"}, // Line 5302
|
||||
// Line Numbers from pre-release version of GraphemeBreakTest-10.0.0.txt
|
||||
{"GraphemeBreakTest.txt", u"\u200D\u2640"}, // Line 656, old GB 11 test ZWJ x GAZ
|
||||
{"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
|
||||
{"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
|
||||
|
||||
// Line Numbers from pre-release version of WordBreakTest-10.0.0.txt
|
||||
{"WordBreakTest.txt", u"\u200D\u261D"}, // Line 1356, ZWJ x EmojiNRK
|
||||
{"WordBreakTest.txt", u"\u200D\U0001F3FB"}, // Line 1358, ZWJ x EmojiNRK
|
||||
};
|
||||
|
||||
#if ((U_PLATFORM == U_PF_OS390) || (U_PLATFORM == U_PF_AIX)) && (U_CPLUSPLUS_VERSION < 11)
|
||||
for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
|
||||
const UChar *badCase = badTestCases[n];
|
||||
#else
|
||||
for (const UChar *badCase: badTestCases) {
|
||||
#endif
|
||||
if (testCase == UnicodeString(badCase)) {
|
||||
const TestCase &badCase = badTestCases[n];
|
||||
if (!strcmp(fileName, badCase.fFileName) &&
|
||||
testCase == UnicodeString(badCase.fString)) {
|
||||
return logKnownIssue("7270");
|
||||
}
|
||||
}
|
||||
|
@ -2043,7 +2048,7 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fHangulSet->addAll(*fLVSet);
|
||||
fHangulSet->addAll(*fLVTSet);
|
||||
|
||||
fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
|
||||
fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}]"), status);
|
||||
fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
|
||||
fExtendedPictSet = new UnicodeSet(gExtended_Pict, status);
|
||||
fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
|
||||
|
@ -2325,8 +2330,7 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fExtendNumLetSet = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
|
||||
fExtendSet = new UnicodeSet(u"[\\p{Word_Break = Extend}]", status);
|
||||
|
||||
fEBaseSet = new UnicodeSet(
|
||||
u"[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]", status);
|
||||
fEBaseSet = new UnicodeSet(u"[\\p{Word_Break = EB}]", status);
|
||||
fEBGSet = new UnicodeSet(u"[\\p{Word_Break = EBG}]", status);
|
||||
fEModifierSet = new UnicodeSet(u"[\\p{Word_Break = EM}]", status);
|
||||
fZWJSet = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]", status);
|
||||
|
@ -4757,32 +4761,21 @@ void RBBITest::TestEmoji() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (testString.length() > 1) {
|
||||
charBreaks->setText(testString);
|
||||
charBreaks->first();
|
||||
int32_t firstBreak = charBreaks->next();
|
||||
if (testString.length() != firstBreak) {
|
||||
if (logKnownIssue("13058", "%s:%d", __FILE__, __LINE__)) {
|
||||
continue;
|
||||
// Local function check()
|
||||
auto check = [=](const char *breakType, BreakIterator *bi) -> void {
|
||||
if (testString.length() > 1) {
|
||||
bi->setText(testString);
|
||||
bi->first();
|
||||
int32_t firstBreak = bi->next();
|
||||
if (testString.length() != firstBreak) {
|
||||
errln("%s:%d checking %s. emoji-test.txt:%d Error, uexpected break at offset %d",
|
||||
__FILE__, __LINE__, breakType, lineNumber, firstBreak);
|
||||
}
|
||||
errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
|
||||
__FILE__, __LINE__, lineNumber, firstBreak);
|
||||
}
|
||||
wordBreaks->setText(testString);
|
||||
wordBreaks->first();
|
||||
firstBreak = wordBreaks->next();
|
||||
if (testString.length() != firstBreak) {
|
||||
errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
|
||||
__FILE__, __LINE__, lineNumber, firstBreak);
|
||||
}
|
||||
lineBreaks->setText(testString);
|
||||
lineBreaks->first();
|
||||
firstBreak = lineBreaks->next();
|
||||
if (testString.length() != firstBreak) {
|
||||
errln("%s:%d emoji-test.txt:%d Error, uexpected break at offset %d",
|
||||
__FILE__, __LINE__, lineNumber, firstBreak);
|
||||
}
|
||||
}
|
||||
};
|
||||
check("charBreaks", charBreaks.getAlias());
|
||||
check("wordBreaks", wordBreaks.getAlias());
|
||||
check("lineBreaks", lineBreaks.getAlias());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -36,12 +36,12 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
|||
# Emoji definitions
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]];
|
||||
E_Base = [\p{Grapheme_Cluster_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
E_Base = [\p{Grapheme_Cluster_Break = EB}];
|
||||
E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
|
||||
E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
|
||||
|
||||
GB3: CR LF;
|
||||
|
|
|
@ -32,11 +32,11 @@ MidLetter = [\p{Word_Break = MidLetter}];
|
|||
MidNum = [\p{Word_Break = MidNum}];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
E_Base = [\p{Word_Break = EB}];
|
||||
E_Modifier = [\p{Word_Break = EM}];
|
||||
EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\U00002700-\U00002701\U00002703-\U00002704\U0000270E\U00002710-\U00002711\U00002765-\U00002767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F260-\U0001F265\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F25F\U0001F266-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\U00002605\U00002607-\U0000260D\U0000260F-\U00002610\U00002612\U00002616-\U00002617\U00002619-\U0000261C\U0000261E-\U0000261F\U00002621\U00002624-\U00002625\U00002627-\U00002629\U0000262B-\U0000262D\U00002630-\U00002637\U0000263B-\U00002647\U00002654-\U0000265F\U00002661-\U00002662\U00002664\U00002667\U00002669-\U0000267A\U0000267C-\U0000267E\U00002680-\U00002691\U00002695\U00002698\U0000269A\U0000269D-\U0000269F\U000026A2-\U000026A9\U000026AC-\U000026AF\U000026B2-\U000026BC\U000026BF-\U000026C3\U000026C6-\U000026C7\U000026C9-\U000026CD\U000026D0\U000026D2\U000026D5-\U000026E8\U000026EB-\U000026EF\U000026F6\U000026FB-\U000026FC\U000026FE-\U000026FF\U00002388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90B\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F93
2\U0001F94C\U0001F95F-\U0001F96B\U0001F992-\U0001F997\U0001F9D0-\U0001F9E6\U0001F90C-\U0001F90F\U0001F93F\U0001F94D-\U0001F94F\U0001F96C-\U0001F97F\U0001F998-\U0001F9BF\U0001F9C1-\U0001F9CF\U0001F9E7-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6D3-\U0001F6D4\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6F7-\U0001F6F8\U0001F6D5-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F9-\U0001F6FF];
|
||||
EBG = [\p{Word_Break = EBG}];
|
||||
|
||||
#define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
|
Loading…
Add table
Reference in a new issue