mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-12664 Break rules update for revised Emoji ZWJ sequences.
X-SVN-Rev: 39100
This commit is contained in:
parent
be2b7dc92e
commit
ce42f64b31
23 changed files with 354 additions and 210 deletions
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
|
@ -6,8 +8,8 @@
|
|||
#
|
||||
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 28 (Draft 7) for Unicode Version 9.0
|
||||
#
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# plus proposed updates for Emoji 4.0 from https://goo.gl/cluFCn
|
||||
|
||||
#
|
||||
# Character Class Definitions.
|
||||
|
@ -33,10 +35,13 @@ $LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
|||
|
||||
# Emoji definitions
|
||||
|
||||
$E_Base = [\p{Grapheme_Cluster_Break = EB}];
|
||||
$E_Base = [[\p{Grapheme_Cluster_Break = EB}] \U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
|
||||
$GAZ = [\p{Grapheme_Cluster_Break = GAZ}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
$E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Grapheme_Cluster_Break = Regional_Indicator}*\u00230-9©®™〰〽]];
|
||||
|
||||
## -------------------------------------------------
|
||||
!!chain;
|
||||
|
@ -62,7 +67,7 @@ $Prepend [^$Control $CR $LF];
|
|||
($E_Base | $E_Base_GAZ) $Extend* $E_Modifier;
|
||||
|
||||
# GB 11
|
||||
$ZWJ ($GAZ | $E_Base_GAZ);
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ ($Extended_Pict | $EmojiNRK);
|
||||
|
||||
# GB 12-13. Keep pairs of regional indicators together
|
||||
# Note that hard break '/' rule triggers only if there are three or more initial RIs,
|
||||
|
@ -91,7 +96,7 @@ $SpacingMark [^$Control $CR $LF];
|
|||
$E_Modifier $Extend* ($E_Base | $E_Base_GAZ);
|
||||
|
||||
# GB 11 Don't break between ZWJ and Glue_After_ZWJ
|
||||
($GAZ | $E_Base_GAZ) $ZWJ;
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ ($Extended_Pict | $EmojiNRK);
|
||||
|
||||
# GB 12-13. Going backwards, we must scan through any number of regional indicators as pairs.
|
||||
#
|
||||
|
@ -104,10 +109,10 @@ $Regional_Indicator $Prepend;
|
|||
|
||||
!!safe_reverse;
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
($Extend | $ZWJ)+ .;
|
||||
($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
!!safe_forward;
|
||||
$Regional_Indicator $Regional_Indicator;
|
||||
($Extend | $ZWJ)+ .;
|
||||
($Extend | $ZWJ | $EmojiNRK | $Extended_Pict)+ .;
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,11 +7,11 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -36,7 +38,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
|
@ -69,6 +71,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -141,9 +147,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -319,11 +325,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -375,10 +381,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,12 +7,12 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -42,7 +44,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
|
@ -75,6 +77,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -147,9 +153,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -328,11 +334,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -384,10 +390,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,11 +7,11 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
|
@ -43,7 +45,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
|
@ -77,6 +79,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -149,9 +155,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -330,11 +336,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -386,10 +392,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,11 +7,11 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
|
@ -51,7 +53,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EXX = [\uFF01 \uFF1F];
|
||||
$EX = [[:LineBreak = Exclamation:] - $EXX];
|
||||
|
@ -88,6 +90,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -160,9 +166,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -345,11 +351,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -401,10 +407,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,14 +7,12 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
# tailored as noted in 3rd paragraph below.
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -42,7 +42,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
|
@ -76,6 +76,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -148,9 +152,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -332,11 +336,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -388,10 +392,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,11 +7,11 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
|
@ -40,7 +42,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
|
@ -73,6 +75,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -145,9 +151,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -323,11 +329,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -379,10 +385,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,11 +7,11 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
|
@ -42,7 +44,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
|
@ -76,6 +78,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -148,9 +154,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -329,11 +335,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -385,10 +391,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
|
@ -5,14 +7,12 @@
|
|||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 35 for Unicode 8.0
|
||||
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
# tailored as noted in 3rd paragraph below.
|
||||
#
|
||||
# Includes the Emoji breaking proposals from Unicode L2/16-011R3.
|
||||
# http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
# Includes extensions to the handling of emoji ZWJ sequences from
|
||||
# https://goo.gl/cluFCn
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
|
@ -42,7 +42,7 @@ $CL = [:LineBreak = Close_Punctuation:];
|
|||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
|
@ -75,6 +75,10 @@ $XX = [:LineBreak = Unknown:];
|
|||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
@ -147,9 +151,9 @@ $CAN_CM $CM* [$SP $ZW];
|
|||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
|
||||
# LB 8a ZWJ x (ID | EB | EM) Emoji ZWJ sequences.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ ($ID | $EB | $EM);
|
||||
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
|
@ -328,11 +332,11 @@ $CP $CM* ($ALPlus | $HL | $NU);
|
|||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x ID
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $EB $EM] {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
|
||||
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $EB | $EM);
|
||||
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
@ -384,10 +388,9 @@ $LF $CR;
|
|||
# Requires an engine enhancement.
|
||||
# / $SP* $ZW
|
||||
|
||||
# LB 8a ZWJ x ID Unicode Emoji proposal L2/16-011R3
|
||||
# The ZWJ will look like a CM to whatever precedes it.
|
||||
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
#
|
||||
($ID | $EB | $EM) $ZWJ $CM* $CAN_CM?;
|
||||
($ID | $Extended_Pict | $EmojiNRK) $ZWJ $CM* $CAN_CM?;
|
||||
|
||||
|
||||
# LB 9,10 Combining marks.
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
|
@ -6,7 +8,8 @@
|
|||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 28 (draft 7) for Unicode Version 9.0
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# with additions for Emoji Sequences from https://goo.gl/cluFCn
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
@ -41,10 +44,13 @@ $MidLetter = [\p{Word_Break = MidLetter}];
|
|||
$MidNum = [\p{Word_Break = MidNum}];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$E_Base = [\p{Word_Break = EB}];
|
||||
$E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$E_Modifier = [\p{Word_Break = EM}];
|
||||
$GAZ = [\p{Word_Break = GAZ}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
$EBG = [\p{Word_Break = EBG}];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
|
||||
|
||||
$Han = [:Han:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
|
@ -96,9 +102,9 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
|||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$ZWJ ($GAZ | $EBG);
|
||||
$ZWJ ($Extended_Pict | $EmojiNRK);
|
||||
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
|
@ -118,7 +124,7 @@ $IdeographicEx {400}; #
|
|||
|
||||
$E_Base ($Extend | $Format | $ZWJ)*;
|
||||
$E_Modifier ($Extend | $Format | $ZWJ)*;
|
||||
$GAZ ($Extend | $Format | $ZWJ)*;
|
||||
$Extended_Pict ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
#
|
||||
# rule 5
|
||||
|
@ -207,9 +213,9 @@ $BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
|||
# rule 3
|
||||
$LF $CR;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
($GAZ | $EBG) $ZWJ;
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ;
|
||||
|
||||
# rule 4
|
||||
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
|
||||
|
@ -269,9 +275,9 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
|
|||
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (C) 2002-2016, International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
|
@ -6,7 +8,8 @@
|
|||
#
|
||||
# ICU Word Break Rules, POSIX locale.
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 28 (draft 7) for Unicode Version 9.0
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# with additions for Emoji Sequences from https://goo.gl/cluFCn
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
@ -41,10 +44,13 @@ $MidLetter = [\p{Word_Break = MidLetter} - [\:]];
|
|||
$MidNum = [\p{Word_Break = MidNum} [.]];
|
||||
$Numeric = [\p{Word_Break = Numeric}];
|
||||
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
$E_Base = [\p{Word_Break = EB}];
|
||||
$E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
$E_Modifier = [\p{Word_Break = EM}];
|
||||
$GAZ = [\p{Word_Break = GAZ}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
$Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
$EBG = [\p{Word_Break = EBG}];
|
||||
$EmojiNRK = [[\p{Emoji}] - [\p{Word_Break = Regional_Indicator}\u002a\u00230-9©®™〰〽]];
|
||||
|
||||
$Han = [:Han:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
|
@ -96,9 +102,9 @@ $IdeographicEx = $Ideographic ($Extend | $Format | $ZWJ)*;
|
|||
#
|
||||
$CR $LF;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
$ZWJ ($GAZ | $EBG);
|
||||
$ZWJ ($Extended_Pict | $EmojiNRK);
|
||||
|
||||
|
||||
# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
|
||||
|
@ -118,7 +124,7 @@ $IdeographicEx {400}; #
|
|||
|
||||
$E_Base ($Extend | $Format | $ZWJ)*;
|
||||
$E_Modifier ($Extend | $Format | $ZWJ)*;
|
||||
$GAZ ($Extend | $Format | $ZWJ)*;
|
||||
$Extended_Pict ($Extend | $Format | $ZWJ)*;
|
||||
|
||||
#
|
||||
# rule 5
|
||||
|
@ -207,9 +213,9 @@ $BackRegional_IndicatorEx = ($Format | $Extend | $ZWJ)* $Regional_Indicator;
|
|||
# rule 3
|
||||
$LF $CR;
|
||||
|
||||
# Rule 3c ZWJ x GAZ. Preceeds WB4, so no intervening Extend chars allowed.
|
||||
# Rule 3c ZWJ x (Extended_Pict | EmojiNRK). Precedes WB4, so no intervening Extend chars allowed.
|
||||
#
|
||||
($GAZ | $EBG) $ZWJ;
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ;
|
||||
|
||||
# rule 4
|
||||
($Format | $Extend | $ZWJ)* [^$CR $LF $Newline]?;
|
||||
|
@ -269,9 +275,9 @@ $E_Modifier ($Format | $Extend | $ZWJ)* ($E_Base | $EBG);
|
|||
^$BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
($GAZ | $EBG) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Extended_Pict | $EmojiNRK) $ZWJ $BackRegional_IndicatorEx $BackRegional_IndicatorEx / ($BackRegional_IndicatorEx $BackRegional_IndicatorEx)*
|
||||
($Format | $Extend | $ZWJ)* [[^$Regional_Indicator $Format $Extend $ZWJ] {eof}];
|
||||
|
||||
|
||||
|
|
|
@ -1028,7 +1028,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
|||
const uint8_t *builtRules;
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
errcheckln(status, "Can't open \"%s\" - %s", dataFile, u_errorName(status));
|
||||
errcheckln(status, "%s:%d Can't open \"%s\" - %s", __FILE__, __LINE__, dataFile, u_errorName(status));
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1036,14 +1036,15 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) {
|
|||
builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource);
|
||||
RuleBasedBreakIterator *brkItr = new RuleBasedBreakIterator(builtSource, parseError, status);
|
||||
if (U_FAILURE(status)) {
|
||||
errln("createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
|
||||
u_errorName(status), parseError.line, parseError.offset);
|
||||
errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n",
|
||||
__FILE__, __LINE__, u_errorName(status), parseError.line, parseError.offset);
|
||||
errln(UnicodeString(builtSource));
|
||||
return;
|
||||
};
|
||||
rbbiRules = brkItr->getBinaryRules(length);
|
||||
logln("Comparing \"%s\" len=%d", dataFile, length);
|
||||
if (memcmp(builtRules, rbbiRules, (int32_t)length) != 0) {
|
||||
errln("Built rules and rebuilt rules are different %s", dataFile);
|
||||
errln("%s:%d Built rules and rebuilt rules are different %s", __FILE__, __LINE__, dataFile);
|
||||
return;
|
||||
}
|
||||
delete brkItr;
|
||||
|
|
|
@ -1949,6 +1949,31 @@ static uint32_t m_rand()
|
|||
}
|
||||
|
||||
|
||||
//
|
||||
// Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
//
|
||||
static const char *gExtended_Pict = "["
|
||||
"\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093"
|
||||
"\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
|
||||
"\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF"
|
||||
"\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395"
|
||||
"\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548"
|
||||
"\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589"
|
||||
"\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0"
|
||||
"\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0"
|
||||
"\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
|
||||
"\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625"
|
||||
"\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667"
|
||||
"\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF"
|
||||
"\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF"
|
||||
"\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF"
|
||||
"\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF"
|
||||
"\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF"
|
||||
"\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F"
|
||||
"\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8"
|
||||
"\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF"
|
||||
"]";
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
//
|
||||
// class RBBICharMonkey Character (Grapheme Cluster) specific implementation
|
||||
|
@ -1980,8 +2005,9 @@ private:
|
|||
UnicodeSet *fHangulSet;
|
||||
UnicodeSet *fEmojiBaseSet;
|
||||
UnicodeSet *fEmojiModifierSet;
|
||||
UnicodeSet *fGAZSet;
|
||||
UnicodeSet *fEBGSet; // ***new
|
||||
UnicodeSet *fExtendedPictSet;
|
||||
UnicodeSet *fEBGSet;
|
||||
UnicodeSet *fEmojiNRKSet;
|
||||
UnicodeSet *fAnySet;
|
||||
|
||||
const UnicodeString *fText;
|
||||
|
@ -1997,7 +2023,7 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
|
||||
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
|
||||
fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
|
||||
fRegionalIndicatorSet =
|
||||
fRegionalIndicatorSet =
|
||||
new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
|
||||
fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
|
||||
fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
|
||||
|
@ -2013,10 +2039,12 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fHangulSet->addAll(*fLVSet);
|
||||
fHangulSet->addAll(*fLVTSet);
|
||||
|
||||
fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}]"), status);
|
||||
fEmojiBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
|
||||
fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
|
||||
fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = GAZ}]"), status);
|
||||
fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
|
||||
fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
|
||||
fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
|
||||
"[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
|
||||
fAnySet = new UnicodeSet(0, 0x10ffff);
|
||||
|
||||
fSets = new UVector(status);
|
||||
|
@ -2033,8 +2061,9 @@ RBBICharMonkey::RBBICharMonkey() {
|
|||
fSets->addElement(fEmojiBaseSet, status);
|
||||
fSets->addElement(fEmojiModifierSet, status);
|
||||
fSets->addElement(fZWJSet, status);
|
||||
fSets->addElement(fGAZSet, status);
|
||||
fSets->addElement(fExtendedPictSet, status);
|
||||
fSets->addElement(fEBGSet, status);
|
||||
fSets->addElement(fEmojiNRKSet,status);
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
}
|
||||
|
@ -2163,8 +2192,9 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
|
|||
continue;
|
||||
}
|
||||
|
||||
// Rule (GB11) ZWJ x (Glue_After_ZWJ | EBG)
|
||||
if (fZWJSet->contains(c1) && (fGAZSet->contains(c2) || fEBGSet->contains(c2))) {
|
||||
// Rule (GB11) (Extended_Pict | EmojiNRK) ZWJ x (Extended_Pict | EmojiNRK)
|
||||
if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
|
||||
(fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -2214,8 +2244,9 @@ RBBICharMonkey::~RBBICharMonkey() {
|
|||
delete fEmojiBaseSet;
|
||||
delete fEmojiModifierSet;
|
||||
delete fZWJSet;
|
||||
delete fGAZSet;
|
||||
delete fExtendedPictSet;
|
||||
delete fEBGSet;
|
||||
delete fEmojiNRKSet;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------------------------
|
||||
|
@ -2256,7 +2287,8 @@ private:
|
|||
UnicodeSet *fEBGSet;
|
||||
UnicodeSet *fEModifierSet;
|
||||
UnicodeSet *fZWJSet;
|
||||
UnicodeSet *fGAZSet;
|
||||
UnicodeSet *fExtendedPictSet;
|
||||
UnicodeSet *fEmojiNRKSet;
|
||||
|
||||
const UnicodeString *fText;
|
||||
};
|
||||
|
@ -2285,11 +2317,14 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
|
||||
fExtendSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"), status);
|
||||
|
||||
fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EB}]"), status);
|
||||
fEBaseSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
|
||||
"[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
|
||||
fEBGSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EBG}]"), status);
|
||||
fEModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = EM}]"), status);
|
||||
fZWJSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ZWJ}]"), status);
|
||||
fGAZSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = GAZ}]"), status);
|
||||
fExtendedPictSet = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
|
||||
fEmojiNRKSet = new UnicodeSet(UNICODE_STRING_SIMPLE(
|
||||
"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
|
||||
|
||||
fDictionarySet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]"), status);
|
||||
fDictionarySet->addAll(*fKatakanaSet);
|
||||
|
@ -2323,8 +2358,9 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fOtherSet->removeAll(*fEBGSet);
|
||||
fOtherSet->removeAll(*fEModifierSet);
|
||||
fOtherSet->removeAll(*fZWJSet);
|
||||
fOtherSet->removeAll(*fGAZSet);
|
||||
|
||||
fOtherSet->removeAll(*fExtendedPictSet);
|
||||
fOtherSet->removeAll(*fEmojiNRKSet);
|
||||
|
||||
// Inhibit dictionary characters from being tested at all.
|
||||
fOtherSet->removeAll(*fDictionarySet);
|
||||
|
||||
|
@ -2352,7 +2388,8 @@ RBBIWordMonkey::RBBIWordMonkey()
|
|||
fSets->addElement(fEBGSet, status);
|
||||
fSets->addElement(fEModifierSet, status);
|
||||
fSets->addElement(fZWJSet, status);
|
||||
fSets->addElement(fGAZSet, status);
|
||||
fSets->addElement(fExtendedPictSet, status);
|
||||
fSets->addElement(fEmojiNRKSet, status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
|
@ -2431,12 +2468,12 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) {
|
|||
break;
|
||||
};
|
||||
|
||||
// Rule (3c) ZWJ x (Glue_after_ZWJ | EBG).
|
||||
// Rule (3c) ZWJ x (Extended_Pict | EmojiNRK).
|
||||
// Not ignoring extend chars, so peek into input text to
|
||||
// get the potential ZWJ, the character immediately preceding c2.
|
||||
// Sloppy UChar32 indexing: p2-1 may reference trail half
|
||||
// but char32At will get the full code point.
|
||||
if (fZWJSet->contains(fText->char32At(p2-1)) && (fGAZSet->contains(c2) || fEBGSet->contains(c2))) {
|
||||
if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -2581,7 +2618,8 @@ RBBIWordMonkey::~RBBIWordMonkey() {
|
|||
delete fEBGSet;
|
||||
delete fEModifierSet;
|
||||
delete fZWJSet;
|
||||
delete fGAZSet;
|
||||
delete fExtendedPictSet;
|
||||
delete fEmojiNRKSet;
|
||||
}
|
||||
|
||||
|
||||
|
@ -2971,6 +3009,8 @@ private:
|
|||
UnicodeSet *fEB;
|
||||
UnicodeSet *fEM;
|
||||
UnicodeSet *fZJ;
|
||||
UnicodeSet *fExtendedPict;
|
||||
UnicodeSet *fEmojiNRK;
|
||||
|
||||
BreakIterator *fCharBI;
|
||||
const UnicodeString *fText;
|
||||
|
@ -3033,9 +3073,12 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
fRI = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
|
||||
fSG = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
|
||||
fXX = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
|
||||
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
|
||||
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE(
|
||||
"[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"), status);
|
||||
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
|
||||
fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
|
||||
fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
|
||||
fExtendedPict = new UnicodeSet(UnicodeString(gExtended_Pict, -1, US_INV), status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
|
@ -3047,10 +3090,6 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
|
||||
|
||||
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
|
||||
|
||||
fID->addAll(*fEB); // Emoji Base and Emoji Modifier behave as ID.
|
||||
fID->addAll(*fEM);
|
||||
|
||||
fCM->addAll(*fZJ); // ZWJ behaves as a CM.
|
||||
|
||||
fSets->addElement(fBK, status);
|
||||
|
@ -3094,6 +3133,9 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
fSets->addElement(fEB, status);
|
||||
fSets->addElement(fEM, status);
|
||||
fSets->addElement(fZJ, status);
|
||||
fSets->addElement(fExtendedPict, status);
|
||||
fSets->addElement(fEmojiNRK, status);
|
||||
|
||||
|
||||
const char *rules =
|
||||
"((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
|
||||
|
@ -3280,14 +3322,14 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 8a ZJ x ID
|
||||
// LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
// The monkey test's way of ignoring combining characters doesn't work
|
||||
// for this rule. ZJ is also a CM. Need to get the actual character
|
||||
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
|
||||
{
|
||||
int32_t prevIdx = fText->moveIndex32(pos, -1);
|
||||
UChar32 prevC = fText->char32At(prevIdx);
|
||||
if (fZJ->contains(prevC) && fID->contains(thisChar)) {
|
||||
if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -3447,7 +3489,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
|
||||
(fEX->contains(prevChar) && fIN->contains(thisChar)) ||
|
||||
(fHL->contains(prevChar) && fIN->contains(thisChar)) ||
|
||||
(fID->contains(prevChar) && fIN->contains(thisChar)) ||
|
||||
((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
|
||||
(fIN->contains(prevChar) && fIN->contains(thisChar)) ||
|
||||
(fNU->contains(prevChar) && fIN->contains(thisChar)) ) {
|
||||
continue;
|
||||
|
@ -3643,6 +3685,8 @@ RBBILineMonkey::~RBBILineMonkey() {
|
|||
delete fEB;
|
||||
delete fEM;
|
||||
delete fZJ;
|
||||
delete fExtendedPict;
|
||||
delete fEmojiNRK;
|
||||
|
||||
delete fCharBI;
|
||||
delete fNumberMatcher;
|
||||
|
|
11
icu4c/source/test/testdata/GraphemeBreakTest.txt
vendored
11
icu4c/source/test/testdata/GraphemeBreakTest.txt
vendored
|
@ -8,6 +8,7 @@
|
|||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Default Grapheme Break Test
|
||||
# Hand-patched for Emoji ZWJ Proposal L2/16-208R2.
|
||||
#
|
||||
# Format:
|
||||
# <string> (# <comment>)?
|
||||
|
@ -653,9 +654,9 @@
|
|||
÷ 200D × 0308 ÷ 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
|
||||
÷ 200D × 200D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
|
||||
÷ 200D × 0308 × 200D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) × [9.0] ZERO WIDTH JOINER (ZWJ) ÷ [0.3]
|
||||
÷ 200D × 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
|
||||
÷ 200D ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
|
||||
÷ 200D × 0308 ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
|
||||
÷ 200D × 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
|
||||
÷ 200D ÷ 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
|
||||
÷ 200D × 0308 ÷ 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] BOY (EBG) ÷ [0.3]
|
||||
÷ 200D ÷ 0378 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) ÷ [999.0] <reserved-0378> (Other) ÷ [0.3]
|
||||
÷ 200D × 0308 ÷ 0378 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [9.0] COMBINING DIAERESIS (Extend) ÷ [999.0] <reserved-0378> (Other) ÷ [0.3]
|
||||
|
@ -839,9 +840,9 @@
|
|||
÷ 0061 ÷ 0600 × 0062 ÷ # ÷ [0.2] LATIN SMALL LETTER A (Other) ÷ [999.0] ARABIC NUMBER SIGN (Prepend) × [9.2] LATIN SMALL LETTER B (Other) ÷ [0.3]
|
||||
÷ 261D × 1F3FB ÷ 261D ÷ # ÷ [0.2] WHITE UP POINTING INDEX (E_Base) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
|
||||
÷ 1F466 × 1F3FB ÷ # ÷ [0.2] BOY (EBG) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
|
||||
÷ 200D × 1F466 × 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
|
||||
÷ 200D × 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
|
||||
÷ 200D × 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
|
||||
÷ 200D ÷ 1F466 × 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) × [10.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
|
||||
÷ 200D ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
|
||||
÷ 200D ÷ 1F466 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ) × [11.0] BOY (EBG) ÷ [0.3]
|
||||
÷ 1F466 ÷ 1F466 ÷ # ÷ [0.2] BOY (EBG) ÷ [999.0] BOY (EBG) ÷ [0.3]
|
||||
#
|
||||
# Lines: 822
|
||||
|
|
5
icu4c/source/test/testdata/WordBreakTest.txt
vendored
5
icu4c/source/test/testdata/WordBreakTest.txt
vendored
|
@ -8,6 +8,7 @@
|
|||
# For documentation, see http://www.unicode.org/reports/tr44/
|
||||
#
|
||||
# Default Word Break Test
|
||||
# Hand-patched for Emoji ZWJ Proposal L2/16-208R2.
|
||||
#
|
||||
# Format:
|
||||
# <string> (# <comment>)?
|
||||
|
@ -1353,9 +1354,9 @@
|
|||
÷ 200D × 0308 ÷ 0022 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] QUOTATION MARK (Double_Quote) ÷ [0.3]
|
||||
÷ 200D ÷ 0027 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
|
||||
÷ 200D × 0308 ÷ 0027 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] APOSTROPHE (Single_Quote) ÷ [0.3]
|
||||
÷ 200D ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
|
||||
÷ 200D × 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
|
||||
÷ 200D × 0308 ÷ 261D ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] WHITE UP POINTING INDEX (E_Base) ÷ [0.3]
|
||||
÷ 200D ÷ 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
|
||||
÷ 200D × 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
|
||||
÷ 200D × 0308 ÷ 1F3FB ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (E_Modifier) ÷ [0.3]
|
||||
÷ 200D × 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [3.3] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
|
||||
÷ 200D × 0308 ÷ 2764 ÷ # ÷ [0.2] ZERO WIDTH JOINER (ZWJ_FE) × [4.0] COMBINING DIAERESIS (Extend_FE) ÷ [999.0] HEAVY BLACK HEART (Glue_After_Zwj) ÷ [0.3]
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: grapheme.txt
|
||||
|
@ -33,11 +35,14 @@ LVT = [\p{Grapheme_Cluster_Break = LVT}];
|
|||
|
||||
# Emoji definitions
|
||||
|
||||
E_Base = [\p{Grapheme_Cluster_Break = EB}];
|
||||
EmojiNRK = [[\p{Emoji}] - [Regional_Indicator\u002a\u00230-9©®™〰〽]];
|
||||
E_Base = [\p{Grapheme_Cluster_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
E_Modifier = [\p{Grapheme_Cluster_Break = EM}];
|
||||
GAZ = [\p{Grapheme_Cluster_Break = GAZ}];
|
||||
E_Base_GAZ = [\p{Grapheme_Cluster_Break = EBG}];
|
||||
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
|
||||
GB3: CR LF;
|
||||
GB4: (Control | CR | LF) ÷;
|
||||
|
@ -48,11 +53,11 @@ GB7: (LV | V) (V | T);
|
|||
GB8: (LVT | T) T;
|
||||
|
||||
GB10: (E_Base | E_Base_GAZ) Extend* E_Modifier;
|
||||
GB11: (Extended_Pict | EmojiNRK) ZWJ (Extended_Pict | EmojiNRK);
|
||||
GB9: . (Extend | ZWJ);
|
||||
|
||||
GB9a: . SpacingMark;
|
||||
GB9b: Prepend .;
|
||||
GB11: ZWJ (GAZ | E_Base_GAZ);
|
||||
|
||||
# Regional Indicators, split into pairs.
|
||||
# Note that a pair of RIs that is not followed by a third RI will fall into
|
||||
|
|
20
icu4c/source/test/testdata/break_rules/line.txt
vendored
20
icu4c/source/test/testdata/break_rules/line.txt
vendored
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: line.txt
|
||||
|
@ -25,7 +27,7 @@ CL = [:LineBreak = Close_Punctuation:];
|
|||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
|
@ -57,6 +59,10 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
@ -91,9 +97,9 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZWJ (ID | EB | EM);
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -132,7 +138,7 @@ LB19.1: QU CM* [^CM];
|
|||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | EB | EM);
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
|
@ -182,7 +188,7 @@ LB30.2: CP CM* (AL | HL | NU);
|
|||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -190,5 +196,5 @@ LB30b: EB CM* EM;
|
|||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | EB | EM);
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose.txt
|
||||
|
@ -32,7 +34,7 @@ CL = [:LineBreak = Close_Punctuation:];
|
|||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
|
@ -65,6 +67,10 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
@ -99,9 +105,9 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZWJ (ID | EB | EM);
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -140,7 +146,7 @@ LB19.1: QU CM* [^CM];
|
|||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | EB | EM);
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
|
@ -190,7 +196,7 @@ LB30.2: CP CM* (AL | HL | NU);
|
|||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -198,5 +204,5 @@ LB30b: EB CM* EM;
|
|||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | EB | EM);
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose_cj.txt
|
||||
|
@ -46,7 +48,7 @@ CL = [:LineBreak = Close_Punctuation:];
|
|||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
EM = [:LineBreak = EM:];
|
||||
EXX = [\uFF01 \uFF1F];
|
||||
EX = [[:LineBreak = Exclamation:] - EXX];
|
||||
|
@ -82,6 +84,10 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
@ -116,9 +122,9 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZWJ (ID | EB | EM);
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -157,7 +163,7 @@ LB19.1: QU CM* [^CM];
|
|||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | EB | EM);
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
|
@ -211,7 +217,7 @@ LB30.2: CP CM* (AL | HL | NU);
|
|||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -219,5 +225,5 @@ LB30b: EB CM* EM;
|
|||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | EB | EM);
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal.txt
|
||||
|
@ -39,7 +41,7 @@ CL = [:LineBreak = Close_Punctuation:];
|
|||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
|
@ -71,6 +73,10 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
@ -105,9 +111,9 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZWJ (ID | EB | EM);
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -146,7 +152,7 @@ LB19.1: QU CM* [^CM];
|
|||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | EB | EM);
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
|
@ -196,7 +202,7 @@ LB30.2: CP CM* (AL | HL | NU);
|
|||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -204,5 +210,5 @@ LB30b: EB CM* EM;
|
|||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | EB | EM);
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
|
@ -1,4 +1,7 @@
|
|||
# Copyright (c) 2016 International Business Machines Corporation and # others. All Rights Reserved.
|
||||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016 International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal_cj.txt
|
||||
#
|
||||
|
@ -39,7 +42,7 @@ CL = [:LineBreak = Close_Punctuation:];
|
|||
CM = [:LineBreak = Combining_Mark:];
|
||||
CP = [:LineBreak = Close_Parenthesis:];
|
||||
CR = [:LineBreak = Carriage_Return:];
|
||||
EB = [:LineBreak = EB:];
|
||||
EB = [[:LineBreak = EB:]\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
EM = [:LineBreak = EM:];
|
||||
EX = [:LineBreak = Exclamation:];
|
||||
GL = [:LineBreak = Glue:];
|
||||
|
@ -72,6 +75,10 @@ XX = [:LineBreak = Unknown:];
|
|||
ZW = [:LineBreak = ZWSpace:];
|
||||
ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
|
||||
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
|
||||
AL = [AL AI SG XX ];
|
||||
dictionary = SA;
|
||||
|
@ -109,9 +116,9 @@ LB7.2: [ZW SP] [SP ZW];
|
|||
# UAX 14: ZW SP* ÷;
|
||||
LB8: ZW ÷;
|
||||
|
||||
# LB8a, from Emoji proposal L2/16-011R3
|
||||
# ZWJ x ID
|
||||
LB8a: ZWJ (ID | EB | EM);
|
||||
# LB8a
|
||||
# ZWJ x (ID | Extended_Pict | EmojiNRK)
|
||||
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
|
||||
|
||||
# LB9: X CM -> X
|
||||
|
@ -150,7 +157,7 @@ LB19.1: QU CM* [^CM];
|
|||
# ZWJ acts like a CM to the left, combining with CB.
|
||||
# ZWJ acts independently to the right, no break from ID by LB8a.
|
||||
LB20: . CM* ÷ CB;
|
||||
LB20.1a: CB CM* ZWJ (ID | EB | EM);
|
||||
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB20.1b: CB CM* ÷;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
|
@ -204,7 +211,7 @@ LB30.2: CP CM* (AL | HL | NU);
|
|||
|
||||
# LB30a keep pairs of RI together.
|
||||
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | EB | EM);
|
||||
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB30a.3: RI CM* RI CM* ÷;
|
||||
|
||||
# LB30b Do not break between Emoji Base and Emoji Modifier
|
||||
|
@ -212,5 +219,5 @@ LB30b: EB CM* EM;
|
|||
|
||||
# LB31 Break Everywhere Else.
|
||||
# Include combining marks
|
||||
LB31.1: . CM* ZWJ (ID | EB | EM);
|
||||
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
|
||||
LB31.2: . CM* ÷;
|
||||
|
|
16
icu4c/source/test/testdata/break_rules/word.txt
vendored
16
icu4c/source/test/testdata/break_rules/word.txt
vendored
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: word.txt
|
||||
|
@ -30,9 +32,11 @@ MidLetter = [\p{Word_Break = MidLetter}];
|
|||
MidNum = [\p{Word_Break = MidNum}];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
E_Base = [\p{Word_Break = EB}];
|
||||
E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
E_Modifier = [\p{Word_Break = EM}];
|
||||
GAZ = [\p{Word_Break = GAZ}];
|
||||
EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
EBG = [\p{Word_Break = EBG}];
|
||||
|
||||
#define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
@ -64,7 +68,7 @@ WB3: CR LF;
|
|||
WB3a: (Newline | CR | LF) ÷;
|
||||
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
|
||||
# (but needed with UAX treat-as scheme.)
|
||||
WB3c: ZWJ (GAZ | EBG);
|
||||
WB3c: ZWJ (Extended_Pict | EmojiNRK);
|
||||
|
||||
WB5: AHLetter ExtFmt* AHLetter;
|
||||
|
||||
|
@ -86,13 +90,13 @@ WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
|
|||
|
||||
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
|
||||
# Interacts with WB3c.
|
||||
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (GAZ | EBG);
|
||||
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
|
||||
|
||||
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
|
||||
|
||||
# Rule WB 999 Any ÷ Any
|
||||
# Interacts with WB3c, do not break between ZWJ and (GAZ | EBG).
|
||||
WB999.1: . ExtFmt* ZWJ (GAZ | EBG);
|
||||
# Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EmojiNRK).
|
||||
WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB999.2: . ExtFmt* ÷;
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
#
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
|
||||
|
||||
# file: word_POSIX.txt
|
||||
|
@ -29,9 +31,11 @@ MidLetter = [\p{Word_Break = MidLetter} - [\:]];
|
|||
MidNum = [\p{Word_Break = MidNum} [.]];
|
||||
Numeric = [\p{Word_Break = Numeric}];
|
||||
ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
|
||||
E_Base = [\p{Word_Break = EB}];
|
||||
E_Base = [\p{Word_Break = EB}\U0001F3C2\U0001F3C7\U0001F3CC\U0001F46A-\U0001F46D\U0001F46F\U0001F574\U0001F6CC];
|
||||
E_Modifier = [\p{Word_Break = EM}];
|
||||
GAZ = [\p{Word_Break = GAZ}];
|
||||
EmojiNRK = [[\p{Emoji}] - [[Regional_Indicator]\u002a\u00230-9©®™〰〽]];
|
||||
# Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
|
||||
Extended_Pict = [\U0001F774-\U0001F77F\u2700-\u2701\u2703-\u2704\u270E\u2710-\u2711\u2765-\u2767\U0001F030-\U0001F093\U0001F094-\U0001F09F\U0001F10D-\U0001F10F\U0001F12F\U0001F16C-\U0001F16F\U0001F1AD-\U0001F1E5\U0001F203-\U0001F20F\U0001F23C-\U0001F23F\U0001F249-\U0001F24F\U0001F252-\U0001F2FF\U0001F7D5-\U0001F7FF\U0001F000-\U0001F003\U0001F005-\U0001F02B\U0001F02C-\U0001F02F\U0001F322-\U0001F323\U0001F394-\U0001F395\U0001F398\U0001F39C-\U0001F39D\U0001F3F1-\U0001F3F2\U0001F3F6\U0001F4FE\U0001F53E-\U0001F548\U0001F54F\U0001F568-\U0001F56E\U0001F571-\U0001F572\U0001F57B-\U0001F586\U0001F588-\U0001F589\U0001F58E-\U0001F58F\U0001F591-\U0001F594\U0001F597-\U0001F5A3\U0001F5A6-\U0001F5A7\U0001F5A9-\U0001F5B0\U0001F5B3-\U0001F5BB\U0001F5BD-\U0001F5C1\U0001F5C5-\U0001F5D0\U0001F5D4-\U0001F5DB\U0001F5DF-\U0001F5E0\U0001F5E2\U0001F5E4-\U0001F5E7\U0001F5E9-\U0001F5EE\U0001F5F0-\U0001F5F2\U0001F5F4-\U0001F5F9\u2605\u2607-\u260D\u260F-\u2610\u2612\u2616-\u2617\u2619-\u261C\u261E-\u261F\u2621\u2624-\u2625\u2627-\u2629\u262B-\u262D\u2630-\u2637\u263B-\u2647\u2654-\u265F\u2661-\u2662\u2664\u2667\u2669-\u267A\u267C-\u267E\u2680-\u2691\u2695\u2698\u269A\u269D-\u269F\u26A2-\u26A9\u26AC-\u26AF\u26B2-\u26BC\u26BF-\u26C3\u26C6-\u26C7\u26C9-\u26CD\u26D0\u26D2\u26D5-\u26E8\u26EB-\u26EF\u26F6\u26FB-\u26FC\u26FE-\u26FF\u2388\U0001FA00-\U0001FFFD\U0001F0A0-\U0001F0AE\U0001F0B1-\U0001F0BF\U0001F0C1-\U0001F0CF\U0001F0D1-\U0001F0F5\U0001F0AF-\U0001F0B0\U0001F0C0\U0001F0D0\U0001F0F6-\U0001F0FF\U0001F80C-\U0001F80F\U0001F848-\U0001F84F\U0001F85A-\U0001F85F\U0001F888-\U0001F88F\U0001F8AE-\U0001F8FF\U0001F900-\U0001F90F\U0001F91F\U0001F928-\U0001F92F\U0001F931-\U0001F932\U0001F93F\U0001F94C-\U0001F94F\U0001F95F-\U0001F97F\U0001F992-\U0001F9BF\U0001F9C1-\U0001F9FF\U0001F6C6-\U0001F6CA\U0001F6E6-\U0001F6E8\U0001F6EA\U0001F6F1-\U0001F6F2\U0001F6D3-\U0001F6DF\U0001F6ED-\U0001F6EF\U0001F6F7-\U0001F6FF];
|
||||
EBG = [\p{Word_Break = EBG}];
|
||||
|
||||
#define dictionary, with the effect being that those characters don't appear in test data.
|
||||
|
@ -63,7 +67,7 @@ WB3: CR LF;
|
|||
WB3a: (Newline | CR | LF) ÷;
|
||||
WB3b: . ÷ (Newline | CR | LF); # actually redundant? No other rule combines.
|
||||
# (but needed with UAX treat-as scheme.)
|
||||
WB3c: ZWJ (GAZ | EBG);
|
||||
WB3c: ZWJ (Extended_Pict | EmojiNRK);
|
||||
|
||||
WB5: AHLetter ExtFmt* AHLetter;
|
||||
|
||||
|
@ -85,13 +89,13 @@ WB13b: ExtendNumLet ExtFmt* (AHLetter | Numeric | Katakana);
|
|||
|
||||
# WB rule 15 - 17, pairs of Regional Indicators stay unbroken.
|
||||
# Interacts with WB3c.
|
||||
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (GAZ | EBG);
|
||||
WB15: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB17: Regional_Indicator ExtFmt* Regional_Indicator ExtFmt* ÷;
|
||||
|
||||
WB14: (E_Base | EBG) ExtFmt* E_Modifier;
|
||||
|
||||
# Rule WB 999 Any ÷ Any
|
||||
# Interacts with WB3c, do not break between ZWJ and (GAZ | EBG).
|
||||
WB999.1: . ExtFmt* ZWJ (GAZ | EBG);
|
||||
# Interacts with WB3c, do not break between ZWJ and (Extended_Pict | EmojiNRK).
|
||||
WB999.1: . ExtFmt* ZWJ (Extended_Pict | EmojiNRK);
|
||||
WB999.2: . ExtFmt* ÷;
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue