ICU-13770 Line Break update for Unicode 11 - revise rule LB8a.

X-SVN-Rev: 41425
This commit is contained in:
Andy Heninger 2018-05-21 23:47:40 +00:00
commit fc31932587
24 changed files with 462 additions and 502 deletions

View file

@ -7,12 +7,9 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
@ -72,9 +69,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -145,9 +139,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -321,13 +315,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -7,13 +7,10 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 2nd paragraph below.
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
@ -78,9 +75,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -151,9 +145,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -330,13 +324,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -8,12 +8,9 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
@ -81,9 +78,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -154,9 +148,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -333,13 +327,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -7,12 +7,9 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
@ -91,9 +88,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -164,9 +158,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -347,13 +341,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -7,13 +7,10 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 3rd paragraph below.
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
@ -77,9 +74,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -150,9 +144,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -332,13 +326,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -7,12 +7,9 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
@ -76,9 +73,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -149,9 +143,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -325,13 +319,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -7,12 +7,9 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# tailored as noted in 2nd paragraph below.
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
@ -79,9 +76,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -152,9 +146,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -331,13 +325,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -7,13 +7,10 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 37 for Unicode 9.0
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# http://www.unicode.org/reports/tr14/
# tailored as noted in 3rd paragraph below.
#
# Includes extensions to the handling of emoji ZWJ sequences from
# https://goo.gl/cluFCn
#
# TODO: Rule LB 8 remains as it was in Unicode 5.2
# This is only because of a limitation of ICU break engine implementation,
# not because the older behavior is desirable.
@ -76,9 +73,6 @@ $XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];
$ZWJ = [:LineBreak = ZWJ:];
$EmojiNRK = [[\p{Emoji}] - [$RI \u002a\u00230-9©®™〰〽]];
$Extended_Pict = [:ExtPict:];
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
@ -149,9 +143,9 @@ $CAN_CM $CM* [$SP $ZW];
$LB8Breaks = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
# LB 8a ZWJ x (ID | Extended_Pict | EmojiNRK) Emoji ZWJ sequences.
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
#
$ZWJ ($ID | $Extended_Pict | $EmojiNRK);
$ZWJ [^$CM];
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
# $CM not covered by the above needs to behave like $AL
@ -328,13 +322,13 @@ $IS $CM* ($ALPlus | $HL);
$CP $CM* ($ALPlus | $HL | $NU);
# LB 30a Do not break between regional indicators. Break after pairs of them.
# Tricky interaction with LB8a: ZWJ x ID
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* $ZWJ / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM $ID $Extended_Pict $EmojiNRK] {eof}];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM] {eof}];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS {eof}];
$RI $CM* $RI $CM* $ZWJ ($ID | $Extended_Pict | $EmojiNRK);
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
$EB $CM* $EM;

View file

@ -32,7 +32,7 @@
// intltest rbbi/RBBIMonkeyTest@loop=-1,rules=word.txt
// will run word break testing in an infinite loop.
// Summary of options
// test=name Test against the named reference rule file.
// rules=name Test against the named reference rule file.
// Files are found in source/test/testdata/break_rules
// loop=nnn Loop nnn times. -1 for no limit. loop of 1 is useful for debugging.
// seed=nnnn Random number generator seed. Allows recreation of a failure.

View file

@ -2551,8 +2551,6 @@ private:
UnicodeSet *fEB;
UnicodeSet *fEM;
UnicodeSet *fZJ;
UnicodeSet *fExtendedPict;
UnicodeSet *fEmojiNRK;
BreakIterator *fCharBI;
const UnicodeString *fText;
@ -2618,8 +2616,6 @@ RBBILineMonkey::RBBILineMonkey() :
fEB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EB}]"), status);
fEM = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
fZJ = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
fExtendedPict = new UnicodeSet(u"[:Extended_Pictographic:]", status);
if (U_FAILURE(status)) {
deferredStatus = status;
@ -2674,8 +2670,6 @@ RBBILineMonkey::RBBILineMonkey() :
fSets->addElement(fEB, status);
fSets->addElement(fEM, status);
fSets->addElement(fZJ, status);
fSets->addElement(fExtendedPict, status);
fSets->addElement(fEmojiNRK, status);
const char *rules =
@ -2863,14 +2857,40 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
break;
}
// LB 8a ZWJ x (ID | ExtendedPict | Emoji)
// LB 25 Numbers
// Move this test up, before LB8a, because numbers can match a longer sequence that would
// also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
if (fNumberMatcher->lookingAt(prevPos, status)) {
if (U_FAILURE(status)) {
break;
}
// Matched a number. But could have been just a single digit, which would
// not represent a "no break here" between prevChar and thisChar
int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
if (numEndIdx > pos) {
// Number match includes at least our two chars being checked
if (numEndIdx > nextPos) {
// Number match includes additional chars. Update pos and nextPos
// so that next loop iteration will continue at the end of the number,
// checking for breaks between last char in number & whatever follows.
pos = nextPos = numEndIdx;
do {
pos = fText->moveIndex32(pos, -1);
thisChar = fText->char32At(pos);
} while (fCM->contains(thisChar));
}
continue;
}
}
// LB 8a ZWJ x
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZJ.
{
int32_t prevIdx = fText->moveIndex32(pos, -1);
UChar32 prevC = fText->char32At(prevIdx);
if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
if (fZJ->contains(prevC)) {
continue;
}
}
@ -3070,32 +3090,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
continue;
}
// LB 25 Numbers
if (fNumberMatcher->lookingAt(prevPos, status)) {
if (U_FAILURE(status)) {
break;
}
// Matched a number. But could have been just a single digit, which would
// not represent a "no break here" between prevChar and thisChar
int32_t numEndIdx = fNumberMatcher->end(status); // idx of first char following num
if (numEndIdx > pos) {
// Number match includes at least our two chars being checked
if (numEndIdx > nextPos) {
// Number match includes additional chars. Update pos and nextPos
// so that next loop iteration will continue at the end of the number,
// checking for breaks between last char in number & whatever follows.
pos = nextPos = numEndIdx;
do {
pos = fText->moveIndex32(pos, -1);
thisChar = fText->char32At(pos);
} while (fCM->contains(thisChar));
}
continue;
}
}
// LB 25 numbers match, moved up, before LB 8a,
// LB 26 Do not break a Korean syllable.
if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
@ -3226,8 +3221,6 @@ RBBILineMonkey::~RBBILineMonkey() {
delete fEB;
delete fEM;
delete fZJ;
delete fExtendedPict;
delete fEmojiNRK;
delete fCharBI;
delete fNumberMatcher;

View file

@ -1,5 +1,5 @@
# LineBreakTest-11.0.0.txt
# Date: 2018-01-31, 08:20:17 GMT
# Date: 2018-05-20, 09:03:09 GMT
# © 2018 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
@ -6242,174 +6242,174 @@
× 0001 × 0020 ÷ 3041 ÷ # × [0.3] <START OF HEADING> (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 0001 × 0308 × 3041 ÷ # × [0.3] <START OF HEADING> (CM1_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [21.03] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 0001 × 0308 × 0020 ÷ 3041 ÷ # × [0.3] <START OF HEADING> (CM1_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 200D × 0023 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [28.0] NUMBER SIGN (AL) ÷ [0.3]
× 200D × 0023 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] NUMBER SIGN (AL) ÷ [0.3]
× 200D × 0020 ÷ 0023 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] NUMBER SIGN (AL) ÷ [0.3]
× 200D × 0308 × 0023 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [28.0] NUMBER SIGN (AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0023 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] NUMBER SIGN (AL) ÷ [0.3]
× 200D ÷ 2014 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] EM DASH (B2) ÷ [0.3]
× 200D × 0308 × 0023 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [28.0] NUMBER SIGN (AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0023 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] NUMBER SIGN (AL) ÷ [0.3]
× 200D × 2014 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] EM DASH (B2) ÷ [0.3]
× 200D × 0020 ÷ 2014 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] EM DASH (B2) ÷ [0.3]
× 200D × 0308 ÷ 2014 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] EM DASH (B2) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 2014 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] EM DASH (B2) ÷ [0.3]
× 200D × 0009 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [21.01] <CHARACTER TABULATION> (BA) ÷ [0.3]
× 200D × 0308 ÷ 2014 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] EM DASH (B2) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 2014 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] EM DASH (B2) ÷ [0.3]
× 200D × 0009 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] <CHARACTER TABULATION> (BA) ÷ [0.3]
× 200D × 0020 ÷ 0009 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] <CHARACTER TABULATION> (BA) ÷ [0.3]
× 200D × 0308 × 0009 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [21.01] <CHARACTER TABULATION> (BA) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0009 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] <CHARACTER TABULATION> (BA) ÷ [0.3]
× 200D ÷ 00B4 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] ACUTE ACCENT (BB) ÷ [0.3]
× 200D × 0308 × 0009 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [21.01] <CHARACTER TABULATION> (BA) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0009 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] <CHARACTER TABULATION> (BA) ÷ [0.3]
× 200D × 00B4 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] ACUTE ACCENT (BB) ÷ [0.3]
× 200D × 0020 ÷ 00B4 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] ACUTE ACCENT (BB) ÷ [0.3]
× 200D × 0308 ÷ 00B4 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] ACUTE ACCENT (BB) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 00B4 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] ACUTE ACCENT (BB) ÷ [0.3]
× 200D × 0308 ÷ 00B4 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] ACUTE ACCENT (BB) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 00B4 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] ACUTE ACCENT (BB) ÷ [0.3]
× 200D × 000B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [6.0] <LINE TABULATION> (BK) ÷ [0.3]
× 200D × 0020 × 000B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [6.0] <LINE TABULATION> (BK) ÷ [0.3]
× 200D × 0308 × 000B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [6.0] <LINE TABULATION> (BK) ÷ [0.3]
× 200D × 0308 × 0020 × 000B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <LINE TABULATION> (BK) ÷ [0.3]
× 200D ÷ FFFC ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [20.01] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× 200D × 0308 × 000B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [6.0] <LINE TABULATION> (BK) ÷ [0.3]
× 200D × 0308 × 0020 × 000B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <LINE TABULATION> (BK) ÷ [0.3]
× 200D × FFFC ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× 200D × 0020 ÷ FFFC ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× 200D × 0308 ÷ FFFC ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [20.01] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× 200D × 0308 × 0020 ÷ FFFC ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× 200D × 007D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [13.04] RIGHT CURLY BRACKET (CL) ÷ [0.3]
× 200D × 0308 ÷ FFFC ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [20.01] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× 200D × 0308 × 0020 ÷ FFFC ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× 200D × 007D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] RIGHT CURLY BRACKET (CL) ÷ [0.3]
× 200D × 0020 × 007D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [13.02] RIGHT CURLY BRACKET (CL) ÷ [0.3]
× 200D × 0308 × 007D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [13.04] RIGHT CURLY BRACKET (CL) ÷ [0.3]
× 200D × 0308 × 0020 × 007D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] RIGHT CURLY BRACKET (CL) ÷ [0.3]
× 200D × 0029 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [13.04] RIGHT PARENTHESIS (CP) ÷ [0.3]
× 200D × 0308 × 007D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [13.04] RIGHT CURLY BRACKET (CL) ÷ [0.3]
× 200D × 0308 × 0020 × 007D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] RIGHT CURLY BRACKET (CL) ÷ [0.3]
× 200D × 0029 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] RIGHT PARENTHESIS (CP) ÷ [0.3]
× 200D × 0020 × 0029 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [13.02] RIGHT PARENTHESIS (CP) ÷ [0.3]
× 200D × 0308 × 0029 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [13.04] RIGHT PARENTHESIS (CP) ÷ [0.3]
× 200D × 0308 × 0020 × 0029 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] RIGHT PARENTHESIS (CP) ÷ [0.3]
× 200D × 0308 × 0029 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [13.04] RIGHT PARENTHESIS (CP) ÷ [0.3]
× 200D × 0308 × 0020 × 0029 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] RIGHT PARENTHESIS (CP) ÷ [0.3]
× 200D × 000D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [6.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
× 200D × 0020 × 000D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [6.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
× 200D × 0308 × 000D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [6.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
× 200D × 0308 × 0020 × 000D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
× 200D × 0021 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [13.01] EXCLAMATION MARK (EX) ÷ [0.3]
× 200D × 0308 × 000D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [6.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
× 200D × 0308 × 0020 × 000D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <CARRIAGE RETURN (CR)> (CR) ÷ [0.3]
× 200D × 0021 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] EXCLAMATION MARK (EX) ÷ [0.3]
× 200D × 0020 × 0021 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [13.01] EXCLAMATION MARK (EX) ÷ [0.3]
× 200D × 0308 × 0021 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [13.01] EXCLAMATION MARK (EX) ÷ [0.3]
× 200D × 0308 × 0020 × 0021 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.01] EXCLAMATION MARK (EX) ÷ [0.3]
× 200D × 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [12.3] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 0308 × 0021 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [13.01] EXCLAMATION MARK (EX) ÷ [0.3]
× 200D × 0308 × 0020 × 0021 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.01] EXCLAMATION MARK (EX) ÷ [0.3]
× 200D × 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 0020 ÷ 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 0308 × 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [12.3] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D ÷ AC00 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] HANGUL SYLLABLE GA (H2) ÷ [0.3]
× 200D × 0308 × 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [12.3] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × AC00 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HANGUL SYLLABLE GA (H2) ÷ [0.3]
× 200D × 0020 ÷ AC00 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GA (H2) ÷ [0.3]
× 200D × 0308 ÷ AC00 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL SYLLABLE GA (H2) ÷ [0.3]
× 200D × 0308 × 0020 ÷ AC00 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GA (H2) ÷ [0.3]
× 200D ÷ AC01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3]
× 200D × 0308 ÷ AC00 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL SYLLABLE GA (H2) ÷ [0.3]
× 200D × 0308 × 0020 ÷ AC00 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GA (H2) ÷ [0.3]
× 200D × AC01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HANGUL SYLLABLE GAG (H3) ÷ [0.3]
× 200D × 0020 ÷ AC01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3]
× 200D × 0308 ÷ AC01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3]
× 200D × 0308 × 0020 ÷ AC01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3]
× 200D × 05D0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [28.0] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 200D × 0308 ÷ AC01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3]
× 200D × 0308 × 0020 ÷ AC01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3]
× 200D × 05D0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 200D × 0020 ÷ 05D0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 200D × 0308 × 05D0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [28.0] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 05D0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 200D × 002D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [21.02] HYPHEN-MINUS (HY) ÷ [0.3]
× 200D × 0308 × 05D0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [28.0] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 05D0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HEBREW LETTER ALEF (HL) ÷ [0.3]
× 200D × 002D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HYPHEN-MINUS (HY) ÷ [0.3]
× 200D × 0020 ÷ 002D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HYPHEN-MINUS (HY) ÷ [0.3]
× 200D × 0308 × 002D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [21.02] HYPHEN-MINUS (HY) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 002D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HYPHEN-MINUS (HY) ÷ [0.3]
× 200D × 0308 × 002D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [21.02] HYPHEN-MINUS (HY) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 002D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HYPHEN-MINUS (HY) ÷ [0.3]
× 200D × 231A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] WATCH (ID) ÷ [0.3]
× 200D × 0020 ÷ 231A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] WATCH (ID) ÷ [0.3]
× 200D × 0308 ÷ 231A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] WATCH (ID) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 231A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] WATCH (ID) ÷ [0.3]
× 200D × 2024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [22.01] ONE DOT LEADER (IN) ÷ [0.3]
× 200D × 0308 ÷ 231A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] WATCH (ID) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 231A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] WATCH (ID) ÷ [0.3]
× 200D × 2024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] ONE DOT LEADER (IN) ÷ [0.3]
× 200D × 0020 ÷ 2024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] ONE DOT LEADER (IN) ÷ [0.3]
× 200D × 0308 × 2024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [22.01] ONE DOT LEADER (IN) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 2024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] ONE DOT LEADER (IN) ÷ [0.3]
× 200D × 002C ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [13.04] COMMA (IS) ÷ [0.3]
× 200D × 0308 × 2024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [22.01] ONE DOT LEADER (IN) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 2024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] ONE DOT LEADER (IN) ÷ [0.3]
× 200D × 002C ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMMA (IS) ÷ [0.3]
× 200D × 0020 × 002C ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [13.02] COMMA (IS) ÷ [0.3]
× 200D × 0308 × 002C ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [13.04] COMMA (IS) ÷ [0.3]
× 200D × 0308 × 0020 × 002C ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] COMMA (IS) ÷ [0.3]
× 200D ÷ 1100 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] HANGUL CHOSEONG KIYEOK (JL) ÷ [0.3]
× 200D × 0308 × 002C ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [13.04] COMMA (IS) ÷ [0.3]
× 200D × 0308 × 0020 × 002C ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] COMMA (IS) ÷ [0.3]
× 200D × 1100 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HANGUL CHOSEONG KIYEOK (JL) ÷ [0.3]
× 200D × 0020 ÷ 1100 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL CHOSEONG KIYEOK (JL) ÷ [0.3]
× 200D × 0308 ÷ 1100 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL CHOSEONG KIYEOK (JL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1100 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL CHOSEONG KIYEOK (JL) ÷ [0.3]
× 200D ÷ 11A8 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] HANGUL JONGSEONG KIYEOK (JT) ÷ [0.3]
× 200D × 0308 ÷ 1100 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL CHOSEONG KIYEOK (JL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1100 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL CHOSEONG KIYEOK (JL) ÷ [0.3]
× 200D × 11A8 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HANGUL JONGSEONG KIYEOK (JT) ÷ [0.3]
× 200D × 0020 ÷ 11A8 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL JONGSEONG KIYEOK (JT) ÷ [0.3]
× 200D × 0308 ÷ 11A8 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL JONGSEONG KIYEOK (JT) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 11A8 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL JONGSEONG KIYEOK (JT) ÷ [0.3]
× 200D ÷ 1160 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] HANGUL JUNGSEONG FILLER (JV) ÷ [0.3]
× 200D × 0308 ÷ 11A8 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL JONGSEONG KIYEOK (JT) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 11A8 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL JONGSEONG KIYEOK (JT) ÷ [0.3]
× 200D × 1160 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HANGUL JUNGSEONG FILLER (JV) ÷ [0.3]
× 200D × 0020 ÷ 1160 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL JUNGSEONG FILLER (JV) ÷ [0.3]
× 200D × 0308 ÷ 1160 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL JUNGSEONG FILLER (JV) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1160 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL JUNGSEONG FILLER (JV) ÷ [0.3]
× 200D × 0308 ÷ 1160 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] HANGUL JUNGSEONG FILLER (JV) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1160 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL JUNGSEONG FILLER (JV) ÷ [0.3]
× 200D × 000A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [6.0] <LINE FEED (LF)> (LF) ÷ [0.3]
× 200D × 0020 × 000A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [6.0] <LINE FEED (LF)> (LF) ÷ [0.3]
× 200D × 0308 × 000A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [6.0] <LINE FEED (LF)> (LF) ÷ [0.3]
× 200D × 0308 × 0020 × 000A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <LINE FEED (LF)> (LF) ÷ [0.3]
× 200D × 0308 × 000A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [6.0] <LINE FEED (LF)> (LF) ÷ [0.3]
× 200D × 0308 × 0020 × 000A ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <LINE FEED (LF)> (LF) ÷ [0.3]
× 200D × 0085 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [6.0] <NEXT LINE (NEL)> (NL) ÷ [0.3]
× 200D × 0020 × 0085 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [6.0] <NEXT LINE (NEL)> (NL) ÷ [0.3]
× 200D × 0308 × 0085 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [6.0] <NEXT LINE (NEL)> (NL) ÷ [0.3]
× 200D × 0308 × 0020 × 0085 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <NEXT LINE (NEL)> (NL) ÷ [0.3]
× 200D × 17D6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [21.03] KHMER SIGN CAMNUC PII KUUH (NS) ÷ [0.3]
× 200D × 0308 × 0085 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [6.0] <NEXT LINE (NEL)> (NL) ÷ [0.3]
× 200D × 0308 × 0020 × 0085 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [6.0] <NEXT LINE (NEL)> (NL) ÷ [0.3]
× 200D × 17D6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] KHMER SIGN CAMNUC PII KUUH (NS) ÷ [0.3]
× 200D × 0020 ÷ 17D6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] KHMER SIGN CAMNUC PII KUUH (NS) ÷ [0.3]
× 200D × 0308 × 17D6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [21.03] KHMER SIGN CAMNUC PII KUUH (NS) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 17D6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] KHMER SIGN CAMNUC PII KUUH (NS) ÷ [0.3]
× 200D × 0030 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [23.02] DIGIT ZERO (NU) ÷ [0.3]
× 200D × 0308 × 17D6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [21.03] KHMER SIGN CAMNUC PII KUUH (NS) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 17D6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] KHMER SIGN CAMNUC PII KUUH (NS) ÷ [0.3]
× 200D × 0030 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] DIGIT ZERO (NU) ÷ [0.3]
× 200D × 0020 ÷ 0030 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] DIGIT ZERO (NU) ÷ [0.3]
× 200D × 0308 × 0030 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [23.02] DIGIT ZERO (NU) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0030 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] DIGIT ZERO (NU) ÷ [0.3]
× 200D × 0028 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [30.01] LEFT PARENTHESIS (OP) ÷ [0.3]
× 200D × 0308 × 0030 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [23.02] DIGIT ZERO (NU) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0030 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] DIGIT ZERO (NU) ÷ [0.3]
× 200D × 0028 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] LEFT PARENTHESIS (OP) ÷ [0.3]
× 200D × 0020 ÷ 0028 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] LEFT PARENTHESIS (OP) ÷ [0.3]
× 200D × 0308 × 0028 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [30.01] LEFT PARENTHESIS (OP) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0028 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] LEFT PARENTHESIS (OP) ÷ [0.3]
× 200D × 0025 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [24.03] PERCENT SIGN (PO) ÷ [0.3]
× 200D × 0308 × 0028 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [30.01] LEFT PARENTHESIS (OP) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0028 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] LEFT PARENTHESIS (OP) ÷ [0.3]
× 200D × 0025 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] PERCENT SIGN (PO) ÷ [0.3]
× 200D × 0020 ÷ 0025 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] PERCENT SIGN (PO) ÷ [0.3]
× 200D × 0308 × 0025 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [24.03] PERCENT SIGN (PO) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0025 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] PERCENT SIGN (PO) ÷ [0.3]
× 200D × 0024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [24.03] DOLLAR SIGN (PR) ÷ [0.3]
× 200D × 0308 × 0025 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [24.03] PERCENT SIGN (PO) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0025 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] PERCENT SIGN (PO) ÷ [0.3]
× 200D × 0024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] DOLLAR SIGN (PR) ÷ [0.3]
× 200D × 0020 ÷ 0024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] DOLLAR SIGN (PR) ÷ [0.3]
× 200D × 0308 × 0024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [24.03] DOLLAR SIGN (PR) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] DOLLAR SIGN (PR) ÷ [0.3]
× 200D × 0022 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [19.01] QUOTATION MARK (QU) ÷ [0.3]
× 200D × 0308 × 0024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [24.03] DOLLAR SIGN (PR) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0024 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] DOLLAR SIGN (PR) ÷ [0.3]
× 200D × 0022 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] QUOTATION MARK (QU) ÷ [0.3]
× 200D × 0020 ÷ 0022 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] QUOTATION MARK (QU) ÷ [0.3]
× 200D × 0308 × 0022 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [19.01] QUOTATION MARK (QU) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0022 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] QUOTATION MARK (QU) ÷ [0.3]
× 200D × 0308 × 0022 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [19.01] QUOTATION MARK (QU) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0022 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] QUOTATION MARK (QU) ÷ [0.3]
× 200D × 0020 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [0.3]
× 200D × 0020 × 0020 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [7.01] SPACE (SP) ÷ [0.3]
× 200D × 0308 × 0020 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [0.3]
× 200D × 0308 × 0020 × 0020 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [7.01] SPACE (SP) ÷ [0.3]
× 200D × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [13.04] SOLIDUS (SY) ÷ [0.3]
× 200D × 0308 × 0020 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [0.3]
× 200D × 0308 × 0020 × 0020 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [7.01] SPACE (SP) ÷ [0.3]
× 200D × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] SOLIDUS (SY) ÷ [0.3]
× 200D × 0020 × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [13.02] SOLIDUS (SY) ÷ [0.3]
× 200D × 0308 × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [13.04] SOLIDUS (SY) ÷ [0.3]
× 200D × 0308 × 0020 × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] SOLIDUS (SY) ÷ [0.3]
× 200D × 2060 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [11.01] WORD JOINER (WJ) ÷ [0.3]
× 200D × 0308 × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [13.04] SOLIDUS (SY) ÷ [0.3]
× 200D × 0308 × 0020 × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [13.02] SOLIDUS (SY) ÷ [0.3]
× 200D × 2060 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] WORD JOINER (WJ) ÷ [0.3]
× 200D × 0020 × 2060 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [11.01] WORD JOINER (WJ) ÷ [0.3]
× 200D × 0308 × 2060 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [11.01] WORD JOINER (WJ) ÷ [0.3]
× 200D × 0308 × 0020 × 2060 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [11.01] WORD JOINER (WJ) ÷ [0.3]
× 200D × 0308 × 2060 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [11.01] WORD JOINER (WJ) ÷ [0.3]
× 200D × 0308 × 0020 × 2060 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [11.01] WORD JOINER (WJ) ÷ [0.3]
× 200D × 200B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [0.3]
× 200D × 0020 × 200B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [0.3]
× 200D × 0308 × 200B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [0.3]
× 200D × 0308 × 0020 × 200B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [0.3]
× 200D ÷ 1F1E6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
× 200D × 0308 × 200B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [0.3]
× 200D × 0308 × 0020 × 200B ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) × [7.02] ZERO WIDTH SPACE (ZW) ÷ [0.3]
× 200D × 1F1E6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
× 200D × 0020 ÷ 1F1E6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
× 200D × 0308 ÷ 1F1E6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1F1E6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
× 200D × 0308 ÷ 1F1E6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1F1E6 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] REGIONAL INDICATOR SYMBOL LETTER A (RI) ÷ [0.3]
× 200D × 261D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] WHITE UP POINTING INDEX (EB) ÷ [0.3]
× 200D × 0020 ÷ 261D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] WHITE UP POINTING INDEX (EB) ÷ [0.3]
× 200D × 0308 ÷ 261D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] WHITE UP POINTING INDEX (EB) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 261D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] WHITE UP POINTING INDEX (EB) ÷ [0.3]
× 200D × 0308 ÷ 261D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] WHITE UP POINTING INDEX (EB) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 261D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] WHITE UP POINTING INDEX (EB) ÷ [0.3]
× 200D × 1F3FB ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (EM) ÷ [0.3]
× 200D × 0020 ÷ 1F3FB ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (EM) ÷ [0.3]
× 200D × 0308 ÷ 1F3FB ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (EM) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1F3FB ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (EM) ÷ [0.3]
× 200D × 0001 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] <START OF HEADING> (CM1_CM) ÷ [0.3]
× 200D × 0308 ÷ 1F3FB ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) ÷ [999.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (EM) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 1F3FB ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] EMOJI MODIFIER FITZPATRICK TYPE-1-2 (EM) ÷ [0.3]
× 200D × 0001 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] <START OF HEADING> (CM1_CM) ÷ [0.3]
× 200D × 0020 ÷ 0001 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] <START OF HEADING> (CM1_CM) ÷ [0.3]
× 200D × 0308 × 0001 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [9.0] <START OF HEADING> (CM1_CM) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0001 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] <START OF HEADING> (CM1_CM) ÷ [0.3]
× 200D × 200D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [0.3]
× 200D × 0308 × 0001 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [9.0] <START OF HEADING> (CM1_CM) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0001 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] <START OF HEADING> (CM1_CM) ÷ [0.3]
× 200D × 200D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [0.3]
× 200D × 0020 ÷ 200D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [0.3]
× 200D × 0308 × 200D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [9.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 200D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [0.3]
× 200D × 00A7 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [28.0] SECTION SIGN (AI_AL) ÷ [0.3]
× 200D × 0308 × 200D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [9.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 200D ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) ÷ [0.3]
× 200D × 00A7 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] SECTION SIGN (AI_AL) ÷ [0.3]
× 200D × 0020 ÷ 00A7 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] SECTION SIGN (AI_AL) ÷ [0.3]
× 200D × 0308 × 00A7 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [28.0] SECTION SIGN (AI_AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 00A7 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] SECTION SIGN (AI_AL) ÷ [0.3]
× 200D × 50005 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [28.0] <reserved-50005> (XX_AL) ÷ [0.3]
× 200D × 0308 × 00A7 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [28.0] SECTION SIGN (AI_AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 00A7 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] SECTION SIGN (AI_AL) ÷ [0.3]
× 200D × 50005 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] <reserved-50005> (XX_AL) ÷ [0.3]
× 200D × 0020 ÷ 50005 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] <reserved-50005> (XX_AL) ÷ [0.3]
× 200D × 0308 × 50005 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [28.0] <reserved-50005> (XX_AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 50005 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] <reserved-50005> (XX_AL) ÷ [0.3]
× 200D × 0E01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [28.0] THAI CHARACTER KO KAI (SA_AL) ÷ [0.3]
× 200D × 0308 × 50005 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [28.0] <reserved-50005> (XX_AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 50005 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] <reserved-50005> (XX_AL) ÷ [0.3]
× 200D × 0E01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] THAI CHARACTER KO KAI (SA_AL) ÷ [0.3]
× 200D × 0020 ÷ 0E01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] THAI CHARACTER KO KAI (SA_AL) ÷ [0.3]
× 200D × 0308 × 0E01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [28.0] THAI CHARACTER KO KAI (SA_AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0E01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] THAI CHARACTER KO KAI (SA_AL) ÷ [0.3]
× 200D × 3041 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [21.03] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 200D × 0308 × 0E01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [28.0] THAI CHARACTER KO KAI (SA_AL) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 0E01 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] THAI CHARACTER KO KAI (SA_AL) ÷ [0.3]
× 200D × 3041 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 200D × 0020 ÷ 3041 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [18.0] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 200D × 0308 × 3041 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [21.03] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 3041 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 200D × 0308 × 3041 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [21.03] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 200D × 0308 × 0020 ÷ 3041 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 00A7 × 0023 ÷ # × [0.3] SECTION SIGN (AI_AL) × [28.0] NUMBER SIGN (AL) ÷ [0.3]
× 00A7 × 0020 ÷ 0023 ÷ # × [0.3] SECTION SIGN (AI_AL) × [7.01] SPACE (SP) ÷ [18.0] NUMBER SIGN (AL) ÷ [0.3]
× 00A7 × 0308 × 0023 ÷ # × [0.3] SECTION SIGN (AI_AL) × [9.0] COMBINING DIAERESIS (CM1_CM) × [28.0] NUMBER SIGN (AL) ÷ [0.3]
@ -7084,7 +7084,7 @@
× 3041 × 0308 × 0020 ÷ 3041 ÷ # × [0.3] HIRAGANA LETTER SMALL A (CJ_NS) × [9.0] COMBINING DIAERESIS (CM1_CM) × [7.01] SPACE (SP) ÷ [18.0] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 000D × 000A ÷ 0061 × 000A ÷ 0308 ÷ # × [0.3] <CARRIAGE RETURN (CR)> (CR) × [5.01] <LINE FEED (LF)> (LF) ÷ [5.03] LATIN SMALL LETTER A (AL) × [6.0] <LINE FEED (LF)> (LF) ÷ [5.03] COMBINING DIAERESIS (CM1_CM) ÷ [0.3]
× 0061 × 0308 ÷ # × [0.3] LATIN SMALL LETTER A (AL) × [9.0] COMBINING DIAERESIS (CM1_CM) ÷ [0.3]
× 0020 ÷ 200D × 0646 ÷ # × [0.3] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [28.0] ARABIC LETTER NOON (AL) ÷ [0.3]
× 0020 ÷ 200D × 0646 ÷ # × [0.3] SPACE (SP) ÷ [18.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] ARABIC LETTER NOON (AL) ÷ [0.3]
× 0646 × 200D × 0020 ÷ # × [0.3] ARABIC LETTER NOON (AL) × [9.0] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [7.01] SPACE (SP) ÷ [0.3]
× 000B ÷ 3041 ÷ # × [0.3] <LINE TABULATION> (BK) ÷ [4.0] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 000D ÷ 3041 ÷ # × [0.3] <CARRIAGE RETURN (CR)> (CR) ÷ [5.02] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
@ -7093,8 +7093,8 @@
× 3041 × 2060 ÷ # × [0.3] HIRAGANA LETTER SMALL A (CJ_NS) × [11.01] WORD JOINER (WJ) ÷ [0.3]
× 2060 × 3041 ÷ # × [0.3] WORD JOINER (WJ) × [11.02] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]
× 3041 × 0308 × 00A0 ÷ # × [0.3] HIRAGANA LETTER SMALL A (CJ_NS) × [9.0] COMBINING DIAERESIS (CM1_CM) × [12.2] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [12.3] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [13.04] SOLIDUS (SY) ÷ [0.3]
× 200D × 00A0 ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] NO-BREAK SPACE (GL) ÷ [0.3]
× 200D × 002F ÷ # × [0.3] ZERO WIDTH JOINER (ZWJ_O_ZWJ_CM) × [8.1] SOLIDUS (SY) ÷ [0.3]
× 2014 × 2014 ÷ # × [0.3] EM DASH (B2) × [17.0] EM DASH (B2) ÷ [0.3]
× 3041 ÷ FFFC ÷ # × [0.3] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [20.01] OBJECT REPLACEMENT CHARACTER (CB) ÷ [0.3]
× FFFC ÷ 3041 ÷ # × [0.3] OBJECT REPLACEMENT CHARACTER (CB) ÷ [20.02] HIRAGANA LETTER SMALL A (CJ_NS) ÷ [0.3]

View file

@ -5,7 +5,8 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -59,9 +60,6 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -97,8 +95,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -107,7 +107,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -133,12 +133,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -185,15 +187,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -67,9 +68,6 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -105,8 +103,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -115,7 +115,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -141,12 +141,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -193,15 +195,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -84,9 +85,6 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -122,8 +120,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -132,7 +132,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -158,12 +158,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -214,15 +216,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_normal.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -73,9 +74,6 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -111,8 +109,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -121,7 +121,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -147,12 +147,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -199,15 +201,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_normal_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -75,9 +76,6 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
@ -116,8 +114,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -126,7 +126,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -152,12 +152,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -208,15 +210,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* [CM-ZWJ] ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4bd87b532fc7ad362740dde413999961c7f372cbbce5fb54160d201b783fec33
size 12503004
oid sha256:e7ec8322573fc8fddb5d6315aaab644a653821edddb021a8b1e68d20ecb8ba02
size 12500142

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5c25f29e8e9f5b7244a63ddc48dd7d69f56612b310e1de8351c9ea80a84afc6f
oid sha256:73151064e196a8b53c7929eef7583108e656e25081d41e45fe0e31b3e2efbc29
size 92867

View file

@ -646,8 +646,6 @@ public class RBBITestMonkey extends TestFmwk {
UnicodeSet fEB;
UnicodeSet fEM;
UnicodeSet fZWJ;
UnicodeSet fExtendedPict;
UnicodeSet fEmojiNRK;
StringBuffer fText;
int fOrigPositions;
@ -701,9 +699,6 @@ public class RBBITestMonkey extends TestFmwk {
fEB = new UnicodeSet("[\\p{Line_break=EB}]");
fEM = new UnicodeSet("[\\p{Line_break=EM}]");
fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]");
fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]");
fExtendedPict = new UnicodeSet("[:Extended_Pictographic:]");
// Remove dictionary characters.
// The monkey test reference implementation of line break does not replicate the dictionary behavior,
@ -760,8 +755,6 @@ public class RBBITestMonkey extends TestFmwk {
fSets.add(fEB);
fSets.add(fEM);
fSets.add(fZWJ);
fSets.add(fExtendedPict);
fSets.add(fEmojiNRK);
}
@Override
@ -897,13 +890,39 @@ public class RBBITestMonkey extends TestFmwk {
break;
}
// LB 25 Numbers
// Move this test up, before LB8a, because numbers can match a longer sequence that would
// also match 8a. e.g. NU ZWJ IS PO (ZWJ acts like CM)
matchVals = LBNumberCheck(fText, prevPos, matchVals);
if (matchVals[0] != -1) {
// Matched a number. But could have been just a single digit, which would
// not represent a "no break here" between prevChar and thisChar
int numEndIdx = matchVals[1]; // idx of first char following num
if (numEndIdx > pos) {
// Number match includes at least the two chars being checked
if (numEndIdx > nextPos) {
// Number match includes additional chars. Update pos and nextPos
// so that next loop iteration will continue at the end of the number,
// checking for breaks between last char in number & whatever follows.
nextPos = numEndIdx;
pos = numEndIdx;
do {
pos = moveIndex32(fText, pos, -1);
thisChar = UTF16.charAt(fText, pos);
}
while (fCM.contains(thisChar));
}
continue;
}
}
// LB 8a: ZWJ x (ID | Extended_Pictographic | Emoji)
// The monkey test's way of ignoring combining characters doesn't work
// for this rule. ZWJ is also a CM. Need to get the actual character
// preceding "thisChar", not ignoring combining marks, possibly ZWJ.
{
int prevC = fText.codePointBefore(pos);
if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
if (fZWJ.contains(prevC)) {
continue;
}
}
@ -1088,31 +1107,7 @@ public class RBBITestMonkey extends TestFmwk {
continue;
}
// LB 25 Numbers
matchVals = LBNumberCheck(fText, prevPos, matchVals);
if (matchVals[0] != -1) {
// Matched a number. But could have been just a single digit, which would
// not represent a "no break here" between prevChar and thisChar
int numEndIdx = matchVals[1]; // idx of first char following num
if (numEndIdx > pos) {
// Number match includes at least the two chars being checked
if (numEndIdx > nextPos) {
// Number match includes additional chars. Update pos and nextPos
// so that next loop iteration will continue at the end of the number,
// checking for breaks between last char in number & whatever follows.
nextPos = numEndIdx;
pos = numEndIdx;
do {
pos = moveIndex32(fText, pos, -1);
thisChar = UTF16.charAt(fText, pos);
}
while (fCM.contains(thisChar));
}
continue;
}
}
// LB 25 Numbers match, moved up, before LB 8a.
// LB 26 Do not break Korean Syllables
if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||

View file

@ -5,7 +5,8 @@
# file: line.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -24,7 +25,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -59,16 +60,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -97,8 +95,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -107,7 +107,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -133,12 +133,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -185,15 +187,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_loose.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -31,7 +32,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -67,16 +68,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -105,8 +103,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -115,7 +115,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -141,12 +141,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -193,15 +195,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_loose_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -45,7 +46,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -84,16 +85,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -122,8 +120,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -132,7 +132,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -158,12 +158,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -214,15 +216,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_normal.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -38,7 +39,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -73,16 +74,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -111,8 +109,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -121,7 +121,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -147,12 +147,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -199,15 +201,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;

View file

@ -5,7 +5,8 @@
#
# file: line_normal_cj.txt
#
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest
# Reference Line Break rules for intltest rbbi/RBBIMonkeyTest.
# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
@ -39,7 +40,7 @@ B2 = [:LineBreak = Break_Both:];
CB = [:LineBreak = Contingent_Break:];
CJ = [:LineBreak = Conditional_Japanese_Starter:];
CL = [:LineBreak = Close_Punctuation:];
CM = [:LineBreak = Combining_Mark:];
CM_ = [:LineBreak = Combining_Mark:];
CP = [:LineBreak = Close_Parenthesis:];
CR = [:LineBreak = Carriage_Return:];
EB = [:LineBreak = EB:];
@ -75,16 +76,13 @@ XX = [:LineBreak = Unknown:];
ZW = [:LineBreak = ZWSpace:];
ZWJ = [:LineBreak = ZWJ:];
EmojiNRK = [[\p{Emoji}] - [[RI]\u002a\u00230-9©®™〰〽]];
Extended_Pict = [:ExtPict:];
# LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes
AL = [AL AI SG XX ];
dictionary = SA;
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
# list it in the numerous rules that use CM.
CM = [CM ZWJ];
CM = [CM_ ZWJ];
LB4: BK ÷;
LB5: CR LF;
@ -116,8 +114,10 @@ LB7.2: [ZW SP] [SP ZW];
LB8: ZW ÷;
# LB8a
# ZWJ x (ID | Extended_Pict | EmojiNRK)
LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
# ZWJ x
# Don't match a CM on the right - let other rules pick up CM sequences, where
# the ZWJ behaves as just another generic CM.
LB8a: ZWJ [^CM];
# LB9: X CM -> X
@ -126,7 +126,7 @@ LB8a: ZWJ (ID | Extended_Pict | EmojiNRK);
#LB11: × WJ;
# WJ ×
LB11.1: [^BK CR LF NL SP ZW] CM* WJ;
LB11.1: [^SP] CM* WJ;
LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
@ -152,12 +152,14 @@ LB19: . CM* QU;
LB19.1: QU CM* [^CM];
# LB 20 Break before and after CB.
# Interaction with LB8a: ZWJ x ID is tricky because CM includes ZWJ.
# Interaction with LB8a: ZWJ x . is tricky because CM includes ZWJ.
# ZWJ acts like a CM to the left, combining with CB.
# ZWJ acts independently to the right, no break from ID by LB8a.
LB20: . CM* ÷ CB;
LB20.1a: CB CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB20.1b: CB CM* ÷;
# ZWJ acts independently to the right, no break after by LB8a.
LB20.1: . CM* ZWJ CB;
LB20.2: . CM* ÷ CB;
LB20.3: CB CM* ZWJ [^CM];
LB20.4: CB CM* ÷;
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
@ -208,15 +210,15 @@ LB29: IS CM* (AL | HL);
LB30.1: (AL | CM | HL | NU) CM* OP;
LB30.2: CP CM* (AL | HL | NU);
# LB31 keep pairs of RI together.
LB30a.1: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS];
LB30a.2: RI CM* RI CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB30a.3: RI CM* RI CM* ÷;
# LB30a keep pairs of RI together.
LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.2: RI CM* RI CM* CM_ ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM];
LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?;
# LB30b Do not break between Emoji Base and Emoji Modifier
LB30b: EB CM* EM;
# LB31 Break Everywhere Else.
# Include combining marks
LB31.1: . CM* ZWJ (ID | Extended_Pict | EmojiNRK);
LB31.1: . CM* ZWJ [^CM];
LB31.2: . CM* ÷;