diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt index 1aac16e40e1..6403e62fe70 100644 --- a/icu4c/source/data/brkitr/rules/line_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_cj.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 +# Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when @@ -70,6 +70,13 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, +# without a formal name. Because ICU rules require multiple uses of the expressions, +# give them a single definition with a name + +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -109,7 +116,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. 
# -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; # @@ -212,7 +219,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # See issue ICU-20303 -$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL]; +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; @@ -283,16 +290,13 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?; # (break between HL and SY already disallowed by LB 13 above) $SY $CM* $HL; -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -$IN $CM* $IN; -$NU $CM* $IN; +# LB 22 Do not break before ellipses +# +$LB20NonBreaks $CM* $IN; +^$CM+ $IN; -# $LB 23 +# LB 23 # ($ALPlus | $HL) $CM* $NU; ^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL @@ -338,15 +342,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); $IS $CM* ($ALPlus | $HL); # LB 30 -($ALPlus | $HL | $NU) $CM* $OP; -^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $OP30; +^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL. +$CP30 $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. 
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' # because of the chain-out behavior difference. The rule must chain out only from the [set characters], # not from the preceding $RI or $CM, which it would be able to do if the set were optional. diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index f38ea1b5a54..2794554e0b1 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -8,7 +8,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 +# Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when @@ -76,6 +76,13 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, +# without a formal name. Because ICU rules require multiple uses of the expressions, +# give them a single definition with a name + +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # By LB9, a ZWJ also behaves as a CM. 
Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -115,7 +122,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. # -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; # @@ -218,7 +225,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # See issue ICU-20303 -$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL]; +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; @@ -292,16 +299,14 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?; # (break between HL and SY already disallowed by LB 13 above) $SY $CM* $HL; -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -# $IN $CM* $IN; # delete this rule for CSS loose -$NU $CM* $IN; + +# LB 22 Do not break before ellipses +# +[$LB20NonBreaks - $IN] $CM* $IN; # line_loose tailoring +^$CM+ $IN; -# $LB 23 +# LB 23 # ($ALPlus | $HL) $CM* $NU; ^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL @@ -347,15 +352,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); $IS $CM* ($ALPlus | $HL); # LB 30 -($ALPlus | $HL | $NU) $CM* $OP; -^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $OP30; +^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL. 
+$CP30 $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' # because of the chain-out behavior difference. The rule must chain out only from the [set characters], # not from the preceding $RI or $CM, which it would be able to do if the set were optional. diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 80d57c76a1b..54f06097dfb 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 +# Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when @@ -87,6 +87,13 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, +# without a formal name. 
Because ICU rules require multiple uses of the expressions, +# give them a single definition with a name + +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -126,7 +133,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. # -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus]; # @@ -229,7 +236,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # See issue ICU-20303 -$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL]; +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; @@ -303,16 +310,14 @@ $HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?; # (break between HL and SY already disallowed by LB 13 above) $SY $CM* $HL; -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -# $IN $CM* $IN; # delete this rule for CSS loose -$NU $CM* $IN; + +# LB 22 Do not break before ellipses +# +[$LB20NonBreaks - $IN] $CM* $IN; # line_loose tailoring +^$CM+ $IN; -# $LB 23 +# LB 23 # ($ALPlus | $HL) $CM* $NU; ^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL @@ -362,15 +367,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); $IS $CM* ($ALPlus | $HL); # LB 30 -($ALPlus | $HL | $NU) $CM* $OP; 
-^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $OP30; +^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL. +$CP30 $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' # because of the chain-out behavior difference. The rule must chain out only from the [set characters], # not from the preceding $RI or $CM, which it would be able to do if the set were optional. 
diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index b46e8d16728..433a0b20eb9 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 +# Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when @@ -71,6 +71,13 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, +# without a formal name. Because ICU rules require multiple uses of the expressions, +# give them a single definition with a name + +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -110,7 +117,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. 
# -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; # @@ -213,7 +220,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # See issue ICU-20303 -$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL]; +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; @@ -284,16 +291,13 @@ $HL $CM* ($HY | $BA) $CM* [^$CB]?; # (break between HL and SY already disallowed by LB 13 above) $SY $CM* $HL; -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -$IN $CM* $IN; -$NU $CM* $IN; +# LB 22 Do not break before ellipses +# +$LB20NonBreaks $CM* $IN; +^$CM+ $IN; -# $LB 23 +# LB 23 # ($ALPlus | $HL) $CM* $NU; ^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL @@ -339,15 +343,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); $IS $CM* ($ALPlus | $HL); # LB 30 -($ALPlus | $HL | $NU) $CM* $OP; -^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $OP30; +^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL. +$CP30 $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. Break after pairs of them. # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. 
-$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' # because of the chain-out behavior difference. The rule must chain out only from the [set characters], # not from the preceding $RI or $CM, which it would be able to do if the set were optional. diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index d3083cb02a0..6e1e846bf89 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -7,7 +7,7 @@ # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 42 for Unicode 12.0 +# Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # http://www.unicode.org/reports/tr14/, with the following modification: # # Boundaries between hyphens and following letters are suppressed when @@ -75,6 +75,13 @@ $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; $ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14, +# without a formal name. Because ICU rules require multiple uses of the expressions, +# give them a single definition with a name + +$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # By LB9, a ZWJ also behaves as a CM. 
Including it in the definition of CM avoids having to explicitly # list it in the numerous rules that use CM. # By LB1, SA characters with general categor of Mn or Mc also resolve to CM. @@ -114,7 +121,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs # AL_FOLLOW set of chars that can unconditionally follow an AL # Needed in rules where stand-alone $CM s are treated as AL. # -$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; +$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus]; # @@ -217,7 +224,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL # See issue ICU-20303 -$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL]; +$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN]; $SP $IS / [^ $CanFollowIS $NU $CM]; $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM]; @@ -291,16 +298,13 @@ $HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?; # (break between HL and SY already disallowed by LB 13 above) $SY $CM* $HL; -# LB 22 -($ALPlus | $HL) $CM* $IN; -^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL -$EX $CM* $IN; -($ID | $EB | $EM) $CM* $IN; -$IN $CM* $IN; -$NU $CM* $IN; +# LB 22 Do not break before ellipses +# +$LB20NonBreaks $CM* $IN; +^$CM+ $IN; -# $LB 23 +# LB 23 # ($ALPlus | $HL) $CM* $NU; ^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL @@ -346,15 +350,15 @@ $PR $CM* ($JL | $JV | $JT | $H2 | $H3); $IS $CM* ($ALPlus | $HL); # LB 30 -($ALPlus | $HL | $NU) $CM* $OP; -^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL. -$CP $CM* ($ALPlus | $HL | $NU); +($ALPlus | $HL | $NU) $CM* $OP30; +^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL. +$CP30 $CM* ($ALPlus | $HL | $NU); # LB 30a Do not break between regional indicators. 
Break after pairs of them. # Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM. -$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]]; -$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}]; +$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]]; +$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}]; # note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?' # because of the chain-out behavior difference. The rule must chain out only from the [set characters], # not from the preceding $RI or $CM, which it would be able to do if the set were optional. diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index 682c35d17f6..10513005486 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -6,7 +6,14 @@ # file: line.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0, +# with the following modification: +# +# Boundaries between hyphens and following letters are suppressed when +# there is a boundary preceding the hyphen. See rule 20.9 +# +# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict). +# It sets characters of class CJ to behave like NS. # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. 
@@ -172,7 +179,7 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.2: . CM* IN; +LB22: . CM* IN; LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); diff --git a/icu4c/source/test/testdata/break_rules/line_cj.txt b/icu4c/source/test/testdata/break_rules/line_cj.txt index 765953bb0c0..a4a541a7d69 100644 --- a/icu4c/source/test/testdata/break_rules/line_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_cj.txt @@ -6,7 +6,7 @@ # file: line.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -61,6 +61,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -165,11 +172,7 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -LB22.4: IN CM* IN; -LB22.5: NU CM* IN; +LB22: . 
CM* IN; LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -196,13 +199,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt index 86eb170c46e..904b9d13dfd 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -6,7 +6,7 @@ # file: line_loose.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -69,6 +69,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. 
+# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -173,11 +180,8 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -# LB22.4: IN CM* IN; # delete this rule for CSS loose. -LB22.5: NU CM* IN; + +LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipsis characters. LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -204,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. 
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 049ecd017a3..8ad6a62d790 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -6,16 +6,15 @@ # file: line_loose_cj.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below.. +# Unicode Standard Annex #14 +# http://www.unicode.org/reports/tr14/, tailored as noted below. # # This tailors the line break behavior to correspond to CSS # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. @@ -87,6 +86,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. 
+# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -196,11 +202,8 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -# LB22.4: IN CM* IN; # delete this rule for CSS loose. -LB22.5: NU CM* IN; + +LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipsis characters. LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -227,13 +230,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. 
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt index 2cf6e7c1158..db7ba4209a5 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -6,20 +6,15 @@ # file: line_normal.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Unicode Standard Annex #14 +# http://www.unicode.org/reports/tr14/, tailored as noted below. 
# # This tailors the line break behavior to correspond to CSS # line-break=normal (BCP47 -u-lb-normal) as defined for languages other than @@ -75,6 +70,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -179,11 +181,7 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -LB22.4: IN CM* IN; -LB22.5: NU CM* IN; +LB22: . CM* IN; LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -210,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. 
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index 57139f4b231..2c47c7fcba8 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -6,20 +6,15 @@ # file: line_normal_cj.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Unicode Standard Annex #14 +# http://www.unicode.org/reports/tr14/, tailored as noted below. # # This tailors the line break behavior to correspond to CSS # line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. 
@@ -78,6 +73,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -188,11 +190,7 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -LB22.4: IN CM* IN; -LB22.5: NU CM* IN; +LB22: . CM* IN; LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -218,13 +216,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. 
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 73ea4a89421..298a0040dae 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96831df582da28121b19cf3faed7e84529ad0c1113b14cad0e01fabd4875c679 -size 12998991 +oid sha256:f9b73d720421a85704fc64aa0949c94d52e450a44af96c715881e9e6ab0fa3e6 +size 12998988 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt index 765953bb0c0..a4a541a7d69 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt @@ -6,7 +6,7 @@ # file: line.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. 
@@ -61,6 +61,20 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -165,11 +179,7 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -LB22.4: IN CM* IN; -LB22.5: NU CM* IN; +LB22: . CM* IN; LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -196,13 +206,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. 
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt index 86eb170c46e..904b9d13dfd 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt @@ -6,7 +6,7 @@ # file: line_loose.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. @@ -69,6 +69,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. 
+ +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -173,11 +180,8 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -# LB22.4: IN CM* IN; # delete this rule for CSS loose. -LB22.5: NU CM* IN; + +LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipses characters. LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -204,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 049ecd017a3..8ad6a62d790 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -6,16 +6,15 @@ # file: line_loose_cj.txt # # Reference Line 
Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below.. +# Unicode Standard Annex #14 +# http://www.unicode.org/reports/tr14/, tailored as noted below. # # This tailors the line break behavior to correspond to CSS # line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese. @@ -87,6 +86,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -196,11 +202,8 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -# LB22.4: IN CM* IN; # delete this rule for CSS loose. -LB22.5: NU CM* IN; + +LB22: [^IN] CM* IN; # For CSS Loose, allow breaks between adjacent ellipses characters. LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -227,13 +230,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. 
-LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt index 2cf6e7c1158..db7ba4209a5 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt @@ -6,20 +6,15 @@ # file: line_normal.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. 
+# Unicode Standard Annex #14 +# http://www.unicode.org/reports/tr14/, tailored as noted below. # # This tailors the line break behavior to correspond to CSS # line-break=normal (BCP47 -u-lb-normal) as defined for languages other than @@ -75,6 +70,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -179,11 +181,7 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -LB22.4: IN CM* IN; -LB22.5: NU CM* IN; +LB22: . CM* IN; LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -210,13 +208,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. 
-LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index 57139f4b231..2c47c7fcba8 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -6,20 +6,15 @@ # file: line_normal_cj.txt # # Reference Line Break rules for intltest rbbi/RBBIMonkeyTest. -# Rules derived from Unicode Standard Annex #14 Revision 40 for Unicode 11.0 +# Rules derived from Unicode Standard Annex #14 Revision 44 for Unicode 13.0 # # Note: Rule syntax and the monkey test itself are still a work in progress. # They are expected to change with review and the addition of support for rule tailoring. # # Line Breaking Rules # Implement default line breaking as defined by -# Unicode Standard Annex #14 Revision 34 for Unicode 8.0 -# http://www.unicode.org/reports/tr14/ -# tailored as noted in 2nd paragraph below. -# -# TODO: Rule LB 8 remains as it was in Unicode 5.2 -# This is only because of a limitation of ICU break engine implementation, -# not because the older behavior is desirable. +# Unicode Standard Annex #14 +# http://www.unicode.org/reports/tr14/, tailored as noted below. 
# # This tailors the line break behavior to correspond to CSS # line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese. @@ -78,6 +73,13 @@ XX = [:LineBreak = Unknown:]; ZW = [:LineBreak = ZWSpace:]; ZWJ = [:LineBreak = ZWJ:]; +# OP30 and CP30 are variants of OP and CP that appear in rule LB30 from UAX 14. +# Limitations of this monkey test rule parser require that these definitions be pulled out +# rather than appearing in-line in LB 30. + +OP30 = [OP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; +CP30 = [CP - [\p{ea=F}\p{ea=W}\p{ea=H}]]; + # LB1 - Resolve AI, CB, CJ, SA, SG, and XX into other line breaking classes AL = [AL AI SG XX ]; dictionary = SA; @@ -188,11 +190,7 @@ LB21.2: BB CM* [^CM CB]; LB21b: SY CM* HL; -LB22.1: (AL | HL | CM) CM* IN; # The CM is from LB10, treat an unattached CM as AL. -LB22.2: EX CM* IN; -LB22.3: (ID | EB | EM) CM* IN; -LB22.4: IN CM* IN; -LB22.5: NU CM* IN; +LB22: . CM* IN; LB23.1: (AL | HL | CM) CM* NU; LB23.2: NU CM* (AL | HL); @@ -218,13 +216,13 @@ LB28: (AL | HL | CM)CM* (AL | HL); LB29: IS CM* (AL | HL); # LB30 is adjusted for unattached leading CM being treated as AL. -LB30.1: (AL | CM | HL | NU) CM* OP; -LB30.2: CP CM* (AL | HL | NU); +LB30.1: (AL | CM | HL | NU) CM* OP30; +LB30.2: CP30 CM* (AL | HL | NU); # LB30a keep pairs of RI together. -LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS CM]; -LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS ZWJ]?; +LB30a.1: RI CM* RI ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.2: RI CM* RI CM* CMS ÷ [^BK CR LF NL SP ZW WJ CL CP EX IS SY GL QU BA HY NS IN CM]; +LB30a.3: RI CM* RI CM* [BK CR LF NL SP ZW WJ GL CL CP EX IS SY QU BA HY NS IN ZWJ]?; # LB30b Do not break between Emoji Base and Emoji Modifier LB30b: EB CM* EM;