mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 08:53:20 +00:00
ICU-8151 Simplify Finnish Line Break Tailoring, move to root. (#99)
This commit is contained in:
parent
e6a5f0ee0a
commit
740b24118f
30 changed files with 229 additions and 1113 deletions
|
@ -39,13 +39,13 @@ BRK_DICT_SOURCE = burmesedict.txt cjdict.txt khmerdict.txt laodict.txt\
|
|||
|
||||
|
||||
# List of break iterator files (brk).
|
||||
BRK_SOURCE = char.txt line.txt line_fi.txt line_loose.txt\
|
||||
line_loose_cj.txt line_loose_fi.txt line_normal.txt line_normal_cj.txt line_normal_fi.txt\
|
||||
BRK_SOURCE = char.txt line.txt line_loose.txt\
|
||||
line_loose_cj.txt line_normal.txt line_normal_cj.txt\
|
||||
sent.txt sent_el.txt title.txt word.txt word_POSIX.txt
|
||||
|
||||
|
||||
# Ordinary resources
|
||||
BRK_RES_SOURCE = de.txt el.txt en.txt en_US.txt\
|
||||
en_US_POSIX.txt es.txt fi.txt fr.txt it.txt\
|
||||
en_US_POSIX.txt es.txt fr.txt it.txt\
|
||||
ja.txt pt.txt ru.txt zh.txt zh_Hant.txt
|
||||
|
||||
|
|
|
@ -1,11 +0,0 @@
|
|||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
fi{
|
||||
Version{"2.1.19.14"}
|
||||
boundaries{
|
||||
line:process(dependency){"line_fi.brk"}
|
||||
line_loose:process(dependency){"line_loose_fi.brk"}
|
||||
line_normal:process(dependency){"line_normal_fi.brk"}
|
||||
line_strict:process(dependency){"line_fi.brk"}
|
||||
}
|
||||
}
|
|
@ -8,11 +8,10 @@
|
|||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule LB 20.09
|
||||
#
|
||||
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
|
@ -27,6 +26,7 @@
|
|||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -229,17 +229,24 @@ $LB18NonBreaks $CM* $QU;
|
|||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
|
|
|
@ -1,339 +0,0 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_fi.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior for Finnish, while otherwise behaving
|
||||
# per UAX 14 which corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
#
|
||||
# This corresponds to CSS line-break=strict (BCP47 -u-lb-strict).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [:LineBreak = Ideographic:];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
# NS includes CJ for CSS strict line breaking.
|
||||
$NS = [[:LineBreak = Nonstarter:] $CJ];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [:LineBreak = Open_Punctuation:];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [:LineBreak = Quotation:];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA).
|
||||
|
||||
$dictionary = [$SA];
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
# LB 6 Do not break before hard line breaks.
|
||||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
#
|
||||
# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
|
||||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
#
|
||||
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
||||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 added rule for Finnish tailoring
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BA | $HY | $HH | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
|
@ -9,13 +9,10 @@
|
|||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule LB 20.09
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
|
||||
|
@ -35,6 +32,7 @@
|
|||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -240,18 +238,25 @@ $LB18NonBreaks $CM* $QU;
|
|||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before NSX, so don't include it
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
|
|
|
@ -8,13 +8,10 @@
|
|||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule LB 20.09
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
|
@ -42,6 +39,7 @@ $AI = [:LineBreak = Ambiguous:];
|
|||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -250,18 +248,25 @@ $LB18NonBreaks $CM* $QU;
|
|||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
|
|
|
@ -1,341 +0,0 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_loose_fi.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 3rd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior both for Finnish and to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for languages other than
|
||||
# Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks before 3005, 303B, 309D, 309E, 30FD, 30FE (all NS).
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
# CSS Loose tailoring: CJ resolves to ID
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
$NSX = [\u3005 \u303B \u309D \u309E \u30FD \u30FE];
|
||||
$NS = [[:LineBreak = Nonstarter:] - $NSX];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [:LineBreak = Open_Punctuation:];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [:LineBreak = Quotation:];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA).
|
||||
|
||||
$dictionary = [$SA];
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
# LB 6 Do not break before hard line breaks.
|
||||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
#
|
||||
# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
|
||||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
#
|
||||
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
||||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 added rule for Finnish tailoring
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before NSX, so don't include it
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BA | $HY | $HH | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
# $IN $CM* $IN; # delete this rule for CSS loose
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# $LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
|
@ -8,13 +8,10 @@
|
|||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule LB 20.09
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
|
||||
|
@ -31,6 +28,7 @@
|
|||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -233,17 +231,24 @@ $LB18NonBreaks $CM* $QU;
|
|||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
|
|
|
@ -8,13 +8,10 @@
|
|||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# tailored as noted in 2nd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule 20.9
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
|
@ -33,6 +30,7 @@ $AI = [:LineBreak = Ambiguous:];
|
|||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -238,18 +236,25 @@ $LB18NonBreaks $CM* $QU;
|
|||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
|
|
|
@ -1,337 +0,0 @@
|
|||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
# Copyright (c) 2002-2016 International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#
|
||||
# file: line_normal_fi.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# http://www.unicode.org/reports/tr14/
|
||||
# tailored as noted in 3rd paragraph below.
|
||||
#
|
||||
# TODO: Rule LB 8 remains as it was in Unicode 5.2
|
||||
# This is only because of a limitation of ICU break engine implementation,
|
||||
# not because the older behavior is desirable.
|
||||
#
|
||||
# This tailors the line break behavior for Finnish, and to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for languages other than
|
||||
# Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [:LineBreak = Close_Punctuation:];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
# CSS Normal tailoring: CJ resolves to ID
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
$NS = [:LineBreak = Nonstarter:];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [:LineBreak = Open_Punctuation:];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [:LineBreak = Quotation:];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA).
|
||||
|
||||
$dictionary = [$SA];
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
$ALPlus = [$AL $AI $SG $XX [$SA-[[:Mn:][:Mc:]]]];
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP $QU $BA $HH $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
# LB 6 Do not break before hard line breaks.
|
||||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HH $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
#
|
||||
# LB 13 Don't break before ']' or '!' or ';' or '/', even after spaces.
|
||||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $IS;
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
#
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
# LB 15
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
#
|
||||
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
||||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 added rule for Finnish tailoring
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS) $CM* / $AL;
|
||||
$LB20NonBreaks $CM* ($BA | $HH | $HY | $NS);
|
||||
($HY | $HH) $AL;
|
||||
^$CM+ ($BA | $HY | $HH | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $HH) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22
|
||||
($ALPlus | $HL) $CM* $IN;
|
||||
^$CM+ $IN; # by rule 10, any otherwise unattached CM behaves as AL
|
||||
$EX $CM* $IN;
|
||||
($ID | $EB | $EM) $CM* $IN;
|
||||
$IN $CM* $IN;
|
||||
$NU $CM* $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $IN;
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP;
|
||||
^$CM+ $OP; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
# LB 30b Do not break between an Emoji Base and an Emoji Modifier
|
||||
$EB $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
|
@ -784,15 +784,18 @@ static const int32_t heTestOffs_heFwd[] = { 1, 5, 7, 9, 12, 14, 19,
|
|||
/*static const int32_t heTestOffs_enRev[] = { 22, 19, 17, 14, 12, 9, 7, 5, 1, 0 };*/
|
||||
static const int32_t heTestOffs_heRev[] = { 19, 14, 12, 9, 7, 5, 1, 0 };
|
||||
|
||||
/* Finnish line break tailoring, for cldrbug 3029 */
|
||||
/* Finnish line break tailoring, for cldrbug 3029.
|
||||
* As of ICU 63, the Finnish tailoring has moved to root, so Finnish and English should behave the same. */
|
||||
static const UChar fiTest[] = { /* 00 */ 0x0020, 0x002D, 0x0031, 0x0032, 0x0020,
|
||||
/* 05 */ 0x0061, 0x002D, 0x006B, 0x0020,
|
||||
/* 09 */ 0x0061, 0x0300, 0x2010, 0x006B, 0x0020,
|
||||
/* 14 */ 0x0061, 0x0020, 0x002D, 0x006B, 0x0020,
|
||||
/* 19 */ 0x0061, 0x0300, 0x0020, 0x2010, 0x006B, 0x0020, 0 };
|
||||
static const int32_t fiTestOffs_enFwd[] = { 1, 5, 7, 9, 12, 14, 16, 17, 19, 22, 23, 25 };
|
||||
//static const int32_t fiTestOffs_enFwd[] = { 1, 5, 7, 9, 12, 14, 16, 17, 19, 22, 23, 25 };
|
||||
static const int32_t fiTestOffs_enFwd[] = { 1, 5, 7, 9, 12, 14, 16, 19, 22, 25 };
|
||||
static const int32_t fiTestOffs_fiFwd[] = { 1, 5, 7, 9, 12, 14, 16, 19, 22, 25 };
|
||||
static const int32_t fiTestOffs_enRev[] = { 23, 22, 19, 17, 16, 14, 12, 9, 7, 5, 1, 0 };
|
||||
//static const int32_t fiTestOffs_enRev[] = { 23, 22, 19, 17, 16, 14, 12, 9, 7, 5, 1, 0 };
|
||||
static const int32_t fiTestOffs_enRev[] = { 22, 19, 16, 14, 12, 9, 7, 5, 1, 0 };
|
||||
static const int32_t fiTestOffs_fiRev[] = { 22, 19, 16, 14, 12, 9, 7, 5, 1, 0 };
|
||||
|
||||
/* Khmer dictionary-based word break, for ICU ticket #8329 */
|
||||
|
|
|
@ -184,6 +184,14 @@ void BreakRules::addRule(const UnicodeString &name, const UnicodeString &definit
|
|||
}
|
||||
fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
|
||||
|
||||
// If rule begins with a '^' rule chaining is disallowed.
|
||||
// Strip off the '^' from the rule expression, and set the flag.
|
||||
if (thisRule->fExpandedRule.charAt(0) == u'^') {
|
||||
thisRule->fInitialMatchOnly = true;
|
||||
thisRule->fExpandedRule.remove(0, 1);
|
||||
thisRule->fExpandedRule.trim();
|
||||
}
|
||||
|
||||
// Replace the divide sign (\u00f7) with a regular expression named capture.
|
||||
// When running the rules, a match that includes this group means we found a break position.
|
||||
|
||||
|
@ -442,6 +450,8 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode
|
|||
// ICU always reports a break there.
|
||||
// The reference rules do not have a means to do so.
|
||||
int32_t strIdx = 0;
|
||||
bool initialMatch = true; // True at start of text, and immediately after each boundary,
|
||||
// for control over rule chaining.
|
||||
while (strIdx < fString.length()) {
|
||||
BreakRule *matchingRule = NULL;
|
||||
UBool hasBreak = FALSE;
|
||||
|
@ -451,6 +461,10 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode
|
|||
int32_t breakGroup = 0;
|
||||
for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
|
||||
BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
|
||||
if (rule->fInitialMatchOnly && !initialMatch) {
|
||||
// Skip checking this '^' rule. (No rule chaining)
|
||||
continue;
|
||||
}
|
||||
rule->fRuleMatcher->reset();
|
||||
if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
|
||||
// A candidate rule match, check further to see if we take it or continue to check other rules.
|
||||
|
@ -512,10 +526,12 @@ void MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode
|
|||
// which may differ from end of the match. The matching rule may have included
|
||||
// context following the boundary that needs to be looked at again.
|
||||
strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
|
||||
initialMatch = true;
|
||||
} else {
|
||||
// Original rule didn't specify a break.
|
||||
// Continue applying rules starting on the last code point of this match.
|
||||
strIdx = fString.moveIndex32(matchEnd, -1);
|
||||
initialMatch = false;
|
||||
if (strIdx == matchStart) {
|
||||
// Match was only one code point, no progress if we continue.
|
||||
// Shouldn't get here, case is filtered out at top of loop.
|
||||
|
|
|
@ -102,6 +102,7 @@ class BreakRule: public UObject {
|
|||
UnicodeString fRule; // Rule expression, excluding the name, as written in user source.
|
||||
UnicodeString fExpandedRule; // Rule expression after expanding the set definitions.
|
||||
LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule.
|
||||
bool fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining.
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -1284,7 +1284,8 @@ void RBBITest::TestUnicodeFiles() {
|
|||
|
||||
// Check for test cases from the Unicode test data files that are known to fail
|
||||
// and should be skipped as known issues because ICU does not fully implement
|
||||
// the Unicode specifications.
|
||||
// the Unicode specifications, or because ICU includes tailorings that differ from
|
||||
// the Unicode standard.
|
||||
//
|
||||
// Test cases are identified by the test data sequence, which tends to be more stable
|
||||
// across Unicode versions than the test file line numbers.
|
||||
|
@ -1297,7 +1298,18 @@ UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *
|
|||
const char *fFileName;
|
||||
const UChar *fString;
|
||||
} badTestCases[] = {
|
||||
{"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"} // Fake example, for illustration.
|
||||
{"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"}, // Fake example, for illustration.
|
||||
// Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
|
||||
// This probably ultimately wants to be resolved by updating UAX-14, but in the meantime
|
||||
// ICU is out of sync with Unicode.
|
||||
{"8151", "LineBreakTest.txt", u"-#"},
|
||||
{"8151", "LineBreakTest.txt", u"\u002d\u0308\u0023"},
|
||||
{"8151", "LineBreakTest.txt", u"\u002d\u00a7"},
|
||||
{"8151", "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
|
||||
{"8151", "LineBreakTest.txt", u"\u002d\U00050005"},
|
||||
{"8151", "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
|
||||
{"8151", "LineBreakTest.txt", u"\u002d\u0e01"},
|
||||
{"8151", "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
|
||||
};
|
||||
|
||||
for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
|
||||
|
@ -2516,6 +2528,7 @@ private:
|
|||
UnicodeSet *fB2;
|
||||
UnicodeSet *fBA;
|
||||
UnicodeSet *fBB;
|
||||
UnicodeSet *fHH;
|
||||
UnicodeSet *fHY;
|
||||
UnicodeSet *fH2;
|
||||
UnicodeSet *fH3;
|
||||
|
@ -2580,6 +2593,7 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
fB2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
|
||||
fBA = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
|
||||
fBB = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
|
||||
fHH = new UnicodeSet();
|
||||
fHY = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
|
||||
fH2 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
|
||||
fH3 = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
|
||||
|
@ -2620,7 +2634,9 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
fAL->addAll(*fSG); // Default behavior for SG is identical to AL.
|
||||
|
||||
fNS->addAll(*fCJ); // Default behavior for CJ is identical to NS.
|
||||
fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
|
||||
fCM->addAll(*fZWJ); // ZWJ behaves as a CM.
|
||||
|
||||
fHH->add(u'\u2010'); // Hyphen, '‐'
|
||||
|
||||
fSets->addElement(fBK, status);
|
||||
fSets->addElement(fCR, status);
|
||||
|
@ -3024,6 +3040,15 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
|
||||
// Formerly this was a Finnish tailoring.
|
||||
// Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
|
||||
// ^($HY | $HH) $AL;
|
||||
if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
|
||||
prevPosX2 == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 21
|
||||
if (fBA->contains(thisChar) ||
|
||||
fHY->contains(thisChar) ||
|
||||
|
@ -3195,6 +3220,7 @@ RBBILineMonkey::~RBBILineMonkey() {
|
|||
delete fB2;
|
||||
delete fBA;
|
||||
delete fBB;
|
||||
delete fHH;
|
||||
delete fHY;
|
||||
delete fH2;
|
||||
delete fH3;
|
||||
|
|
|
@ -19,6 +19,7 @@ locale = en;
|
|||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -144,6 +145,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
|
|
@ -26,6 +26,7 @@ locale = en@lb=loose;
|
|||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -125,7 +126,7 @@ LB12: GL CM* [^CM];
|
|||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14.
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
@ -152,6 +153,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
@ -176,7 +180,7 @@ LB23a.2: (ID | EB | EM) CM* PO;
|
|||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAx 14.
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
|
|
|
@ -37,9 +37,10 @@ locale = ja@lb=loose;
|
|||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -169,6 +170,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
|
|
|
@ -33,6 +33,7 @@ locale = en@lb=normal;
|
|||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -158,6 +159,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
|
|
@ -34,6 +34,7 @@ AI = [:LineBreak = Ambiguous:];
|
|||
AL = [:LineBreak = Alphabetic:];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -163,6 +164,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
|
||||
|
|
16
icu4c/source/test/testdata/rbbitst.txt
vendored
16
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -1086,23 +1086,21 @@ Bangkok)•</data>
|
|||
# Finnish line breaking
|
||||
#
|
||||
# These rules deal with hyphens when there is a space on the leading side.
|
||||
# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
|
||||
# When followed by a letter, there should be a break opportunity between
|
||||
# the space and the hyphen, and not after the hyphen.
|
||||
# See CLDR ticket 3029.
|
||||
# See ICU ticket 8151
|
||||
# As of ICU 63, the Finnish tailoring behavior is moved to root.
|
||||
|
||||
<locale root>
|
||||
<line>
|
||||
<data>•abc •- •def •abc •-•def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
|
||||
<locale fi>
|
||||
<line>
|
||||
# TODO: problems with Finnish line break rules cause these two lines to fail.
|
||||
#<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
#<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
|
||||
<data>•abc •- •def •abc •-def •abc- •def •</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐def •abc‐ •def •</data> # With Unicode u2010 hyphen
|
||||
<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
|
||||
# Test for #10176 (in fi)
|
||||
<line>
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:956147318ffa776ff18c71ab09c5ae63e336e14e240128c8602abf07ef7d7d3f
|
||||
size 12510547
|
||||
oid sha256:36d0ec0c543d1dccafcc6985a7c18285b255afb98bc2bdb16a867a22600bfddb
|
||||
size 12487287
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:55923dda88f8bf3affc2cf6d774a92a49e5fbc4be5583769bfe90fc7f319d2b1
|
||||
oid sha256:469f76e391dced8e9ae4a9543513dddd6d4d2026ad6cbc0ab79d9553da803e6a
|
||||
size 92857
|
||||
|
|
|
@ -60,10 +60,11 @@ public class RBBIMonkeyTest extends TestFmwk {
|
|||
// is compiled to a regular expression.
|
||||
|
||||
static class BreakRule {
|
||||
String fName; // Name of the rule.
|
||||
String fRule; // Rule expression, excluding the name, as written in user source.
|
||||
String fExpandedRule; // Rule expression after expanding the set definitions.
|
||||
Matcher fRuleMatcher; // Regular expression that matches the rule.
|
||||
String fName; // Name of the rule.
|
||||
String fRule; // Rule expression, excluding the name, as written in user source.
|
||||
String fExpandedRule; // Rule expression after expanding the set definitions.
|
||||
Matcher fRuleMatcher; // Regular expression that matches the rule.
|
||||
boolean fInitialMatchOnly = false; // True if rule begins with '^', meaning no chaining.
|
||||
};
|
||||
|
||||
|
||||
|
@ -220,6 +221,14 @@ public class RBBIMonkeyTest extends TestFmwk {
|
|||
}
|
||||
fPropertyMatcher.appendTail(expandedRule);
|
||||
|
||||
// If rule begins with a '^' rule chaining is disallowed.
|
||||
// Strip off the '^' from the rule expression, and set the flag.
|
||||
if (expandedRule.charAt(0) == '^') {
|
||||
thisRule.fInitialMatchOnly = true;
|
||||
expandedRule.deleteCharAt(0);
|
||||
expandedRule = new StringBuffer(expandedRule.toString().trim());
|
||||
}
|
||||
|
||||
// Replace any [^negated sets] with equivalent flattened sets generated by
|
||||
// ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
|
||||
// to any nested classes. Variable substitution in rules produces
|
||||
|
@ -549,6 +558,9 @@ public class RBBIMonkeyTest extends TestFmwk {
|
|||
// ICU always reports a break there.
|
||||
// The reference rules do not have a means to do so.
|
||||
int strIdx = 0;
|
||||
boolean initialMatch = true; // True at start of text, and immediately after each boundary,
|
||||
// // for control over rule chaining.
|
||||
|
||||
while (strIdx < fString.length()) {
|
||||
BreakRule matchingRule = null;
|
||||
boolean hasBreak = false;
|
||||
|
@ -557,6 +569,10 @@ public class RBBIMonkeyTest extends TestFmwk {
|
|||
int matchEnd = 0;
|
||||
for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
|
||||
BreakRule rule = rules.fBreakRules.get(ruleNum);
|
||||
if (rule.fInitialMatchOnly && !initialMatch) {
|
||||
// Skip checking this '^' rule. (No rule chaining)
|
||||
continue;
|
||||
}
|
||||
rule.fRuleMatcher.reset(fString.substring(strIdx));
|
||||
if (rule.fRuleMatcher.lookingAt()) {
|
||||
// A candidate rule match, check further to see if we take it or continue to check other rules.
|
||||
|
@ -607,6 +623,7 @@ public class RBBIMonkeyTest extends TestFmwk {
|
|||
// which may differ from end of the match. The matching rule may have included
|
||||
// context following the boundary that needs to be looked at again.
|
||||
strIdx = breakPos;
|
||||
initialMatch = true;
|
||||
} else {
|
||||
// Original rule didn't specify a break.
|
||||
// Continue applying rules starting on the last code point of this match.
|
||||
|
@ -618,6 +635,7 @@ public class RBBIMonkeyTest extends TestFmwk {
|
|||
rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
|
||||
}
|
||||
strIdx = updatedStrIdx;
|
||||
initialMatch = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -617,6 +617,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
UnicodeSet fB2;
|
||||
UnicodeSet fBA;
|
||||
UnicodeSet fBB;
|
||||
UnicodeSet fHH;
|
||||
UnicodeSet fHY;
|
||||
UnicodeSet fCB;
|
||||
UnicodeSet fCL;
|
||||
|
@ -659,6 +660,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
|
||||
class XUnicodeSet extends UnicodeSet {
|
||||
XUnicodeSet(String pattern) { super(pattern); }
|
||||
XUnicodeSet() { super(); }
|
||||
@Override
|
||||
public boolean contains(int codePoint) {
|
||||
return codePoint < UnicodeSet.MIN_VALUE || codePoint > UnicodeSet.MAX_VALUE ?
|
||||
|
@ -684,6 +686,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fB2 = new XUnicodeSet("[\\p{Line_break=B2}]");
|
||||
fBA = new XUnicodeSet("[\\p{Line_break=BA}]");
|
||||
fBB = new XUnicodeSet("[\\p{Line_break=BB}]");
|
||||
fHH = new XUnicodeSet();
|
||||
fHY = new XUnicodeSet("[\\p{Line_break=HY}]");
|
||||
fCB = new XUnicodeSet("[\\p{Line_break=CB}]");
|
||||
fCL = new XUnicodeSet("[\\p{Line_break=CL}]");
|
||||
|
@ -728,6 +731,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
fNS.addAll(fCJ); // Default behavior for CJ is identical to NS.
|
||||
fCM.addAll(fZWJ); // ZWJ behaves as a CM.
|
||||
|
||||
fHH.add('\u2010'); // Hyphen, '‐'
|
||||
|
||||
fSets.add(fBK);
|
||||
fSets.add(fCR);
|
||||
fSets.add(fLF);
|
||||
|
@ -786,12 +791,14 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
|
||||
int prevPos; // Index of the char preceding a potential break position
|
||||
int prevChar; // Character at above position. Note that prevChar
|
||||
// and thisChar may not be adjacent because combining
|
||||
// characters between them will be ignored.
|
||||
int prevCharX2; // Character before prevChar, more contex for LB 21a
|
||||
// // and thisChar may not be adjacent because combining
|
||||
// // characters between them will be ignored.
|
||||
|
||||
int prevPosX2;
|
||||
int prevCharX2; // Character before prevChar, more context for LB 21a
|
||||
|
||||
int nextPos; // Index of the next character following pos.
|
||||
// Usually skips over combining marks.
|
||||
// // Usually skips over combining marks.
|
||||
int tPos; // temp value.
|
||||
int matchVals[] = null; // Number Expression Match Results
|
||||
|
||||
|
@ -804,8 +811,8 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
// Initial values for loop. Loop will run the first time without finding breaks,
|
||||
// while the invalid values shift out and the "this" and
|
||||
// "prev" positions are filled in with good values.
|
||||
pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration.
|
||||
thisChar = prevChar = prevCharX2 = 0;
|
||||
pos = prevPos = prevPosX2 = -1; // Invalid value, serves as flag for initial loop iteration.
|
||||
thisChar = prevChar = prevCharX2 = 0;
|
||||
nextPos = startPos;
|
||||
|
||||
|
||||
|
@ -816,6 +823,7 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
// "prevPos" can be arbitrarily far before "pos".
|
||||
for (;;) {
|
||||
// Advance to the next position to be tested.
|
||||
prevPosX2 = prevPos;
|
||||
prevCharX2 = prevChar;
|
||||
prevPos = pos;
|
||||
prevChar = thisChar;
|
||||
|
@ -1066,6 +1074,15 @@ public class RBBITestMonkey extends TestFmwk {
|
|||
break;
|
||||
}
|
||||
|
||||
// LB 20.09 Don't break between Hyphens and letters if a break precedes the hyphen.
|
||||
// Formerly this was a Finnish tailoring.
|
||||
// Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
|
||||
// ^($HY | $HH) $CM* $AL;
|
||||
if (fAL.contains(thisChar) && (fHY.contains(prevChar) || fHH.contains(prevChar)) &&
|
||||
prevPosX2 == -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 21
|
||||
if (fBA.contains(thisChar) ||
|
||||
fHY.contains(thisChar) ||
|
||||
|
|
|
@ -19,6 +19,7 @@ locale = en;
|
|||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -144,6 +145,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
|
|
@ -26,6 +26,7 @@ locale = en@lb=loose;
|
|||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -125,7 +126,7 @@ LB12: GL CM* [^CM];
|
|||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
||||
# LB 13 ICU Tailoring, matches tailoring exmaple 8 from UAX 14.
|
||||
# LB 13 ICU Tailoring, matches tailoring example 8 from UAX 14.
|
||||
#
|
||||
# LB13.1 [^SP] CM* [CL CP EX IS SY] # original UAX 14 rule.
|
||||
# LB13.2 SP CM* [CL CP EX IS SY]
|
||||
|
@ -152,6 +153,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
@ -176,7 +180,7 @@ LB23a.2: (ID | EB | EM) CM* PO;
|
|||
LB24.2: (PR | PO) CM* (AL | HL);
|
||||
LB24.3: (AL | HL | CM) CM* (PR | PO);
|
||||
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAx 14.
|
||||
# Numbers. Equivalent to Tailoring example 8 from UAX 14.
|
||||
LB25: ((PR | PO)CM*)? ((OP | HY)CM*)? NU (CM*(NU | SY | IS))* (CM*(CL | CP))? (CM*(PR | PO))?;
|
||||
|
||||
LB26.1: JL CM* (JL | JV | H2 | H3);
|
||||
|
|
|
@ -37,9 +37,10 @@ locale = ja@lb=loose;
|
|||
|
||||
|
||||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [[:LineBreak = Alphabetic:]];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -169,6 +170,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
|
|
|
@ -33,6 +33,7 @@ locale = en@lb=normal;
|
|||
AI = [:LineBreak = Ambiguous:];
|
||||
AL = [:LineBreak = Alphabetic:];
|
||||
BA = [:LineBreak = Break_After:];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -158,6 +159,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
LB21a: HL CM* (HY | BA) CM* [^CM CB];
|
||||
|
|
|
@ -34,6 +34,7 @@ AI = [:LineBreak = Ambiguous:];
|
|||
AL = [:LineBreak = Alphabetic:];
|
||||
BAX = [\u2010 \u2013];
|
||||
BA = [[:LineBreak = Break_After:] - BAX];
|
||||
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
BB = [:LineBreak = Break_Before:];
|
||||
BK = [:LineBreak = Mandatory_Break:];
|
||||
B2 = [:LineBreak = Break_Both:];
|
||||
|
@ -163,6 +164,9 @@ LB20.2: . CM* ÷ CB;
|
|||
LB20.3: CB CM* ZWJ [^CM];
|
||||
LB20.4: CB CM* ÷;
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
LB20.09: ^(HY | HH) CM* AL;
|
||||
|
||||
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
|
||||
# not picking up the continuing match after the BA from 21a.
|
||||
# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
|
||||
|
|
|
@ -1086,23 +1086,21 @@ Bangkok)•</data>
|
|||
# Finnish line breaking
|
||||
#
|
||||
# These rules deal with hyphens when there is a space on the leading side.
|
||||
# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
|
||||
# When followed by a letter, there should be a break opportunity between
|
||||
# the space and the hyphen, and not after the hyphen.
|
||||
# See CLDR ticket 3029.
|
||||
# See ICU ticket 8151.
|
||||
# As of ICU 63, the Finnish tailoring behavior is moved to root.
|
||||
|
||||
<locale root>
|
||||
<line>
|
||||
<data>•abc •- •def •abc •-•def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
|
||||
<locale fi>
|
||||
<line>
|
||||
# TODO: problems with Finnish line break rules cause these two lines to fail.
|
||||
#<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
#<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
|
||||
<data>•abc •- •def •abc •-def •abc- •def •</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐def •abc‐ •def •</data> # With Unicode u2010 hyphen
|
||||
<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen
|
||||
<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
|
||||
|
||||
# Test for #10176 (in fi)
|
||||
<line>
|
||||
|
|
Loading…
Add table
Reference in a new issue