mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 13:35:32 +00:00
ICU-22986 GL takes CM
This commit is contained in:
parent
e3bc073737
commit
7d60bb844e
34 changed files with 81 additions and 39 deletions
|
@ -297,7 +297,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -298,7 +298,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -306,7 +306,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -318,7 +318,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -331,7 +331,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -299,7 +299,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -304,7 +304,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -317,7 +317,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -310,7 +310,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# and then to default UAX #14 behaviour (UTC-179-C32).
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
$GL ($HY | $HH) $CM* $ALPlus;
|
||||
$GL $CM* ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking CB from LB8a:
|
||||
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
|
||||
# Non-breaking SP from LB14:
|
||||
|
|
|
@ -1705,7 +1705,34 @@ class RemapRule : public SegmentationRule {
|
|||
resolved[i].appliedRule = this;
|
||||
resolved[i].indexInRemapped.reset();
|
||||
}
|
||||
// While replacing, we need to check that we are not creating
|
||||
// surrogate pairs. Since appendReplacement performs two
|
||||
// concatenations (the unreplaced segment and the replacement), we
|
||||
// need to check in two places: whether the unreplaced segment
|
||||
// starts with a trailing surrogate that ends up after a leading
|
||||
// surrogate, and whether the replaced segment starts with a leading
|
||||
// surrogate that ends up after a trailing surrogate.
|
||||
// We break the pair by replacing one of the surrogates with U+FFFF,
|
||||
// which has the same properties for all but line breaking, and the
|
||||
// same behaviour in line breaking (lb=SG and lb=XX are both treated
|
||||
// as lb=AL).
|
||||
std::optional<int32_t> trailingLead;
|
||||
if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
|
||||
trailingLead = result.length() - 1;
|
||||
}
|
||||
|
||||
matcher->appendReplacement(result, replacement_, status);
|
||||
|
||||
if (trailingLead && *trailingLead + 1 < result.length() &&
|
||||
U16_IS_TRAIL(result[*trailingLead + 1])) {
|
||||
result.setCharAt(*trailingLead, u'\uFFFF');
|
||||
}
|
||||
|
||||
if (matcher->start(status) + offset > 0 &&
|
||||
U16_IS_LEAD(result[matcher->start(status) + offset - 1]) &&
|
||||
U16_IS_TRAIL(result[matcher->start(status) + offset])) {
|
||||
result.setCharAt(matcher->start(status) + offset, u'\uFFFF');
|
||||
}
|
||||
offset = result.length() - *resolved[i].indexInRemapped;
|
||||
}
|
||||
for (; i < static_cast<int32_t>(resolved.size()); ++i) {
|
||||
|
@ -1714,7 +1741,17 @@ class RemapRule : public SegmentationRule {
|
|||
}
|
||||
*resolved[i].indexInRemapped += offset;
|
||||
}
|
||||
|
||||
std::optional<int32_t> trailingLead;
|
||||
if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
|
||||
trailingLead = result.length() - 1;
|
||||
}
|
||||
matcher->appendTail(result);
|
||||
if (trailingLead && *trailingLead + 1 < result.length() &&
|
||||
U16_IS_TRAIL(result[*trailingLead + 1])) {
|
||||
result.setCharAt(*trailingLead, u'\uFFFF');
|
||||
}
|
||||
|
||||
if (resolved.back().indexInRemapped != result.length()) {
|
||||
std::string indices;
|
||||
for (const auto r : resolved) {
|
||||
|
@ -2906,20 +2943,11 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
|
||||
std::list<std::pair<std::string, UnicodeSet>> partition;
|
||||
|
||||
// TODO(egg): The following two rules are workarounds for what seem to be ICU bugs;
|
||||
// with UREGEX_DOTALL (but not UREGEX_MULTILINE):
|
||||
// 1. /.*\u000A/ does not match CR LF;
|
||||
// 2. /$/ matches ( BK | CR | LF | NL ) eot.
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(CR LF ÷)", uR"(\u000D\u000A)", u'÷', uR"()"));
|
||||
rules.push_back(std::make_unique<RegexRule>(
|
||||
uR"([^ BK CR LF NL ] × [ BK CR LF NL ] eot)",
|
||||
uR"([^ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ])",
|
||||
u'×',
|
||||
uR"([ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ] $)"));
|
||||
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(sot ÷ contra LB2)", uR"(^)", u'÷', uR"()"));
|
||||
// This one could be part of the rules.
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"($)"));
|
||||
// Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead.
|
||||
// The generated rules use the same (?!.).
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"((?!.))"));
|
||||
|
||||
// --- NOLI ME TANGERE ---
|
||||
// Generated by GenerateBreakTest.java in the Unicode tools.
|
||||
|
@ -3015,7 +3043,7 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
rules.push_back(std::make_unique<RegexRule>(uR"(× $CP)", uR"()", u'×', uR"(\p{Line_Break=CP})"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(× $SY)", uR"()", u'×', uR"(\p{Line_Break=Break_Symbols})"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"($OP $SP* ×)", uR"(\p{Line_Break=Open_Punctuation} \p{Line_Break=Space}*)", u'×', uR"()"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | ^ ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(× $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ))", uR"()", u'×', uR"([\p{Line_Break=Quotation} && \p{gc=Pf}] ( \p{Line_Break=Space} | \p{Line_Break=Glue} | \p{Line_Break=Word_Joiner} | \p{Line_Break=Close_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=CP} | \p{Line_Break=Exclamation} | \p{Line_Break=Infix_Numeric} | \p{Line_Break=Break_Symbols} | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=ZWSpace} | (?!.) ))"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"($SP ÷ $IS $NU)", uR"(\p{Line_Break=Space})", u'÷', uR"(\p{Line_Break=Infix_Numeric} \p{Line_Break=Numeric})"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(× $IS)", uR"()", u'×', uR"(\p{Line_Break=Infix_Numeric})"));
|
||||
|
@ -3027,10 +3055,10 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
rules.push_back(std::make_unique<RegexRule>(uR"([^$EastAsian] × $QU)", uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])", u'×', uR"(\p{Line_Break=Quotation})"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(× $QU ( [^$EastAsian] | $eot ))", uR"()", u'×', uR"(\p{Line_Break=Quotation} ( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | (?!.) ))"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"($QU × [^$EastAsian])", uR"(\p{Line_Break=Quotation})", u'×', uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | [^$EastAsian] ) $QU ×)", uR"(( ^ | [^[\p{ea=F}\p{ea=W}\p{ea=H}]] ) \p{Line_Break=Quotation})", u'×', uR"()"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(( [^$EastAsian] | $sot ) $QU ×)", uR"(( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | ^ ) \p{Line_Break=Quotation})", u'×', uR"()"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(÷ $CB)", uR"()", u'÷', uR"(\p{Line_Break=Contingent_Break})"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"($CB ÷)", uR"(\p{Line_Break=Contingent_Break})", u'÷', uR"()"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} | ^ ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(× $BA)", uR"()", u'×', uR"(\p{Line_Break=Break_After})"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(× $HY)", uR"()", u'×', uR"(\p{Line_Break=Hyphen})"));
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(× $NS)", uR"()", u'×', uR"([\p{Line_Break=Nonstarter} \p{Line_Break=Conditional_Japanese_Starter}])"));
|
||||
|
@ -3080,6 +3108,7 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
// --- End of generated code. ---
|
||||
|
||||
|
||||
|
||||
// TODO(egg): This could just as well be part of the rules…
|
||||
rules.push_back(std::make_unique<RegexRule>(uR"(ALL ÷ / ÷ ALL)",
|
||||
uR"()", u'÷',
|
||||
|
@ -3122,7 +3151,12 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
|
|||
}
|
||||
for (std::size_t i = 0; i < resolved.size(); ++i) {
|
||||
if (resolved[i].appliedRule == nullptr) {
|
||||
printf("Failed to resolve at %zu" , i);
|
||||
printf("Failed to resolve at %zu between U+%04X and U+%04X ", i, s.char32At(i-1), s.char32At(i));
|
||||
if (resolved[i].indexInRemapped.has_value()) {
|
||||
printf("which is remapped %d between U+%04X and U+%04X", *resolved[i].indexInRemapped,
|
||||
remapped.char32At(*resolved[i].indexInRemapped - 1),
|
||||
remapped.char32At(*resolved[i].indexInRemapped));
|
||||
}
|
||||
std::terminate();
|
||||
} else {
|
||||
setAppliedRule(i, resolved[i].appliedRule->name().c_str());
|
||||
|
|
|
@ -176,7 +176,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -180,7 +180,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -181,7 +181,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -200,7 +200,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
|
|
|
@ -182,7 +182,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -186,7 +186,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
4
icu4c/source/test/testdata/rbbitst.txt
vendored
4
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -2214,3 +2214,7 @@ Bangkok)•</data>
|
|||
<data>•« Complex »« chaining » •</data>
|
||||
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.
|
||||
|
||||
# A hyphen following a non-breaking space that carries an intervening combining
|
||||
# mark is treated as word-initial; by LB20a it has no break opportunity after
|
||||
# it. A bug in ICU 76 incorrectly handled that case (ICU-22986).
|
||||
<data>• ̄-k•</data>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -1276,7 +1276,6 @@ public class RBBITestMonkey extends CoreTestFmwk {
|
|||
fLF.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
|
||||
fNL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
|
||||
fSP.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
|
||||
fGL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
|
||||
fZW.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
|
||||
setAppliedRule(pos, "LB 20a");
|
||||
continue;
|
||||
|
@ -1285,7 +1284,8 @@ public class RBBITestMonkey extends CoreTestFmwk {
|
|||
fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
|
||||
breakObliviousPrevPosX2 = moveIndex32(fText, breakObliviousPrevPosX2, -1);
|
||||
}
|
||||
if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
|
||||
if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
|
||||
fGL.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
|
||||
setAppliedRule(pos, "LB 20a");
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -176,7 +176,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -180,7 +180,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -181,7 +181,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -200,7 +200,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA BAX HY] CM* GL;
|
||||
|
|
|
@ -182,7 +182,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -186,7 +186,7 @@ LB11.2: SP WJ;
|
|||
LB11.3: WJ CM* [^CM];
|
||||
|
||||
# Needs to apply before LB12, because the new monkeys are not greedy.
|
||||
LB20a.2: GL (HY | HH) CM* AL;
|
||||
LB20a.2: GL CM* (HY | HH) CM* AL;
|
||||
LB12: GL CM* [^CM];
|
||||
|
||||
LB12a: [^SP BA HY] CM* GL;
|
||||
|
|
|
@ -2214,3 +2214,7 @@ Bangkok)•</data>
|
|||
<data>•« Complex »« chaining » •</data>
|
||||
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.
|
||||
|
||||
# A hyphen following a non-breaking space that carries an intervening combining
|
||||
# mark is treated as word-initial; by LB20a it has no break opportunity after
|
||||
# it. A bug in ICU 76 incorrectly handled that case (ICU-22986).
|
||||
<data>• ̄-k•</data>
|
Loading…
Add table
Reference in a new issue