ICU-22986 GL takes CM

This commit is contained in:
Robin Leroy 2024-12-10 18:04:54 +01:00
parent e3bc073737
commit 7d60bb844e
34 changed files with 81 additions and 39 deletions

View file

@ -297,7 +297,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -298,7 +298,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -306,7 +306,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -318,7 +318,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -331,7 +331,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -299,7 +299,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -304,7 +304,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -317,7 +317,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -310,7 +310,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
# and then to default UAX #14 behaviour (UTC-179-C32).
#
^($HY | $HH) $CM* $ALPlus;
$GL ($HY | $HH) $CM* $ALPlus;
$GL $CM* ($HY | $HH) $CM* $ALPlus;
# Non-breaking CB from LB8a:
$CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus;
# Non-breaking SP from LB14:

View file

@ -1705,7 +1705,34 @@ class RemapRule : public SegmentationRule {
resolved[i].appliedRule = this;
resolved[i].indexInRemapped.reset();
}
// While replacing, we need to check that we are not creating
// surrogate pairs. Since appendReplacement performs two
// concatenations (the unreplaced segment and the replacement), we
// need to check in two places: whether the unreplaced segment
// starts with a trailing surrogate that ends up after a leading
// surrogate, and whether the replaced segment starts with a leading
// surrogate that ends up after a trailing surrogate.
// We break the pair by replacing one of the surrogates with U+FFFF,
// which has the same properties for all but line breaking, and the
// same behaviour in line breaking (lb=SG and lb=XX are both treated
// as lb=AL).
std::optional<int32_t> trailingLead;
if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
trailingLead = result.length() - 1;
}
matcher->appendReplacement(result, replacement_, status);
if (trailingLead && *trailingLead + 1 < result.length() &&
U16_IS_TRAIL(result[*trailingLead + 1])) {
result.setCharAt(*trailingLead, u'\uFFFF');
}
if (matcher->start(status) + offset > 0 &&
U16_IS_LEAD(result[matcher->start(status) + offset - 1]) &&
U16_IS_TRAIL(result[matcher->start(status) + offset])) {
result.setCharAt(matcher->start(status) + offset, u'\uFFFF');
}
offset = result.length() - *resolved[i].indexInRemapped;
}
for (; i < static_cast<int32_t>(resolved.size()); ++i) {
@ -1714,7 +1741,17 @@ class RemapRule : public SegmentationRule {
}
*resolved[i].indexInRemapped += offset;
}
std::optional<int32_t> trailingLead;
if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) {
trailingLead = result.length() - 1;
}
matcher->appendTail(result);
if (trailingLead && *trailingLead + 1 < result.length() &&
U16_IS_TRAIL(result[*trailingLead + 1])) {
result.setCharAt(*trailingLead, u'\uFFFF');
}
if (resolved.back().indexInRemapped != result.length()) {
std::string indices;
for (const auto r : resolved) {
@ -2906,20 +2943,11 @@ RBBILineMonkey::RBBILineMonkey() :
std::list<std::pair<std::string, UnicodeSet>> partition;
// TODO(egg): The following two workarounds for what seems to be ICU bugs;
// with UREGEX_DOTALL (but not UREGEX_MULTILINE):
// 1. /.*\u000A/ does not match CR LF;
// 2. /$/ matches ( BK | CR | LF | NL ) eot.
rules.push_back(std::make_unique<RegexRule>(uR"(CR LF ÷)", uR"(\u000D\u000A)", u'÷', uR"()"));
rules.push_back(std::make_unique<RegexRule>(
uR"([^ BK CR LF NL ] × [ BK CR LF NL ] eot)",
uR"([^ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ])",
u'×',
uR"([ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ] $)"));
rules.push_back(std::make_unique<RegexRule>(uR"(sot ÷ contra LB2)", uR"(^)", u'÷', uR"()"));
// This one could be part of the rules.
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"($)"));
// Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead.
// The generated rules use the same (?!.).
rules.push_back(std::make_unique<RegexRule>(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"((?!.))"));
// --- NOLI ME TANGERE ---
// Generated by GenerateBreakTest.java in the Unicode tools.
@ -3015,7 +3043,7 @@ RBBILineMonkey::RBBILineMonkey() :
rules.push_back(std::make_unique<RegexRule>(uR"(× $CP)", uR"()", u'×', uR"(\p{Line_Break=CP})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $SY)", uR"()", u'×', uR"(\p{Line_Break=Break_Symbols})"));
rules.push_back(std::make_unique<RegexRule>(uR"($OP $SP* ×)", uR"(\p{Line_Break=Open_Punctuation} \p{Line_Break=Space}*)", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | ^ ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ))", uR"()", u'×', uR"([\p{Line_Break=Quotation} && \p{gc=Pf}] ( \p{Line_Break=Space} | \p{Line_Break=Glue} | \p{Line_Break=Word_Joiner} | \p{Line_Break=Close_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=CP} | \p{Line_Break=Exclamation} | \p{Line_Break=Infix_Numeric} | \p{Line_Break=Break_Symbols} | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=ZWSpace} | (?!.) ))"));
rules.push_back(std::make_unique<RegexRule>(uR"($SP ÷ $IS $NU)", uR"(\p{Line_Break=Space})", u'÷', uR"(\p{Line_Break=Infix_Numeric} \p{Line_Break=Numeric})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $IS)", uR"()", u'×', uR"(\p{Line_Break=Infix_Numeric})"));
@ -3027,10 +3055,10 @@ RBBILineMonkey::RBBILineMonkey() :
rules.push_back(std::make_unique<RegexRule>(uR"([^$EastAsian] × $QU)", uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])", u'×', uR"(\p{Line_Break=Quotation})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $QU ( [^$EastAsian] | $eot ))", uR"()", u'×', uR"(\p{Line_Break=Quotation} ( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | (?!.) ))"));
rules.push_back(std::make_unique<RegexRule>(uR"($QU × [^$EastAsian])", uR"(\p{Line_Break=Quotation})", u'×', uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | [^$EastAsian] ) $QU ×)", uR"(( ^ | [^[\p{ea=F}\p{ea=W}\p{ea=H}]] ) \p{Line_Break=Quotation})", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( [^$EastAsian] | $sot ) $QU ×)", uR"(( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | ^ ) \p{Line_Break=Quotation})", u'×', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(÷ $CB)", uR"()", u'÷', uR"(\p{Line_Break=Contingent_Break})"));
rules.push_back(std::make_unique<RegexRule>(uR"($CB ÷)", uR"(\p{Line_Break=Contingent_Break})", u'÷', uR"()"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
rules.push_back(std::make_unique<RegexRule>(uR"(( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} | ^ ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $BA)", uR"()", u'×', uR"(\p{Line_Break=Break_After})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $HY)", uR"()", u'×', uR"(\p{Line_Break=Hyphen})"));
rules.push_back(std::make_unique<RegexRule>(uR"(× $NS)", uR"()", u'×', uR"([\p{Line_Break=Nonstarter} \p{Line_Break=Conditional_Japanese_Starter}])"));
@ -3080,6 +3108,7 @@ RBBILineMonkey::RBBILineMonkey() :
// --- End of generated code. ---
// TODO(egg): This could just as well be part of the rules…
rules.push_back(std::make_unique<RegexRule>(uR"(ALL ÷ / ÷ ALL)",
uR"()", u'÷',
@ -3122,7 +3151,12 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
}
for (std::size_t i = 0; i < resolved.size(); ++i) {
if (resolved[i].appliedRule == nullptr) {
printf("Failed to resolve at %zu" , i);
printf("Failed to resolve at %zu between U+%04X and U+%04X ", i, s.char32At(i-1), s.char32At(i));
if (resolved[i].indexInRemapped.has_value()) {
printf("which is remapped %d between U+%04X and U+%04X", *resolved[i].indexInRemapped,
remapped.char32At(*resolved[i].indexInRemapped - 1),
remapped.char32At(*resolved[i].indexInRemapped));
}
std::terminate();
} else {
setAppliedRule(i, resolved[i].appliedRule->name().c_str());

View file

@ -176,7 +176,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -180,7 +180,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -181,7 +181,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -200,7 +200,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;

View file

@ -182,7 +182,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -186,7 +186,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -2214,3 +2214,7 @@ Bangkok)•</data>
<data>•« Complex »« chaining » •</data>
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.
# A hyphen that follows a non-breaking space bearing a combining mark (GL CM*
# HY) is treated as word-initial; by LB20a there is no break opportunity after
# it. ICU 76 handled that case incorrectly (ICU-22986).
<data>• ̄-k•</data>

View file

@ -1276,7 +1276,6 @@ public class RBBITestMonkey extends CoreTestFmwk {
fLF.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fNL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fSP.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fGL.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fZW.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 20a");
continue;
@ -1285,7 +1284,8 @@ public class RBBITestMonkey extends CoreTestFmwk {
fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
breakObliviousPrevPosX2 = moveIndex32(fText, breakObliviousPrevPosX2, -1);
}
if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2)) ||
fGL.contains(fText.codePointAt(breakObliviousPrevPosX2))) {
setAppliedRule(pos, "LB 20a");
continue;
}

View file

@ -176,7 +176,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -180,7 +180,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -181,7 +181,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -200,7 +200,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA BAX HY] CM* GL;

View file

@ -182,7 +182,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -186,7 +186,7 @@ LB11.2: SP WJ;
LB11.3: WJ CM* [^CM];
# Needs to apply before LB12, because the new monkeys are not greedy.
LB20a.2: GL (HY | HH) CM* AL;
LB20a.2: GL CM* (HY | HH) CM* AL;
LB12: GL CM* [^CM];
LB12a: [^SP BA HY] CM* GL;

View file

@ -2214,3 +2214,7 @@ Bangkok)•</data>
<data>•« Complex »« chaining » •</data>
<data>•« .618 »•</data> # Interaction with the ICU tailoring to break before such numbers.
# A hyphen that follows a non-breaking space bearing a combining mark (GL CM*
# HY) is treated as word-initial; by LB20a there is no break opportunity after
# it. ICU 76 handled that case incorrectly (ICU-22986).
<data>• ̄-k•</data>