diff --git a/icu4c/source/data/brkitr/rules/line.txt b/icu4c/source/data/brkitr/rules/line.txt index 9f3e44984ea..e43e70b3631 100644 --- a/icu4c/source/data/brkitr/rules/line.txt +++ b/icu4c/source/data/brkitr/rules/line.txt @@ -297,7 +297,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_cj.txt b/icu4c/source/data/brkitr/rules/line_cj.txt index fc615f55db2..793163898e0 100644 --- a/icu4c/source/data/brkitr/rules/line_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_cj.txt @@ -298,7 +298,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_loose.txt b/icu4c/source/data/brkitr/rules/line_loose.txt index 2bb9be5845f..9ff4e17eb3a 100644 --- a/icu4c/source/data/brkitr/rules/line_loose.txt +++ b/icu4c/source/data/brkitr/rules/line_loose.txt @@ -306,7 +306,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). 
# ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt index 15715a22512..428d225f16d 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt @@ -318,7 +318,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt index 87ab33b48a1..2edf4b3bc33 100644 --- a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt @@ -331,7 +331,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_normal.txt b/icu4c/source/data/brkitr/rules/line_normal.txt index c41280c28d1..bf6dee8c05c 100644 --- a/icu4c/source/data/brkitr/rules/line_normal.txt +++ b/icu4c/source/data/brkitr/rules/line_normal.txt @@ -299,7 +299,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). 
# ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt index 31dd65854cb..f596454621d 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt @@ -304,7 +304,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt index 85d771fcdbf..e0bbd00025f 100644 --- a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt @@ -317,7 +317,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). # ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt index 41e05bf4963..14b118789e7 100644 --- a/icu4c/source/data/brkitr/rules/line_phrase_cj.txt +++ b/icu4c/source/data/brkitr/rules/line_phrase_cj.txt @@ -310,7 +310,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB]; # and then to default UAX #14 behaviour (UTC-179-C32). 
# ^($HY | $HH) $CM* $ALPlus; -$GL ($HY | $HH) $CM* $ALPlus; +$GL $CM* ($HY | $HH) $CM* $ALPlus; # Non-breaking CB from LB8a: $CB $CM* $ZWJ ($HY | $HH) $CM* $ALPlus; # Non-breaking SP from LB14: diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index c043a0a5d83..fd72c8e4afd 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -1705,7 +1705,34 @@ class RemapRule : public SegmentationRule { resolved[i].appliedRule = this; resolved[i].indexInRemapped.reset(); } + // While replacing, we need to check that we are not creating + // surrogate pairs. Since appendReplacement performs two + // concatenations (the unreplaced segment and the replacement), we + // need to check in two places: whether the unreplaced segment + // starts with a trailing surrogate that ends up after a leading + // surrogate, and whether the replacement starts with a trailing + // surrogate that ends up after a leading surrogate. + // We break the pair by replacing one of the surrogates with U+FFFF, + // which has the same properties for all but line breaking, and the + // same behaviour in line breaking (lb=SG and lb=XX are both treated + // as lb=AL). 
+ std::optional trailingLead; + if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) { + trailingLead = result.length() - 1; + } + matcher->appendReplacement(result, replacement_, status); + + if (trailingLead && *trailingLead + 1 < result.length() && + U16_IS_TRAIL(result[*trailingLead + 1])) { + result.setCharAt(*trailingLead, u'\uFFFF'); + } + + if (matcher->start(status) + offset > 0 && + U16_IS_LEAD(result[matcher->start(status) + offset - 1]) && + U16_IS_TRAIL(result[matcher->start(status) + offset])) { + result.setCharAt(matcher->start(status) + offset, u'\uFFFF'); + } offset = result.length() - *resolved[i].indexInRemapped; } for (; i < static_cast(resolved.size()); ++i) { @@ -1714,7 +1741,17 @@ class RemapRule : public SegmentationRule { } *resolved[i].indexInRemapped += offset; } + + std::optional trailingLead; + if (result.length() > 0 && U16_IS_LEAD(result[result.length() - 1])) { + trailingLead = result.length() - 1; + } matcher->appendTail(result); + if (trailingLead && *trailingLead + 1 < result.length() && + U16_IS_TRAIL(result[*trailingLead + 1])) { + result.setCharAt(*trailingLead, u'\uFFFF'); + } + if (resolved.back().indexInRemapped != result.length()) { std::string indices; for (const auto r : resolved) { @@ -2906,20 +2943,11 @@ RBBILineMonkey::RBBILineMonkey() : std::list> partition; - // TODO(egg): The following two workarounds for what seems to be ICU bugs; - // with UREGEX_DOTALL (but not UREGEX_MULTILINE): - // 1. /.*\u000A/ does not match CR LF; - // 2. /$/ matches ( BK | CR | LF | NL ) eot. - rules.push_back(std::make_unique(uR"(CR LF ÷)", uR"(\u000D\u000A)", u'÷', uR"()")); - rules.push_back(std::make_unique( - uR"([^ BK CR LF NL ] × [ BK CR LF NL ] eot)", - uR"([^ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ])", - u'×', - uR"([ \p{lb=BK} \p{lb=CR} \p{lb=LF} \p{lb=NL} ] $)")); - rules.push_back(std::make_unique(uR"(sot ÷ contra LB2)", uR"(^)", u'÷', uR"()")); // This one could be part of the rules. 
- rules.push_back(std::make_unique(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"($)")); + // Note that /$/ matches ( BK | CR | LF | NL ) eot, so we use (?!.) instead. + // The generated rules use the same (?!.). + rules.push_back(std::make_unique(uR"(LB3 ÷ eot)", uR"()", u'÷', uR"((?!.))")); // --- NOLI ME TANGERE --- // Generated by GenerateBreakTest.java in the Unicode tools. @@ -3015,7 +3043,7 @@ RBBILineMonkey::RBBILineMonkey() : rules.push_back(std::make_unique(uR"(× $CP)", uR"()", u'×', uR"(\p{Line_Break=CP})")); rules.push_back(std::make_unique(uR"(× $SY)", uR"()", u'×', uR"(\p{Line_Break=Break_Symbols})")); rules.push_back(std::make_unique(uR"($OP $SP* ×)", uR"(\p{Line_Break=Open_Punctuation} \p{Line_Break=Space}*)", u'×', uR"()")); - rules.push_back(std::make_unique(uR"(( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* ×)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()")); + rules.push_back(std::make_unique(uR"(( $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW | $sot ) $QU_Pi $SP* ×)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Open_Punctuation} | \p{Line_Break=Quotation} | \p{Line_Break=Glue} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | ^ ) [\p{Line_Break=Quotation} && \p{gc=Pi}] \p{Line_Break=Space}*)", u'×', uR"()")); rules.push_back(std::make_unique(uR"(× $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ))", uR"()", u'×', uR"([\p{Line_Break=Quotation} && \p{gc=Pf}] ( \p{Line_Break=Space} | \p{Line_Break=Glue} | \p{Line_Break=Word_Joiner} | \p{Line_Break=Close_Punctuation} | \p{Line_Break=Quotation} | 
\p{Line_Break=CP} | \p{Line_Break=Exclamation} | \p{Line_Break=Infix_Numeric} | \p{Line_Break=Break_Symbols} | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=ZWSpace} | (?!.) ))")); rules.push_back(std::make_unique(uR"($SP ÷ $IS $NU)", uR"(\p{Line_Break=Space})", u'÷', uR"(\p{Line_Break=Infix_Numeric} \p{Line_Break=Numeric})")); rules.push_back(std::make_unique(uR"(× $IS)", uR"()", u'×', uR"(\p{Line_Break=Infix_Numeric})")); @@ -3027,10 +3055,10 @@ RBBILineMonkey::RBBILineMonkey() : rules.push_back(std::make_unique(uR"([^$EastAsian] × $QU)", uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])", u'×', uR"(\p{Line_Break=Quotation})")); rules.push_back(std::make_unique(uR"(× $QU ( [^$EastAsian] | $eot ))", uR"()", u'×', uR"(\p{Line_Break=Quotation} ( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | (?!.) ))")); rules.push_back(std::make_unique(uR"($QU × [^$EastAsian])", uR"(\p{Line_Break=Quotation})", u'×', uR"([^[\p{ea=F}\p{ea=W}\p{ea=H}]])")); - rules.push_back(std::make_unique(uR"(( $sot | [^$EastAsian] ) $QU ×)", uR"(( ^ | [^[\p{ea=F}\p{ea=W}\p{ea=H}]] ) \p{Line_Break=Quotation})", u'×', uR"()")); + rules.push_back(std::make_unique(uR"(( [^$EastAsian] | $sot ) $QU ×)", uR"(( [^[\p{ea=F}\p{ea=W}\p{ea=H}]] | ^ ) \p{Line_Break=Quotation})", u'×', uR"()")); rules.push_back(std::make_unique(uR"(÷ $CB)", uR"()", u'÷', uR"(\p{Line_Break=Contingent_Break})")); rules.push_back(std::make_unique(uR"($CB ÷)", uR"(\p{Line_Break=Contingent_Break})", u'÷', uR"()")); - rules.push_back(std::make_unique(uR"(( $sot | $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL ) ( $HY | $Hyphen ) × $AL)", uR"(( ^ | \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} 
\p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])")); + rules.push_back(std::make_unique(uR"(( $BK | $CR | $LF | $NL | $SP | $ZW | $CB | $GL | $sot ) ( $HY | $Hyphen ) × $AL)", uR"(( \p{Line_Break=Mandatory_Break} | \p{Line_Break=Carriage_Return} | \p{Line_Break=Line_Feed} | \p{Line_Break=Next_Line} | \p{Line_Break=Space} | \p{Line_Break=ZWSpace} | \p{Line_Break=Contingent_Break} | \p{Line_Break=Glue} | ^ ) ( \p{Line_Break=Hyphen} | [\u2010] ))", u'×', uR"([\p{Line_Break=Ambiguous} \p{Line_Break=Alphabetic} \p{Line_Break=Surrogate} \p{Line_Break=Unknown} [\p{Line_Break=Complex_Context}--\p{gc=Mn}--\p{gc=Mc}]])")); rules.push_back(std::make_unique(uR"(× $BA)", uR"()", u'×', uR"(\p{Line_Break=Break_After})")); rules.push_back(std::make_unique(uR"(× $HY)", uR"()", u'×', uR"(\p{Line_Break=Hyphen})")); rules.push_back(std::make_unique(uR"(× $NS)", uR"()", u'×', uR"([\p{Line_Break=Nonstarter} \p{Line_Break=Conditional_Japanese_Starter}])")); @@ -3080,6 +3108,7 @@ RBBILineMonkey::RBBILineMonkey() : // --- End of generated code. 
--- + // TODO(egg): This could just as well be part of the rules… rules.push_back(std::make_unique(uR"(ALL ÷ / ÷ ALL)", uR"()", u'÷', @@ -3122,7 +3151,12 @@ void RBBILineMonkey::setText(const UnicodeString &s) { } for (std::size_t i = 0; i < resolved.size(); ++i) { if (resolved[i].appliedRule == nullptr) { - printf("Failed to resolve at %zu" , i); + printf("Failed to resolve at %zu between U+%04X and U+%04X ", i, s.char32At(i-1), s.char32At(i)); + if (resolved[i].indexInRemapped.has_value()) { + printf("which is remapped %d between U+%04X and U+%04X", *resolved[i].indexInRemapped, + remapped.char32At(*resolved[i].indexInRemapped - 1), + remapped.char32At(*resolved[i].indexInRemapped)); + } std::terminate(); } else { setAppliedRule(i, resolved[i].appliedRule->name().c_str()); diff --git a/icu4c/source/test/testdata/break_rules/line.txt b/icu4c/source/test/testdata/break_rules/line.txt index 9f85b791713..e2154abf630 100644 --- a/icu4c/source/test/testdata/break_rules/line.txt +++ b/icu4c/source/test/testdata/break_rules/line.txt @@ -176,7 +176,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_cj.txt b/icu4c/source/test/testdata/break_rules/line_cj.txt index 7aad76ecf10..bb0a6880ea2 100644 --- a/icu4c/source/test/testdata/break_rules/line_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_cj.txt @@ -180,7 +180,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_loose.txt b/icu4c/source/test/testdata/break_rules/line_loose.txt index 72e7563c927..f9152060bf2 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose.txt @@ -181,7 +181,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt index 99d01874d1f..b04236532bb 100644 --- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt @@ -200,7 +200,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA BAX HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_normal.txt b/icu4c/source/test/testdata/break_rules/line_normal.txt index 21129853979..c7c518d5b68 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal.txt @@ -182,7 +182,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt index 2061f917084..cfa9c7968e1 100644 --- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt +++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt @@ -186,7 +186,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 1c7fe997569..781ce068be7 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -2214,3 +2214,7 @@ Bangkok)• •« Complex »« chaining » • •« .618 »• # Interaction with the ICU tailoring to break before such numbers. +# A hyphen following non-breaking space that carries an intervening combining +# mark is treated as word-initial; by LB20a it has no break opportunity after +# it. A bug in ICU 76 incorrectly handled that case (ICU-22986). 
+• ̄-k• \ No newline at end of file diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk index 8d0172d055c..fea6eaaf279 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk index dbbbc0dfbae..1f8d5552424 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk index 9f77680c283..5ddccb23944 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk index 4199ddeda1c..b890ebe13fe 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk index bebfe7285a2..14aa5b32f8d 100644 Binary files 
a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_loose_phrase_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk index 0229e2cb2f2..f5df704f492 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk index 9b13706bfb5..0c912e83e75 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk index 7cbc6998771..208baf36649 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_normal_phrase_cj.brk differ diff --git a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk index b9f1fa48e7d..c627f2f2d60 100644 Binary files a/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk and b/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata/brkitr/line_phrase_cj.brk 
differ diff --git a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java index 994433250be..8d3ead20fb6 100644 --- a/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java +++ b/icu4j/main/core/src/test/java/com/ibm/icu/dev/test/rbbi/RBBITestMonkey.java @@ -1276,7 +1276,6 @@ public class RBBITestMonkey extends CoreTestFmwk { fLF.contains(fText.codePointAt(breakObliviousPrevPosX2)) || fNL.contains(fText.codePointAt(breakObliviousPrevPosX2)) || fSP.contains(fText.codePointAt(breakObliviousPrevPosX2)) || - fGL.contains(fText.codePointAt(breakObliviousPrevPosX2)) || fZW.contains(fText.codePointAt(breakObliviousPrevPosX2))) { setAppliedRule(pos, "LB 20a"); continue; @@ -1285,7 +1284,8 @@ public class RBBITestMonkey extends CoreTestFmwk { fCM.contains(fText.codePointAt(breakObliviousPrevPosX2))) { breakObliviousPrevPosX2 = moveIndex32(fText, breakObliviousPrevPosX2, -1); } - if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2))) { + if (fCB.contains(fText.codePointAt(breakObliviousPrevPosX2)) || + fGL.contains(fText.codePointAt(breakObliviousPrevPosX2))) { setAppliedRule(pos, "LB 20a"); continue; } diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt index 9f85b791713..e2154abf630 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line.txt @@ -176,7 +176,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt index 7aad76ecf10..bb0a6880ea2 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_cj.txt @@ -180,7 +180,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt index 72e7563c927..f9152060bf2 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose.txt @@ -181,7 +181,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt index 99d01874d1f..b04236532bb 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt @@ -200,7 +200,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. 
-LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA BAX HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt index 21129853979..c7c518d5b68 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal.txt @@ -182,7 +182,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt index 2061f917084..cfa9c7968e1 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt @@ -186,7 +186,7 @@ LB11.2: SP WJ; LB11.3: WJ CM* [^CM]; # Needs to apply before LB12, because the new monkeys are not greedy. -LB20a.2: GL (HY | HH) CM* AL; +LB20a.2: GL CM* (HY | HH) CM* AL; LB12: GL CM* [^CM]; LB12a: [^SP BA HY] CM* GL; diff --git a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 1c7fe997569..781ce068be7 100644 --- a/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/core/src/test/resources/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -2214,3 +2214,7 @@ Bangkok)• •« Complex »« chaining » • •« .618 »• # Interaction with the ICU tailoring to break before such numbers. 
+# A hyphen following non-breaking space that carries an intervening combining +# mark is treated as word-initial; by LB20a it has no break opportunity after +# it. A bug in ICU 76 incorrectly handled that case (ICU-22986). +• ̄-k• \ No newline at end of file