mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-22707 UTC-179-C28 LB19 change for simplified chinese
This commit is contained in:
parent
1513b66c32
commit
84ff5dacf8
2 changed files with 110 additions and 9 deletions
|
@ -251,6 +251,7 @@ $SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
|||
|
||||
#
|
||||
# LB 15d Do not break before numeric separators (IS), even after spaces.
|
||||
# SP IS QU is handled below as part of LB 19.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
|
@ -273,13 +274,34 @@ $LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
|||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
# LB 19 and LB 19a.
|
||||
# Instead of implementing both as keep-together rules as in UAX #14, we have an
|
||||
# East_Asian_Width and General_Category-insensitive keep-together rule
|
||||
# equivalent to the old LB19 × QU and QU ×, and then we poke holes into it based
|
||||
# on context. This avoids having to do manual chaining over multiple characters
|
||||
# with many other rules over multiple characters, as a keep-together LB19a would
|
||||
# overlap in context with at least LB14, LB15a, LB15b, LB15d, LB30a, and itself.
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}] - [$OP $GL $BA]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
|
||||
$QU $CM* .;
|
||||
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
[$LB18NonBreaks & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
^[$CM & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [\p{Pf} & $QU] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [$QU & \p{Pi}] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [$QU & \p{Pi}] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [$QU & \p{Pf}] / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
$SP [$IS & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* [$QU & \p{Pf}] $CM* $CMX / [ [\p{ea=F}\p{ea=W}\p{ea=H}] - [$NS $BA $EX $CL $IN $IS $GL $CM]];
|
||||
|
||||
^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
^$CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
|
@ -287,6 +309,9 @@ $QU $CM* .;
|
|||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
[$LB20NonBreaks - $HL] $CM* [$BA & [\p{ea=F}\p{ea=W}\p{ea=H}]] $CM* $CMX / [\p{Pi} & $QU] $CM* [ [\p{ea=F}\p{ea=W}\p{ea=H}] - $CM];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
|
|
|
@ -2706,6 +2706,7 @@ private:
|
|||
UnicodeSet *fVI;
|
||||
UnicodeSet *fPi;
|
||||
UnicodeSet *fPf;
|
||||
UnicodeSet *feaFWH;
|
||||
|
||||
BreakIterator *fCharBI;
|
||||
const UnicodeString *fText;
|
||||
|
@ -2783,6 +2784,8 @@ RBBILineMonkey::RBBILineMonkey() :
|
|||
fPi = new UnicodeSet(uR"([\p{Pi}])", status);
|
||||
fPf = new UnicodeSet(uR"([\p{Pf}])", status);
|
||||
|
||||
feaFWH = new UnicodeSet(uR"([\p{ea=F}\p{ea=W}\p{ea=H}])", status);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
deferredStatus = status;
|
||||
return;
|
||||
|
@ -2900,9 +2903,23 @@ void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos
|
|||
// LB 9 Treat X CM* as if it were x.
|
||||
// No explicit action required.
|
||||
|
||||
// LB 10 Treat any remaining combining mark as AL
|
||||
// LB 10 Treat any remaining combining mark as AL, but preserve its East
|
||||
// Asian Width.
|
||||
if (fCM->contains(*posChar)) {
|
||||
*posChar = u'A';
|
||||
switch (u_getIntPropertyValue(*posChar, UCHAR_EAST_ASIAN_WIDTH)) {
|
||||
case U_EA_WIDE:
|
||||
*posChar = u'♈';
|
||||
break;
|
||||
case U_EA_NEUTRAL:
|
||||
*posChar = u'ᴬ';
|
||||
break;
|
||||
case U_EA_AMBIGUOUS:
|
||||
*posChar = u'Ⓐ';
|
||||
break;
|
||||
default:
|
||||
puts("Unexpected ea value for lb=CM");
|
||||
std::terminate();
|
||||
}
|
||||
}
|
||||
|
||||
// Push the updated nextPos and nextChar back to our caller.
|
||||
|
@ -3214,12 +3231,70 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
break;
|
||||
}
|
||||
|
||||
// x QU
|
||||
// QU x
|
||||
if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
|
||||
setAppliedRule(pos, "LB 19");
|
||||
// LB 19
|
||||
// × [QU-\p{Pi}]
|
||||
if (fQU->contains(thisChar) && !fPi->contains(thisChar)) {
|
||||
setAppliedRule(pos, "LB 19 × [QU-\\p{Pi}]");
|
||||
continue;
|
||||
}
|
||||
// [^\p{ea=F}\p{ea=W}\p{ea=H}] × [\p{Pi}&QU]
|
||||
if (!feaFWH->contains(prevChar) && fPi->contains(thisChar) && fQU->contains(thisChar)) {
|
||||
setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] × [\\p{Pi}&QU]");
|
||||
continue;
|
||||
}
|
||||
// × [\p{Pi}&QU] ( [^\p{ea=F}\p{ea=W}\p{ea=H}] | eot )
|
||||
if (fPi->contains(thisChar) && fQU->contains(thisChar)) {
|
||||
if (nextPos < fText->length()) {
|
||||
UChar32 nextChar = fText->char32At(nextPos);
|
||||
if (!feaFWH->contains(nextChar)) {
|
||||
setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]");
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
setAppliedRule(pos, "LB 19 × [\\p{Pi}&QU] eot");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// [QU-\p{Pf}] ×
|
||||
if (fQU->contains(prevChar) && !fPf->contains(prevChar)) {
|
||||
setAppliedRule(pos, "LB 19 [QU-\\p{Pf}] ×");
|
||||
continue;
|
||||
}
|
||||
// [\p{Pf}&QU] × [^\p{ea=F}\p{ea=W}\p{ea=H}]
|
||||
if (fPf->contains(prevChar) && fQU->contains(prevChar) && !feaFWH->contains(thisChar)) {
|
||||
setAppliedRule(pos, "LB 19 [\\p{Pf}&QU] × [^\\p{ea=F}\\p{ea=W}\\p{ea=H}]");
|
||||
continue;
|
||||
}
|
||||
// ( sot | [^\p{ea=F}\p{ea=W}\p{ea=H}] ) [\p{Pf}&QU] ×
|
||||
if (fPf->contains(prevChar) && fQU->contains(prevChar)) {
|
||||
if (prevPos == 0) {
|
||||
setAppliedRule(pos, "LB 19 sot [\\p{Pf}&QU] ×");
|
||||
continue;
|
||||
}
|
||||
// prevPosX2 is -1 if there was a break, and prevCharX2 is 0; but the UAX #14 rules can
|
||||
// look through breaks.
|
||||
int breakObliviousPrevPosX2 = fText->moveIndex32(prevPos, -1);
|
||||
while (fCM->contains(fText->char32At(breakObliviousPrevPosX2))) {
|
||||
if (breakObliviousPrevPosX2 == 0) {
|
||||
break;
|
||||
}
|
||||
int beforeCM = fText->moveIndex32(breakObliviousPrevPosX2, -1);
|
||||
if (fBK->contains(fText->char32At(beforeCM)) ||
|
||||
fCR->contains(fText->char32At(beforeCM)) ||
|
||||
fLF->contains(fText->char32At(beforeCM)) ||
|
||||
fNL->contains(fText->char32At(beforeCM)) ||
|
||||
fSP->contains(fText->char32At(beforeCM)) ||
|
||||
fZW->contains(fText->char32At(beforeCM))) {
|
||||
break;
|
||||
}
|
||||
breakObliviousPrevPosX2 = beforeCM;
|
||||
}
|
||||
if (!feaFWH->contains(fText->char32At(breakObliviousPrevPosX2))) {
|
||||
setAppliedRule(pos, "LB 19 [^\\p{ea=F}\\p{ea=W}\\p{ea=H}] [\\p{Pf}&QU] ×");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
|
||||
setAppliedRule(pos, "LB 20 Break around a CB");
|
||||
|
@ -3598,6 +3673,7 @@ RBBILineMonkey::~RBBILineMonkey() {
|
|||
delete fVI;
|
||||
delete fPi;
|
||||
delete fPf;
|
||||
delete feaFWH;
|
||||
|
||||
delete fCharBI;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue