mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-2924 RBBI, line break rules, monkey test, better conformance to spec
X-SVN-Rev: 13394
This commit is contained in:
parent
8c1cec35ed
commit
a3f8e5695e
6 changed files with 80 additions and 20 deletions
|
@ -345,6 +345,26 @@ int32_t RBBISetBuilder::getNumCharCategories() {
|
|||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// getFirstChar Given a runtime RBBI character category, find
|
||||
// the first UChar32 that is in the set of chars
|
||||
// in the category.
|
||||
//------------------------------------------------------------------------
|
||||
UChar32 RBBISetBuilder::getFirstChar(int32_t category) {
    // Advance through the range list until we either run off the end
    // or reach the first range whose number equals the requested category.
    RangeDescriptor *range = fRangeList;
    while (range != 0 && range->fNum != category) {
        range = range->fNext;
    }

    // No range carries this category: report -1.
    if (range == 0) {
        return (UChar32)-1;
    }

    // The start of the first matching range is the answer.
    return range->fStartChar;
}
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// printRanges A debugging function.
|
||||
|
|
|
@ -89,6 +89,7 @@ public:
|
|||
void printSets();
|
||||
void printRanges();
|
||||
void printRangeGroups();
|
||||
UChar32 getFirstChar(int32_t val);
|
||||
|
||||
private:
|
||||
void numberSets();
|
||||
|
|
|
@ -317,7 +317,6 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// calcChainedFollowPos. Modify the previously calculated followPos sets
|
||||
|
@ -351,6 +350,8 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
|
|||
//
|
||||
int32_t endNodeIx;
|
||||
int32_t startNodeIx;
|
||||
UVector endingNodes(*fStatus);
|
||||
|
||||
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
|
||||
RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
|
||||
RBBINode *endNode = NULL;
|
||||
|
@ -369,6 +370,37 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
|
|||
}
|
||||
|
||||
// We've got a node that can end a match.
|
||||
// TODO: endingNodes.addElement(endNode, *fStatus);
|
||||
|
||||
// Line Break Specific hack. Does this end val correspond to the $CM char class?
|
||||
// And is it part of a rule of this form: $XX $CM*
|
||||
// If so, we want to chain to rules beginning with $XX, not with $CM.
|
||||
// We still chain from the CM node, but the criteria for choosing
|
||||
// the nodes to chain to is different.
|
||||
// TODO: Add rule syntax for this behavior, get specifics out of here and
|
||||
// into the rule file.
|
||||
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
|
||||
U_ASSERT(c != -1);
|
||||
RBBINode *parent = NULL;
|
||||
RBBINode *grandParent = NULL;
|
||||
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
|
||||
if (cLBProp != U_LB_COMBINING_MARK) {
|
||||
goto neverMind;
|
||||
}
|
||||
parent = endNode->fParent;
|
||||
if (parent->fType != RBBINode::opStar) {
|
||||
goto neverMind;
|
||||
}
|
||||
grandParent = parent->fParent;
|
||||
if (grandParent->fType != RBBINode::opCat || grandParent->fRightChild != parent) {
|
||||
goto neverMind;
|
||||
}
|
||||
|
||||
|
||||
// TODO: grab nodes from grandParent->leftChild->endPos; add to endingNodes
|
||||
|
||||
|
||||
neverMind:
|
||||
// Now iterate over the nodes that can start a match, looking for ones
|
||||
// with the same char class as our ending node.
|
||||
RBBINode *startNode;
|
||||
|
|
|
@ -96,10 +96,13 @@ $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
|
|||
# $SP $CM needs to behave like $ID.
|
||||
# X $CM needs to behave like X, where X is not $SP.
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
[$LB5NonBreaks] $CM*; # Stick together any combining sequences that don't match other rules.
|
||||
[$LB5NonBreaks] $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
|
||||
# LB 8
|
||||
$LB5NonBreaks [$CL $EX $IS $SY];
|
||||
[$LB5NonBreaks] $CL;
|
||||
[$LB5NonBreaks] $EX;
|
||||
[$LB5NonBreaks] $IS;
|
||||
[$LB5NonBreaks] $SY;
|
||||
|
||||
# LB 9
|
||||
$OPcm $SP* .?;
|
||||
|
@ -138,21 +141,22 @@ $LB14NonBreaks ($BAcm | $HYcm | $NScm);
|
|||
$BBcm [^$CB];
|
||||
|
||||
# LB 16
|
||||
($ALcm | $IDcm | $INcm | $NUcm) $INcm*;
|
||||
($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
|
||||
|
||||
# LB 17
|
||||
$IDcm $POcm;
|
||||
($IDcm | $SP $CM+) $POcm;   # $SP $CM+ is treated like $ID; the class names need the '$' sigil or they are read as literal letters.
|
||||
$ALcm+ $NUcm; # includes $LB19
|
||||
$NUcm $ALcm+;
|
||||
|
||||
|
||||
# LB 18
|
||||
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
|
||||
#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?;
|
||||
|
||||
# LB 19
|
||||
#$CM* $ALcm+;
|
||||
$ALcm+;
|
||||
|
||||
|
||||
#
|
||||
# Reverse Rules.
|
||||
#
|
||||
|
|
|
@ -2535,16 +2535,16 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
|
|||
// contains the logic to locate Hangul syllables.
|
||||
}
|
||||
|
||||
// LB 7b Keep combining sequences together.
|
||||
if (hangultype == U_HST_NOT_APPLICABLE) {
|
||||
// advance over any CM class chars
|
||||
for (;;) {
|
||||
*nextChar = fText->char32At(nPos);
|
||||
if (!fCM->contains(*nextChar)) {
|
||||
break;
|
||||
}
|
||||
nPos = fText->moveIndex32(nPos, 1);
|
||||
// LB 7b Keep combining sequences together.
|
||||
// advance over any CM class chars. (Line Break CM class is different from
|
||||
// grapheme cluster CM, so we need to do this even for HangulSyllables.
|
||||
// Line Break may eat additional stuff as combining, beyond what grapheme cluster did.)
|
||||
for (;;) {
|
||||
*nextChar = fText->char32At(nPos);
|
||||
if (!fCM->contains(*nextChar)) {
|
||||
break;
|
||||
}
|
||||
nPos = fText->moveIndex32(nPos, 1);
|
||||
}
|
||||
|
||||
|
||||
|
@ -2663,10 +2663,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
|
|||
}
|
||||
|
||||
// LB 8 Don't break before closings.
|
||||
if (fCL->contains(thisChar) ||
|
||||
fEX->contains(thisChar) ||
|
||||
fIS->contains(thisChar) ||
|
||||
fSY->contains(thisChar)) {
|
||||
// NU x CL and NU x IS are not matched here so that they will
|
||||
// fall into LB 17 and the more general number regular expression.
|
||||
//
|
||||
if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
|
||||
fEX->contains(thisChar) ||
|
||||
!fNU->contains(prevChar) && fIS->contains(thisChar) ||
|
||||
fSY->contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
2
icu4c/source/test/testdata/rbbitst.txt
vendored
2
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -24,7 +24,7 @@
|
|||
|
||||
# Temp debugging tests
|
||||
<line>
|
||||
<data>•a\u275d\u0085<100>\u0c56•</data>
|
||||
<data><>\U0001d7f9\u003b\u2034<></data>
|
||||
|
||||
########################################################################################
|
||||
#
|
||||
|
|
Loading…
Add table
Reference in a new issue