ICU-2924 RBBI, line break rules, monkey test, better conformance to spec

X-SVN-Rev: 13394
This commit is contained in:
Andy Heninger 2003-10-11 00:44:36 +00:00
parent 8c1cec35ed
commit a3f8e5695e
6 changed files with 80 additions and 20 deletions

View file

@ -345,6 +345,26 @@ int32_t RBBISetBuilder::getNumCharCategories() {
//------------------------------------------------------------------------
//
// getFirstChar Given a runtime RBBI character category, find
// the first UChar32 that is in the set of chars
// in the category.
//------------------------------------------------------------------------
UChar32 RBBISetBuilder::getFirstChar(int32_t category) {
    // Scan the range list from its head (fRangeList) and stop at the first
    // range whose category number (fNum) matches the requested category.
    // The start character of that range is the answer.
    RangeDescriptor *range = fRangeList;
    while (range != 0) {
        if (range->fNum == category) {
            return range->fStartChar;
        }
        range = range->fNext;
    }

    // No range carries this category: report -1 as the "not found" value.
    return (UChar32)-1;
}
//------------------------------------------------------------------------
//
// printRanges A debugging function.

View file

@ -89,6 +89,7 @@ public:
void printSets();
void printRanges();
void printRangeGroups();
UChar32 getFirstChar(int32_t val);
private:
void numberSets();

View file

@ -317,7 +317,6 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) {
}
//-----------------------------------------------------------------------------
//
// calcChainedFollowPos. Modify the previously calculated followPos sets
@ -351,6 +350,8 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
//
int32_t endNodeIx;
int32_t startNodeIx;
UVector endingNodes(*fStatus);
for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
RBBINode *tNode = (RBBINode *)leafNodes.elementAt(endNodeIx);
RBBINode *endNode = NULL;
@ -369,6 +370,37 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) {
}
// We've got a node that can end a match.
// TODO: endingNodes.addElement(endNode, *fStatus);
// Line Break Specific hack. Does this end val correspond to the $CM char class?
// And is it part of a rule of this form: $XX $CM*
// If so, we want to chain to rules beginning with $XX, not with $CM.
// We still chain from the CM node, but the criteria for choosing
// the nodes to chain to is different.
// TODO: Add rule syntax for this behavior, get specifics out of here and
// into the rule file.
UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
U_ASSERT(c != -1);
RBBINode *parent = NULL;
RBBINode *grandParent = NULL;
ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
if (cLBProp != U_LB_COMBINING_MARK) {
goto neverMind;
}
parent = endNode->fParent;
if (parent->fType != RBBINode::opStar) {
goto neverMind;
}
grandParent = parent->fParent;
if (grandParent->fType != RBBINode::opCat || grandParent->fRightChild != parent) {
goto neverMind;
}
// TODO: grab nodes from grandParent->leftChild->endPos; add to endingNodes
neverMind:
// Now iterate over the nodes that can start a match, looking for ones
// with the same char class as our ending node.
RBBINode *startNode;

View file

@ -96,10 +96,13 @@ $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]];
# $SP $CM needs to behave like $ID.
# X $CM needs to behave like X, where X is not $SP.
# $CM not covered by the above needs to behave like $AL
[$LB5NonBreaks] $CM*; # Stick together any combining sequences that don't match other rules.
[$LB5NonBreaks] $CM+; # Stick together any combining sequences that don't match other rules.
# LB 8
$LB5NonBreaks [$CL $EX $IS $SY];
[$LB5NonBreaks] $CL;
[$LB5NonBreaks] $EX;
[$LB5NonBreaks] $IS;
[$LB5NonBreaks] $SY;
# LB 9
$OPcm $SP* .?;
@ -138,21 +141,22 @@ $LB14NonBreaks ($BAcm | $HYcm | $NScm);
$BBcm [^$CB];
# LB 16
($ALcm | $IDcm | $INcm | $NUcm) $INcm*;
($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm;
# LB 17
$IDcm $POcm;
($IDcm | SP CM+) $POcm;
$ALcm+ $NUcm; # includes $LB19
$NUcm $ALcm+;
# LB 18
$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?;
#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?;
# LB 19
#$CM* $ALcm+;
$ALcm+;
#
# Reverse Rules.
#

View file

@ -2535,16 +2535,16 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo
// contains the logic to locate Hangul syllables.
}
// LB 7b Keep combining sequences together.
if (hangultype == U_HST_NOT_APPLICABLE) {
// advance over any CM class chars
for (;;) {
*nextChar = fText->char32At(nPos);
if (!fCM->contains(*nextChar)) {
break;
}
nPos = fText->moveIndex32(nPos, 1);
// LB 7b Keep combining sequences together.
// advance over any CM class chars. (Line Break CM class is different from
// grapheme cluster CM, so we need to do this even for Hangul syllables.
// Line Break may eat additional stuff as combining, beyond what grapheme cluster did.)
for (;;) {
*nextChar = fText->char32At(nPos);
if (!fCM->contains(*nextChar)) {
break;
}
nPos = fText->moveIndex32(nPos, 1);
}
@ -2663,10 +2663,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
}
// LB 8 Don't break before closings.
if (fCL->contains(thisChar) ||
fEX->contains(thisChar) ||
fIS->contains(thisChar) ||
fSY->contains(thisChar)) {
// NU x CL and NU x IS are not matched here so that they will
// fall into LB 17 and the more general number regular expression.
//
if (!fNU->contains(prevChar) && fCL->contains(thisChar) ||
fEX->contains(thisChar) ||
!fNU->contains(prevChar) && fIS->contains(thisChar) ||
fSY->contains(thisChar)) {
continue;
}

View file

@ -24,7 +24,7 @@
# Temp debugging tests
<line>
<data>•a\u275d\u0085<100>\u0c56•</data>
<data><>\U0001d7f9\u003b\u2034<></data>
########################################################################################
#