From a3f8e5695e6eb5a521cbab9cc739810544c3d5af Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Sat, 11 Oct 2003 00:44:36 +0000 Subject: [PATCH] ICU-2924 RBBI, line break rules, monkey test, better conformance to spec X-SVN-Rev: 13394 --- icu4c/source/common/rbbisetb.cpp | 20 +++++++++++++++ icu4c/source/common/rbbisetb.h | 1 + icu4c/source/common/rbbitblb.cpp | 34 +++++++++++++++++++++++++- icu4c/source/data/brkitr/line.txt | 14 +++++++---- icu4c/source/test/intltest/rbbitst.cpp | 29 ++++++++++++---------- icu4c/source/test/testdata/rbbitst.txt | 2 +- 6 files changed, 80 insertions(+), 20 deletions(-) diff --git a/icu4c/source/common/rbbisetb.cpp b/icu4c/source/common/rbbisetb.cpp index 5f4933550b5..089f91a3c44 100644 --- a/icu4c/source/common/rbbisetb.cpp +++ b/icu4c/source/common/rbbisetb.cpp @@ -345,6 +345,26 @@ int32_t RBBISetBuilder::getNumCharCategories() { +//------------------------------------------------------------------------ +// +// getFirstChar Given a runtime RBBI character category, find +// the first UChar32 that is in the set of chars +// in the category. +//------------------------------------------------------------------------ +UChar32 RBBISetBuilder::getFirstChar(int32_t category) { + RangeDescriptor *rlRange; + UChar32 retVal = (UChar32)-1; + for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { + if (rlRange->fNum == category) { + retVal = rlRange->fStartChar; + break; + } + } + return retVal; +} + + + //------------------------------------------------------------------------ // // printRanges A debugging function. 
diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h index b4b80a6710b..62704fef417 100644 --- a/icu4c/source/common/rbbisetb.h +++ b/icu4c/source/common/rbbisetb.h @@ -89,6 +89,7 @@ public: void printSets(); void printRanges(); void printRangeGroups(); + UChar32 getFirstChar(int32_t val); private: void numberSets(); diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 26b3c751b54..9c3fc402668 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -317,7 +317,6 @@ void RBBITableBuilder::calcFollowPos(RBBINode *n) { } - //----------------------------------------------------------------------------- // // calcChainedFollowPos. Modify the previously calculated followPos sets @@ -351,6 +350,8 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *fTree) { // int32_t endNodeIx; int32_t startNodeIx; + UVector endingNodes(*fStatus); + for (endNodeIx=0; endNodeIxfRB->fSetBuilder->getFirstChar(endNode->fVal); + U_ASSERT(c != -1); + RBBINode *parent = NULL; + RBBINode *grandParent = NULL; + ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK); + if (cLBProp != U_LB_COMBINING_MARK) { + goto neverMind; + } + parent = endNode->fParent; + if (parent->fType != RBBINode::opStar) { + goto neverMind; + } + grandParent = parent->fParent; + if (grandParent->fType != RBBINode::opCat || grandParent->fRightChild != parent) { + goto neverMind; + } + + + // TODO: grab nodes from grandParent->leftChild->endPos; add to endingNodes + + +neverMind: // Now iterate over the nodes that can start a match, looking for ones // with the same char class as our ending node. RBBINode *startNode; diff --git a/icu4c/source/data/brkitr/line.txt b/icu4c/source/data/brkitr/line.txt index f7d4b49359c..5cec3e22581 100644 --- a/icu4c/source/data/brkitr/line.txt +++ b/icu4c/source/data/brkitr/line.txt @@ -96,10 +96,13 @@ $LB5NonBreaks = [[$LB3NonBreaks] - [$ZW]]; # $SP $CM needs to behave like $ID. 
# X $CM needs to behave like X, where X is not $SP. # $CM not covered by the above needs to behave like $AL -[$LB5NonBreaks] $CM*; # Stick together any combining sequences that don't match other rules. +[$LB5NonBreaks] $CM+; # Stick together any combining sequences that don't match other rules. # LB 8 -$LB5NonBreaks [$CL $EX $IS $SY]; +[$LB5NonBreaks] $CL; +[$LB5NonBreaks] $EX; +[$LB5NonBreaks] $IS; +[$LB5NonBreaks] $SY; # LB 9 $OPcm $SP* .?; @@ -138,21 +141,22 @@ $LB14NonBreaks ($BAcm | $HYcm | $NScm); $BBcm [^$CB]; # LB 16 -($ALcm | $IDcm | $INcm | $NUcm) $INcm*; +($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm; # $LB 17 -$IDcm $POcm; +($IDcm | SP CM+) $POcm; $ALcm+ $NUcm; # includes $LB19 $NUcm $ALcm+; # LB 18 $PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?; +#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?; # LB 19 +#$CM* $ALcm+; $ALcm+; - # # Reverse Rules. # diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index eff1938f317..31f6290e4a4 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -2535,16 +2535,16 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo // contains the logic to locate Hangul syllables. } - // LB 7b Keep combining sequences together. - if (hangultype == U_HST_NOT_APPLICABLE) { - // advance over any CM class chars - for (;;) { - *nextChar = fText->char32At(nPos); - if (!fCM->contains(*nextChar)) { - break; - } - nPos = fText->moveIndex32(nPos, 1); + // LB 7b Keep combining sequences together. + // advance over any CM class chars. (Line Break CM class is different from + // grapheme cluster CM, so we need to do this even for Hangul syllables. + // Line Break may eat additional stuff as combining, beyond what grapheme cluster did.) 
+ for (;;) { + *nextChar = fText->char32At(nPos); + if (!fCM->contains(*nextChar)) { + break; } + nPos = fText->moveIndex32(nPos, 1); } @@ -2663,10 +2663,13 @@ int32_t RBBILineMonkey::next(int32_t startPos) { } // LB 8 Don't break before closings. - if (fCL->contains(thisChar) || - fEX->contains(thisChar) || - fIS->contains(thisChar) || - fSY->contains(thisChar)) { + // NU x CL and NU x IS are not matched here so that they will + // fall into LB 17 and the more general number regular expression. + // + if (!fNU->contains(prevChar) && fCL->contains(thisChar) || + fEX->contains(thisChar) || + !fNU->contains(prevChar) && fIS->contains(thisChar) || + fSY->contains(thisChar)) { continue; } diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 714b8f7ab96..2b8381621e4 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -24,7 +24,7 @@ # Temp debugging tests -•a\u275d\u0085<100>\u0c56• +<>\U0001d7f9\u003b\u2034<> ######################################################################################## #