From 8feb899d7d316e2fd499d86b7e97794311ab3354 Mon Sep 17 00:00:00 2001 From: Syn Wee Quek Date: Tue, 11 Nov 2003 21:24:09 +0000 Subject: [PATCH] ICU-2292 line break rules updated, 15 mins testmonkey passes X-SVN-Rev: 13663 --- icu4c/source/common/rbbi.cpp | 6 +- icu4c/source/data/brkitr/line.txt | 43 ++++++++----- icu4c/source/test/intltest/rbbitst.cpp | 86 ++++++++++++++++++-------- 3 files changed, 90 insertions(+), 45 deletions(-) diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 0a211c37c58..3b7d41a78b0 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -730,8 +730,10 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { lookaheadStatus = 0; } else if (result == initialPosition) { // Ran off end, no match found. - // Treat as a break at the end of the input string. - result = fText->endIndex(); + // move forward one + fText->setIndex(initialPosition); + fText->next32(); + fText->getIndex(); } break; } diff --git a/icu4c/source/data/brkitr/line.txt b/icu4c/source/data/brkitr/line.txt index c53e9f32343..8c0cda320c7 100644 --- a/icu4c/source/data/brkitr/line.txt +++ b/icu4c/source/data/brkitr/line.txt @@ -44,6 +44,7 @@ $SA = [:LineBreak = Complex_Context:]; $SG = [:LineBreak = Surrogate:]; $SP = [:LineBreak = Space:]; $SY = [:LineBreak = Break_Symbols:]; +$WJ = [:LineBreak = Word_Joiner:]; $XX = [:LineBreak = Unknown:]; $ZW = [:LineBreak = ZWSpace:]; @@ -60,7 +61,6 @@ $LVT = [:Hangul_Syllable_Type = LVT:]; $HangulSyllable = $L+ | ($L* ($LV? $V+ | $LV | $LVT) $T*) | $T+; - # # Rule LB1. By default, treat AI (characters with ambiguous east Asian width), # SA (South East Asian: Thai, Lao, Khmer) @@ -91,6 +91,7 @@ $PRcm = $PR $CM*; $QUcm = $QU $CM*; $SPcm = $SP $CM*; $SYcm = $SY $CM*; +$WJcm = $WJ $CM*; # # Each class of character can stand by itself as an unbroken token, with trailing combining stuff @@ -114,6 +115,7 @@ $PR $CM+; $QU $CM+; $SP $CM+; $SY $CM+; +$WJ $CM+; ## ------------------------------------------------- @@ -131,12 +133,19 @@ $CR $LF {100}; # LB 4 x SP # x ZW -$LB3NonBreaks [$SP $ZW]; +$ZW [$SP $ZW]; $LB5NonBreaks $CM* [$SP $ZW]; # LB 5 Break after zero width space $LB5Breaks = [$LB3Breaks $ZW]; +# LB 6 +# +# Korean Syllable Definitions +# + +($HangulSyllable) $CM*; + # LB 7 Combining marks. TODO: get it right! # $SP $CM needs to behave like $ID. # X $CM needs to behave like X, where X is not $SP. @@ -163,10 +172,8 @@ $CLcm $SP* $NScm; ($B2cm)+; # LB 11b -$LB5NonBreaks $CM* $GLcm .?; -$LB5NonBreaks $CM* $GLcm $LB5NonBreaks $CM*; -$GLcm $LB3NonBreaks?; -$GLcm $LB5NonBreaks $CM*; +$LB5NonBreaks $CM* ($GLcm | $WJcm); +($GLcm | $WJcm) .?; # LB 12 $LB12NonBreaks = [$LB5NonBreaks - $SP]; @@ -184,14 +191,12 @@ $QUcm $LB5NonBreaks $CM*; # Don't let a combining mark go onto $CR, $BK, etc. $LB14NonBreaks = [$LB12NonBreaks - $CB]; $LB14CanBreakAfter = $LB14NonBreaks $CM* | $SP $CM+; - # LB 15 $LB14CanBreakAfter ($BAcm | $HYcm | $NScm); $BBcm [^$CB]; $BBcm [^$CB $CR $LF $BK $NL $ZW] $CM*; # LB 16 -#($ALcm | $IDcm | $SP $CM+ | $INcm | $NUcm) $INcm; $ALcm $INcm; $CM+ $INcm; # by rule 7c, any otherwise unattached CM behaves as AL $IDcm $INcm; @@ -206,11 +211,8 @@ $ALcm+ $NUcm; # includes $LB19 $CM+ $NUcm; # Rule 7c $NUcm $ALcm+; - - # LB 18 $PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm? $POcm?; -#$PRcm? ($OPcm | $HYcm)? $NUcm ($NUcm | $IScm)* $CLcm?; # LB 19 $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL @@ -226,7 +228,6 @@ $CM* $ALcm+; # The $CM* is from rule 7C, and unattached CM is treated as AL !!reverse; - $CM+ $ALPlus; $CM+ $BA; $CM+ $BB; @@ -246,6 +247,7 @@ $CM+ $PR; $CM+ $QU; $CM+ $SP; $CM+ $SY; +$CM+ $WJ; # LB 3 @@ -262,6 +264,9 @@ $LF $CR; # LB 6 Jamo is treated like an alphabet +$BackHangulSyllable = $L+ | ($T* ($V+$LV? | $LV | $LVT) $L*) | $T+; +$CM* $BackHangulSyllable; + # LB 7 Combining marks. # $SP $CM needs to behave like $ID. # X $CM needs to behave like X, where X is not $SP. @@ -288,9 +293,9 @@ $CM* $NS $SP* $CM* $CL; ($CM* $B2)+; # LB 11b -$CM* $GL $CM* $LB5NonBreaks; -$CM* $LB5NonBreaks $CM* $GL; -$LB3NonBreaks $CM* $GL; +$CM* ($GL | $WJ) $CM* $LB5NonBreaks; +$CM* $LB5NonBreaks $CM* ($GL | $WJ); +. $CM* ($GL | $WJ); # LB 12 @@ -340,6 +345,9 @@ $CM* $ALPlus $CM+ / $LB5Breaks; !!safe_reverse; +# LB 6 +$V+ $L; + # LB 7 $CM+ [^$CM $BK $CR $LF $NL $ZW $SP]; $CM+ $SP / .; @@ -354,13 +362,16 @@ $SP+ $CM* $QU; $SP+ $CM* $CL; # LB 18 -$IS+ $CM* $NU; +($CM* $IS)+ $CM* $NU; $CL $CM* ($NU | $IS); ## ------------------------------------------------- !!safe_forward; +# LB 6 +$V+ $T; + # LB 7 [^$BK $CR $LF $NL $ZW $SP] $CM+; $SP $CM+ / [^$CM]; diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 51271ed649a..9b2212e6851 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -595,11 +595,11 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha if(exec) TestExtended(); break; case 17: name = "TestMonkey"; if(exec) { -#if !UCONFIG_NO_REGULAR_EXPRESSIONS + #if !UCONFIG_NO_REGULAR_EXPRESSIONS TestMonkey(params); -#else + #else logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)"); -#endif + #endif } break; default: name = ""; break; //needed to end loop @@ -2295,7 +2295,6 @@ int32_t RBBIWordMonkey::next(int32_t prevPos) { break; } - // Rule (5). ALetter x ALetter if (fALetterSet->contains(c1) && fALetterSet->contains(c2)) { @@ -2494,6 +2493,7 @@ RBBILineMonkey::RBBILineMonkey() fAL = new UnicodeSet("[\\p{Line_break=AL}]", status); fID = new UnicodeSet("[\\p{Line_break=ID}]", status); fSA = new UnicodeSet("[\\p{Line_break=SA}]", status); + fWJ = new UnicodeSet("[\\p{Line_break=WJ}]", status); fXX = new UnicodeSet("[\\p{Line_break=XX}]", status); fAL->addAll(*fXX); // Default behavior for XX is identical to AL @@ -2530,6 +2530,7 @@ RBBILineMonkey::RBBILineMonkey() fSets->addElement(fAI, status); fSets->addElement(fAL, status); fSets->addElement(fID, status); + fSets->addElement(fWJ, status); fSets->addElement(fSA, status); // fSets->addElement(fXX, status); @@ -2600,7 +2601,8 @@ void RBBILineMonkey::rule67Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPo // advance over any CM class chars. (Line Break CM class is different from // grapheme cluster CM, so we need to do this even for HangulSyllables. // Line Break may eat additional stuff as combining, beyond what graphem cluster did. - if (!(fBK->contains(*posChar) || *posChar==0x0a || *posChar==0x0d || *posChar==0x85)) { + if (!(fBK->contains(*posChar) || fZW->contains(*posChar) || *posChar==0x0a + || *posChar==0x0d || *posChar==0x85)) { for (;;) { *nextChar = fText->char32At(nPos); if (!fCM->contains(*nextChar)) { @@ -2791,11 +2793,21 @@ int32_t RBBILineMonkey::next(int32_t startPos) { // LB 9 Don't break after OP SP* /// UBool cmFlag = FALSE; for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) { + while (tPos > 0 && fCM->contains(fText->char32At(tPos))) { + tPos=fText->moveIndex32(tPos, -1); + } if (fOP->contains(fText->char32At(tPos))) { break; } - if (fSP->contains(prevChar) == FALSE - || fSP->contains(fText->char32At(tPos)) == FALSE + if (fSP->contains(fText->char32At(tPos)) == TRUE) { + int32_t temp = fText->moveIndex32(tPos, 1); + if (fCM->contains(fText->char32At(temp))) { + // if we have $SP$CM+ which is an $ID + goto fall_through_9; + } + } + // fSP->contains(prevChar) == FALSE || + if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) { /// || cmFlag == TRUE) { // if we have $SP$CM+ which is an $ID @@ -2834,6 +2846,9 @@ fall_through_9: if (fGL->contains(thisChar) || fGL->contains(prevChar)) { continue; } + if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) { + continue; + } // LB 12 break after space if (fSP->contains(prevChar)) { @@ -2896,6 +2911,10 @@ fall_through_9: nextPos = numEndIdx; pos = fCharBI->preceding(numEndIdx); thisChar = fText->char32At(pos); + while (fCM->contains(thisChar)) { + pos = fCharBI->preceding(pos); + thisChar = fText->char32At(pos); + } } continue; } @@ -3010,7 +3029,7 @@ static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr, { int count = 0; int i = 0; - int forward[20]; + int forward[50]; bi->setText(ustr); for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) { forward[count] = i; @@ -3078,9 +3097,14 @@ void RBBITest::TestWordBreaks(void) UErrorCode status = U_ZERO_ERROR; // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); BreakIterator *bi = BreakIterator::createWordInstance(locale, status); - UChar str[25]; + UChar str[300]; char *strlist[] = { + "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d", + "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b", + "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u179c\\u0027\\U000e0061\\u003a", + "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622", + "\\u90ca\\u3588\\u009c\\u0953\\u194b", "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e", "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e", @@ -3124,7 +3148,7 @@ void RBBITest::TestWordBreaks(void) // RBBICharMonkey monkey; RBBIWordMonkey monkey; - int expected[20]; + int expected[50]; int expectedcount = 0; monkey.setText(ustr); @@ -3144,7 +3168,7 @@ void RBBITest::TestWordBoundary(void) UErrorCode status = U_ZERO_ERROR; // BreakIterator *bi = BreakIterator::createCharacterInstance(locale, status); BreakIterator *bi = BreakIterator::createWordInstance(locale, status); - UChar str[20]; + UChar str[50]; char *strlist[] = { "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e", @@ -3182,7 +3206,7 @@ void RBBITest::TestWordBoundary(void) // printf("looping %d\n", loop); u_unescape(strlist[loop], str, 20); UnicodeString ustr(str); - int forward[20]; + int forward[50]; int count = 0; bi->setText(ustr); @@ -3217,9 +3241,21 @@ void RBBITest::TestLineBreaks(void) Locale locale("en"); UErrorCode status = U_ZERO_ERROR; BreakIterator *bi = BreakIterator::createLineInstance(locale, status); - UChar str[20]; + UChar str[50]; char *strlist[] = { + "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", + "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", + "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123", + "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060", + "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5", + "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f", + "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1", + "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5", + "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0", + "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc", + "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f", + "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f", "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b", "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085", "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac", @@ -3235,7 +3271,6 @@ void RBBITest::TestLineBreaks(void) "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b", "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0", "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025", - "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4", "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d", "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111", "\\u2014\\u0020\\u000a\\u17c5\\u24fc", @@ -3253,7 +3288,7 @@ void RBBITest::TestLineBreaks(void) UnicodeString ustr(str); RBBILineMonkey monkey; - int expected[20]; + int expected[50]; int expectedcount = 0; monkey.setText(ustr); @@ -3386,6 +3421,8 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint UnicodeString testText; int32_t numCharClasses; UVector *chClasses; + int expected[TESTSTRINGLEN*2 + 1]; + int expectedCount = 0; char expectedBreaks[TESTSTRINGLEN*2 + 1]; char forwardBreaks[TESTSTRINGLEN*2 + 1]; char reverseBreaks[TESTSTRINGLEN*2+1]; @@ -3443,6 +3480,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint memset(expectedBreaks, 0, sizeof(expectedBreaks)); expectedBreaks[0] = 1; int32_t breakPos = 0; + expectedCount = 0; for (;;) { breakPos = mk.next(breakPos); if (breakPos == -1) { @@ -3452,6 +3490,7 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint errln("breakPos > testText.length()"); } expectedBreaks[breakPos] = 1; + expected[expectedCount ++] = breakPos; } // Find the break positions using forward iteration @@ -3528,20 +3567,13 @@ void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, char *name, uint // Format looks like "<>\uabcd\uabcd<>\U0001abcd..." UnicodeString errorText = ""; - /*** if (strcmp(errorType, "next()") == 0) { + /***if (strcmp(errorType, "next()") == 0) { startContext = 0; - int j = i; - while (true) { - if (forwardBreaks[j ++] != 0) { - printf("%d\n", j); - break; - } - if (j % 100 == 0) { - printf("continue %d\n", j); - } - } - endContext = j + 1; + endContext = testText.length(); + + printStringBreaks(testText, expected, expectedCount); }***/ + for (ci=startContext; ci