ICU-2292 line breaks passing on default option

X-SVN-Rev: 13636
2025-04-10 07:39:16 +00:00 · 2003-11-07 22:49:38 +00:00 · 2003-11-07 22:49:38 +00:00 · 558442a420
commit 558442a420
parent 062c626e85
7 changed files with 154 additions and 85 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -405,6 +405,10 @@ int32_t RuleBasedBreakIterator::previous(void) {
        return BreakIterator::DONE;
    }

+    if (fData->fSafeRevTable != NULL) {
+        return handleNewPrevious();
+    }
+
    // old rule syntax
    // set things up.  handlePrevious() will back us up to some valid
    // break position before the current position (we back our internal
@ -415,8 +419,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
    int32_t start = current();

    fText->previous32();
-    int32_t lastResult    = (fData->fSafeRevTable != NULL) ? 
-                            handleNewPrevious(): handlePrevious();
+    int32_t lastResult    = handlePrevious();
    int32_t result        = lastResult;
    int32_t lastTag       = 0;
    UBool   breakTagValid = FALSE;
@ -450,9 +453,6 @@ int32_t RuleBasedBreakIterator::previous(void) {
    return lastResult;
 }

-
-
-
 /**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
@ -954,6 +954,10 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
        // if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
        if (hasPassedStartText) { 
            // if we have already considered the start of the text
+            if (fData->fLookAheadHardBreak == TRUE 
+                && row->fLookAhead != 0) {
+                result = 0;
+            }
            break;
        }

@ -1007,6 +1011,17 @@ int32_t RuleBasedBreakIterator::handleNewPrevious(void) {
                /// we need to make the lookahead rules not chain eventually.
                /// return result;
                /// this is going to be the longest match again
+
+                /// syn wee todo hard coded for line breaks stuff
+                /// needs to provide a tag in rules to ensure a stop.
+
+                if (fData->fLookAheadHardBreak == TRUE) {
+                    fText->setIndex(result);
+                    return result;
+                }
+                category = lastCategory;
+                fText->setIndex(result);
+              
                goto continueOn;
            }

--- a/icu4c/source/common/rbbidata.cpp
+++ b/icu4c/source/common/rbbidata.cpp
@ -107,6 +107,15 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {

    fRefCount = 1;

+    /// todo: maybe add this formally to the builder
+    UnicodeString hardbreak("!!lookAheadHardBreak");
+    if (fRuleString.indexOf(hardbreak) >= 0) {
+        fLookAheadHardBreak = TRUE;
+    }
+    else {
+        fLookAheadHardBreak = FALSE;
+    }
+
 #ifdef RBBI_DEBUG
    char *debugEnv = getenv("U_RBBIDEBUG");
    if (debugEnv && uprv_strstr(debugEnv, "data")) {this->printData();}
--- a/icu4c/source/common/rbbidata.h
+++ b/icu4c/source/common/rbbidata.h
@ -127,6 +127,14 @@ public:
    const UChar              *fRuleSource;

    UTrie               fTrie;
+    // if fLookAheadHardBreak is true, we will break at the first lookahead match
+    // the search does not go on further to look for a longer match
+    // this also allows breaks at both ends of the string
+    // e.g. rule "ABC / D; ABCDE" and 
+    // text "ABCD ABCDE ABC" will give breaks at
+    //       01234567890123
+    // {0, 3, 4, 5, 8, 9, 10, 11, 14}
+    UBool               fLookAheadHardBreak;

 private:
    int32_t             fRefCount;
--- a/icu4c/source/common/rbbiscan.cpp
+++ b/icu4c/source/common/rbbiscan.cpp
@ -480,6 +480,10 @@ UBool RBBIRuleScanner::doParseActions(EParseAction action)
                fRB->fDefaultTree = &fRB->fSafeFwdTree;
            } else if (opt == "safe_reverse") {
                fRB->fDefaultTree = &fRB->fSafeRevTree;
+            } else if (opt == "lookAheadHardBreak") {
+                // at the moment do nothing for this
+                // the code is handled in rbbi.cpp
+                // todo: think about how to handle this
            } else {
                error(U_BRK_UNRECOGNIZED_OPTION);
            }
--- a/icu4c/source/data/brkitr/line.txt
+++ b/icu4c/source/data/brkitr/line.txt
@ -14,6 +14,7 @@

 !!chain; 
 !!LBCMNoChain;
+!!lookAheadHardBreak;

 $AI = [:LineBreak =  Ambiguous:];
 $AL = [:LineBreak =  Alphabetic:];
@ -114,6 +115,9 @@ $QU $CM+;
 $SP $CM+;
 $SY $CM+;

+## -------------------------------------------------
+
+!!forward;

 #  
 #  Rule LB 3
@ -217,111 +221,122 @@ $CM* $ALcm+;    # The $CM* is from rule 7C, and unattached CM is treated as AL
 #            at the current position.
 #

-!!reverse;
-# !. . [^$LF $CR $NL $BK]* [$BK $CR $LF $NL];
+## -------------------------------------------------

-! $CM+ $ALPlus;
-! $CM+ $BA;
-! $CM+ $BB;
-! $CM+ $B2;
-! $CM+ $CL;
-! $CM+ $EX;
-! $CM+ $GL;
-! $CM+ $HY;
-! $CM+ $ID;
-! $CM+ $IN;
-! $CM+ $IS;
-! $CM+ $NS;
-! $CM+ $NU;
-! $CM+ $OP;
-! $CM+ $PO;
-! $CM+ $PR;
-! $CM+ $QU;
-! $CM+ $SP;
-! $CM+ $SY;
+!!reverse;
+
+
+$CM+ $ALPlus;
+$CM+ $BA;
+$CM+ $BB;
+$CM+ $B2;
+$CM+ $CL;
+$CM+ $EX;
+$CM+ $GL;
+$CM+ $HY;
+$CM+ $ID;
+$CM+ $IN;
+$CM+ $IS;
+$CM+ $NS;
+$CM+ $NU;
+$CM+ $OP;
+$CM+ $PO;
+$CM+ $PR;
+$CM+ $QU;
+$CM+ $SP;
+$CM+ $SY;

 # LB 3

-! ($BK | $CR | $LF | $NL) $LB3NonBreaks?;   
-! ($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
-! $LF $CR;
+($BK | $CR | $LF | $NL) $LB3NonBreaks;   
+($BK | $CR | $LF | $NL) $CM* $LB5NonBreaks;
+$LF $CR;

 # LB 4         x SP
 #              x ZW
-! [$SP $ZW] $LB3NonBreaks;
-! [$SP $ZW] $CM* $LB5NonBreaks;
+[$SP $ZW] $LB3NonBreaks;
+[$SP $ZW] $CM* $LB5NonBreaks;

-# LB 5         Break after zero width space
+# LB 5 Break after zero width space

-# LB 7     Combining marks.  TODO:  get it right!
-#                                   $SP $CM needs to behave like $ID.
-#                                   X   $CM needs to behave like X, where X is not $SP.
-#                                   $CM not covered by the above needs to behave like $AL
-! $CM+ $LB5NonBreaks;    #  Stick together any combining sequences that don't match other rules.
+# LB 6 Jamo is treated like an alphabet
+
+# LB 7 Combining marks. 
+#    $SP $CM needs to behave like $ID.
+#    X   $CM needs to behave like X, where X is not $SP.
+#    $CM not covered by the above needs to behave like $AL
+# Stick together any combining sequences that don't match other rules.
+$CM+ $LB5NonBreaks;    

 # LB 8     
-! $CL $CM* $LB5NonBreaks;
-! $EX $CM* $LB5NonBreaks;
-! $IS $CM* $LB5NonBreaks;
-! $SY $CM* $LB5NonBreaks;
+$CL $CM* $LB5NonBreaks;
+$EX $CM* $LB5NonBreaks;
+$IS $CM* $LB5NonBreaks;
+$SY $CM* $LB5NonBreaks;

 # LB 9
-! .? $SP* $CM* $OP;         
-! $CM* $LB5NonBreaks $SP* $CM* $OP;
+$LB5NonBreaks $SP* $CM* $OP;

 # LB 10
-! $CM* $OP $SP* $CM* $QU;
+$CM* $OP $SP* $CM* $QU;

 # LB 11
-! $CM* $NS $SP* $CM* $CL;
+$CM* $NS $SP* $CM* $CL;

 # LB 11a
-! ($CM* $B2)+;
+($CM* $B2)+;

 # LB 11b
-! .? $CM* $GL $CM* $LB5NonBreaks;
-! $CM* $LB5NonBreaks $CM* $GL $CM* $LB5NonBreaks;
-! $LB3NonBreaks? $CM* $GL;
-! $CM* $LB5NonBreaks $CM* $GL;
+$CM* $GL $CM* $LB5NonBreaks;
+$CM* $LB5NonBreaks $CM* $GL;
+$LB3NonBreaks $CM* $GL;

 # LB 12

 # LB 14
-! .? ($CM* $QU)+ $CM* $LB12NonBreaks;
-! $CM* $LB5NonBreaks ($CM* $QU)+ $CM* $LB12NonBreaks;
-! .? ($CM* $QU)+ $CM+ $SP; # LB7a  SP CM+ behaves as ID
-! $CM* $LB5NonBreaks ($CM* $QU)+ $CM+ $SP;
-
-! $LB3NonBreaks? $CM* $QU;
-! $CM* $LB5NonBreaks $CM* $QU;    # Don't let a combining mark go onto $CR, $BK, etc.
+$CM* $QU $CM* $LB12NonBreaks;
+$CM* $QU $CM+ $SP;
+$CM* $LB5NonBreaks $CM* $QU;
 
 # LB 14a
 $BackLB14CanBreakAfter = ($CM* [$LB14NonBreaks - $CM]) | ($CM+ $SP);

 # LB 15
-! ($CM* $BA | $CM* $HY | $CM* $NS) $BackLB14CanBreakAfter;   
-! ($CM* $BA | $CM* $HY | $CM* $NS) $CM+ / [$BK $CR $LF $NL $ZW];   
-! [^$CB] $CM* $BB;
-! $CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;  
+$CM* ($BA | $HY | $NS) $BackLB14CanBreakAfter;   
+($CM* ($BA | $HY | $NS))+ $CM+ / [$BK $CR $LF $NL $ZW];   
+[$CR $LF $BK $NL $ZW] $CM* $BB;
+$CM* [^$CB $CR $LF $BK $NL $ZW] $CM* $BB;    

 # LB 16
-! $CM* $IN $CM* $ALPlus;
-! $CM* $IN $CM+ / [$BK $CR $LF $NL $ZW];     #  by rule 7c, any otherwise unattached CM behaves as AL
-! $CM* $IN $CM* $ID;
-! $CM* $IN $CM+ $SP; # by rule 7a, $SP $CM behaves like ID
-! $CM* $IN $CM* $IN;
-! $CM* $IN $CM* $NU;
+$CM* $IN $CM* $ALPlus;
+# by rule 7c, any otherwise unattached CM behaves as AL
+$CM* $IN $CM+ / [$BK $CR $LF $NL $ZW]; 
+
+$CM* $IN $CM* ($ID | $CM $SP); 
+$CM* $IN $CM* $IN;
+$CM* $IN $CM* $NU;

 # $LB 17
-! $CM* $PO ($CM* $ID | $CM+ $SP);
-! $CM* $NU ($CM* $ALPlus)+; # includes $LB19
-! ($CM* $NU)+;
-! ($CM* $NU)+ $CM+ / [$BK $CR $LF $NL $ZW];        # Rule 7c
-! ($CM* $ALPlus)+ $CM* $NU;
+$CM* $PO $CM* ($ID | $CM $SP);
+$CM* $NU ($CM* $ALPlus)+; # includes $LB19
+$CM* $NU $CM+ / [$BK $CR $LF $NL $ZW];        # Rule 7c
+
+$CM* $ALPlus $CM* $NU;

 # LB 18
-! ($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;
+($CM* $PO)? ($CM* $CL)? ($CM* $NU | $CM* $IS)* $CM* $NU ($CM* $OP | $CM* $HY)? ($CM* $PR)?;

 # LB 19
-! ($CM* $ALPlus)+;
-! ($CM* $ALPlus)+ $CM+ / [$BK $CR $LF $NL $ZW];    # The $CM* is from rule 7C, and unattached CM is treated as AL
+$CM* $ALPlus $CM* $ALPlus;
+# The $CM* is from rule 7C, and unattached CM is treated as AL
+$CM* $ALPlus $CM+ / [$BK $CR $LF $NL $ZW];  
+
+## Problem: the state table can't handle a lookahead match when it is at
+## the start of the string; this is currently handled in the rbbi code.
+## TODO: fix this
+
+## -------------------------------------------------
+
+!!safe_reverse;
+
+$CM* [^$CM];
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -587,9 +587,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             if(exec) TestWordBreaks();                   break;
        case 13: name = "TestWordBoundary";
             if(exec) TestWordBoundary();                 break;
-/***
        case 14: name = "TestLineBreaks";
             if(exec) TestLineBreaks();                   break;
+        /***
        case 15: name = "TestSentBreaks";
             if(exec) TestSentBreaks();                 break;
        case 16: name = "TestExtended";
@ -602,8 +602,8 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
               logln("skipping TestMonkey (UCONFIG_NO_REGULAR_EXPRESSIONS)");
 #endif
             }
-***/
             break;
+        ***/
        default: name = ""; break; //needed to end loop
    }
 }
@ -2735,6 +2735,7 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        if (fSP->contains(thisChar)) {
            continue;
        }
+
        if (fZW->contains(thisChar)) {
            continue;
        }
@ -2745,11 +2746,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        }

        // LB 6, LB 7
+        int32_t oldpos = pos;
        rule67Adjust(prevPos, &prevChar, &pos,     &thisChar);
-
+        
        nextCPPos = fText->moveIndex32(pos, 1);
        nextPos   = nextCPPos;
        c = fText->char32At(nextPos);
+        // another peculiarity of LB 4
+        if (fSP->contains(thisChar)) {
+            continue;
+        }
        rule67Adjust(pos,     &thisChar, &nextPos, &c);

        // If the loop is still warming up - if we haven't shifted the initial
@ -2785,11 +2791,16 @@ int32_t RBBILineMonkey::next(int32_t startPos) {
        }

        // LB 9  Don't break after OP SP*
+        /// UBool cmFlag = FALSE;
        for (tPos=prevPos; ; tPos=fCharBI->preceding(tPos)) {
            if (fOP->contains(fText->char32At(tPos))) {
                break;
            }
-            if (fSP->contains(fText->char32At(tPos)) == FALSE || tPos == 0) {
+            if (fSP->contains(prevChar) == FALSE
+                || fSP->contains(fText->char32At(tPos)) == FALSE 
+                || tPos == 0) {
+                /// || cmFlag == TRUE) {
+                // if we have $SP$CM+ which is an $ID
                goto fall_through_9;
            }
        }
@ -3164,7 +3175,13 @@ void RBBITest::TestLineBreaks(void)
    BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
    UChar         str[20]; 
    char          *strlist[] = 
-    {"\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
+    {
+     "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
+     "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
+     "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
+     "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
+     "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
+     "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
     "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
     "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
     "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
@ -3184,10 +3201,9 @@ void RBBITest::TestLineBreaks(void)
    };
    int loop;
    for (loop = 0; loop < (sizeof(strlist) / sizeof(char *)); loop ++) {
-        printf("looping %d\n", loop);
+        // printf("looping %d\n", loop);
        u_unescape(strlist[loop], str, 20);
        UnicodeString ustr(str);
-        // RBBICharMonkey monkey;
        RBBILineMonkey monkey;

        int expected[20];
@ -3214,8 +3230,9 @@ void RBBITest::TestLineBreaks(void)
            printStringBreaks(ustr, expected, expectedcount);
            errln("happy break test failed: missed %d match", 
                  expectedcount - count);
+            break;
        }
-        for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
+         for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
            count --;
            if (forward[count] != i) {
                printStringBreaks(ustr, expected, expectedcount);
@ -3226,6 +3243,7 @@ void RBBITest::TestLineBreaks(void)
        }
        if (count != 0) {
            errln("happy break test failed: missed a match");
+            break;
        }
    }
 }
--- a/icu4c/source/tools/genbrk/genbrk.cpp
+++ b/icu4c/source/tools/genbrk/genbrk.cpp
@ -108,8 +108,8 @@ DataHeader dh ={
        0,                          //     reserved

    { 0x42, 0x72, 0x6b, 0x20 },     //     dataFormat="Brk "
-    { 2, 1, 0, 0 },                 //     formatVersion
-        { 3, 1, 0, 0 }                //   dataVersion (Unicode version)
+    { 3, 0, 0, 0 },                 //     formatVersion
+        { 4, 0, 0, 0 }                //   dataVersion (Unicode version)
    }};

 #endif