ICU-13194 RBBI safe tables, all tests passing!

X-SVN-Rev: 41155
2025-04-08 06:53:45 +00:00 · 2018-03-26 23:01:16 +00:00 · 2018-03-26 23:01:16 +00:00 · b1b0be93ea
commit b1b0be93ea
parent 62dd66a13d
9 changed files with 61 additions and 66 deletions
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -651,7 +651,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
    }

    // Adjust offset to be on a code point boundary and not beyond the end of the text.
-    // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
+    // Note that isBoundary() is always false for offsets that are not on code point boundaries.
    // But we still need the side effect of leaving iteration at the following boundary.

    utext_setNativeIndex(&fText, offset);
@ -1116,7 +1116,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
    UChar32             c;
    int32_t             result          = 0;

-    const RBBIStateTable *stateTable = fData->fSafeRevTable;
+    const RBBIStateTable *stateTable = fData->fReverseTable;
    UTEXT_SETNATIVEINDEX(&fText, fromPosition);
    #ifdef RBBI_DEBUG
        if (gTrace) {
--- a/icu4c/source/common/rbbi_cache.cpp
+++ b/icu4c/source/common/rbbi_cache.cpp
@ -206,7 +206,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
 *   BreakCache implemetation
 */

-RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) : 
+RuleBasedBreakIterator::BreakCache::BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status) :
        fBI(bi), fSideBuffer(status) {
    reset();
 }
@ -299,7 +299,7 @@ void RuleBasedBreakIterator::BreakCache::previous(UErrorCode &status) {
    fBI->fPosition = fTextIdx;
    fBI->fRuleStatusIndex = fStatuses[fBufIdx];
    return;
-}    
+}


 UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
@ -317,7 +317,7 @@ UBool RuleBasedBreakIterator::BreakCache::seek(int32_t pos) {
        fTextIdx = fBoundaries[fBufIdx];
        return TRUE;
    }
-    
+
    int32_t min = fStartBufIdx;
    int32_t max = fEndBufIdx;
    while (min != max) {
@ -354,20 +354,33 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC
    if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
        int32_t aBoundary = 0;
        int32_t ruleStatusIndex = 0;
-        // TODO: check for position == length of text. Although may still need to back up to get rule status.
        if (position > 20) {
            int32_t backupPos = fBI->handleSafePrevious(position);
-            fBI->fPosition = backupPos;
-            aBoundary = fBI->handleNext();        // Ignore dictionary, just finding a rule based boundary.
-            if (aBoundary == backupPos + 1) {   // TODO: + 1 is wrong for supplementals.
-                // Safe rules work on pairs. +1 from start pos may be a false match.
+
+            if (backupPos > 0) {
+                // Advance to the boundary following the backup position.
+                // There is a complication: the safe reverse rules identify pairs of code points
+                // that are safe. If advancing from the safe point moves forwards by less than
+                // two code points, we need to advance one more time to ensure that the boundary
+                // is good, including a correct rules status value.
+                //
+                fBI->fPosition = backupPos;
                aBoundary = fBI->handleNext();
+                if (aBoundary <= backupPos + 4) {
+                    // +4 is a quick test for possibly having advanced only one codepoint.
+                    // Four being the length of the longest potential code point, a supplementary in UTF-8
+                    utext_setNativeIndex(&fBI->fText, aBoundary);
+                    if (backupPos == utext_getPreviousNativeIndex(&fBI->fText)) {
+                        // The initial handleNext() only advanced by a single code point. Go again.
+                        aBoundary = fBI->handleNext();   // Safe rules identify safe pairs.
+                    }
+                }
+                ruleStatusIndex = fBI->fRuleStatusIndex;
            }
-            ruleStatusIndex = fBI->fRuleStatusIndex;
        }
        reset(aBoundary, ruleStatusIndex);        // Reset cache to hold aBoundary as a single starting point.
    }
-    
+
    // Fill in boundaries between existing cache content and the new requested position.

    if (fBoundaries[fEndBufIdx] < position) {
@ -495,13 +508,24 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
            position = 0;
            positionStatusIdx = 0;
        } else {
-            fBI->fPosition = backupPosition;  // TODO: pass starting position in a clearer way.
-            position = fBI->handleNext();     // TODO: supplementals don't work with the +1.
-            if (position == backupPosition + 1) {
-                position = fBI->handleNext();   // Safe rules identify safe pairs.
+            // Advance to the boundary following the backup position.
+            // There is a complication: the safe reverse rules identify pairs of code points
+            // that are safe. If advancing from the safe point moves forwards by less than
+            // two code points, we need to advance one more time to ensure that the boundary
+            // is good, including a correct rules status value.
+            //
+            fBI->fPosition = backupPosition;
+            position = fBI->handleNext();
+            if (position <= backupPosition + 4) {
+                // +4 is a quick test for possibly having advanced only one codepoint.
+                // Four being the length of the longest potential code point, a supplementary in UTF-8
+                utext_setNativeIndex(&fBI->fText, position);
+                if (backupPosition == utext_getPreviousNativeIndex(&fBI->fText)) {
+                    // The initial handleNext() only advanced by a single code point. Go again.
+                    position = fBI->handleNext();   // Safe rules identify safe pairs.
+                }
            };
            positionStatusIdx = fBI->fRuleStatusIndex;
-
        }
    } while (position >= fromPosition);

@ -540,7 +564,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
            }
            U_ASSERT(position==dictSegEndPosition || position>=fromPosition);
        }
-            
+
        if (!segmentHandledByDictionary && position < fromPosition) {
            fSideBuffer.addElement(position, status);
            fSideBuffer.addElement(positionStatusIdx, status);
@ -566,7 +590,7 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status)
            break;
        }
    }
-      
+
    return success;
 }

--- a/icu4c/source/common/rbbidata.cpp
+++ b/icu4c/source/common/rbbidata.cpp
@ -106,7 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
    if (data->fFTableLen != 0) {
        fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
    }
-    if (data->fSRTableLen != 0) {
+    if (data->fRTableLen != 0) {
+        fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
+    }
+    if (data->fSRTableLen != 0) {   // TODO: obsolete. Remove.
        fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
    }

--- a/icu4c/source/common/rbbidata.h
+++ b/icu4c/source/common/rbbidata.h
@ -173,7 +173,8 @@ public:
    /*                                     */
    const RBBIDataHeader     *fHeader;
    const RBBIStateTable     *fForwardTable;
-    const RBBIStateTable     *fSafeRevTable;
+    const RBBIStateTable     *fReverseTable;      // auto-generated safe reverse.
+    const RBBIStateTable     *fSafeRevTable;      // hand-written safe reverse. TODO: delete this.
    const UChar              *fRuleSource;
    const int32_t            *fRuleStatusTable; 

--- a/icu4c/source/common/rbbirb.cpp
+++ b/icu4c/source/common/rbbirb.cpp
@ -300,10 +300,9 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
    fForwardTables->buildSafe(status);


-    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
 #ifdef RBBI_DEBUG
    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
-        fForwardTables
+        fForwardTables->printStates();
        fForwardTables->printRuleStatusTable();
        fForwardTables->printSafeTable();
    }
--- a/icu4c/source/common/rbbitblb.cpp
+++ b/icu4c/source/common/rbbitblb.cpp
@ -1312,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
            if (wantedEndState == endState) {
                int32_t pair = c1 << 16 | c2;
                safePairs.addElement(pair, status);
-                printf("(%d, %d) ", c1, c2);
+                // printf("(%d, %d) ", c1, c2);
            }
        }
-        printf("\n");
+        // printf("\n");
    }

    // Populate the initial safe table.
@ -1358,7 +1358,7 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) {
        rowState.setCharAt(c1, 0);
    }

-    // Merge similar states.
+    // TODO: Merge similar states.

 }

@ -1392,9 +1392,9 @@ int32_t  RBBITableBuilder::getSafeTableSize() const {

 //-----------------------------------------------------------------------------
 //
-//   exportTable()    export the state transition table in the format required
-//                    by the runtime engine.  getTableSize() bytes of memory
-//                    must be available at the output address "where".
+//   exportSafeTable()   export the state transition table in the format required
+//                       by the runtime engine.  getTableSize() bytes of memory
+//                       must be available at the output address "where".
 //
 //-----------------------------------------------------------------------------
 void RBBITableBuilder::exportSafeTable(void *where) {
--- a/icu4c/source/test/cintltst/cbiapts.c
+++ b/icu4c/source/test/cintltst/cbiapts.c
@ -308,14 +308,14 @@ static void TestBreakIteratorCAPI()


    log_verbose("\nTesting the functions for sentence\n");
-    ubrk_first(sentence);
+    pos = ubrk_first(sentence);
    pos = ubrk_current(sentence);
    log_verbose("Current(sentence) = %d\n", (int32_t)pos);
       pos = ubrk_last(sentence);
    if(pos!=49)
        log_err("error ubrk_last for sentence did not return 49\n");
    log_verbose("Last (sentence) = %d\n", (int32_t)pos);
-    ubrk_first(sentence);
+    pos = ubrk_first(sentence);
    to = ubrk_following( sentence, 0 );
    if (to == 0) log_err("ubrk_following returned 0\n");
    to = ubrk_preceding( sentence, to );
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -4551,40 +4551,6 @@ void RBBITest::TestDebug(void) {
    UParseError pe;
    LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
    assertSuccess(WHERE, status);
-
-#if 0
-    bi->dumpTables();
-
-    RBBIDataWrapper *dw = bi->fData;
-    const RBBIStateTable *fwtbl = dw->fForwardTable;
-    int32_t numCharClasses = dw->fHeader->fCatCount;
-    printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
-
-    for (int32_t c1=0; c1<numCharClasses; ++c1) {
-        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
-            int32_t wantedEndState = -1;
-            int32_t endState = 0;
-            for (int32_t startState = 1; startState < (int32_t)fwtbl->fNumStates; ++startState) {
-                RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState));
-                int32_t s2 = row->fNextState[c1];
-                row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2));
-                endState = row->fNextState[c2];
-                if (wantedEndState < 0) {
-                    wantedEndState = endState;
-                } else {
-                    if (wantedEndState != endState) {
-                        break;
-                    }
-                }
-            }
-            if (wantedEndState == endState) {
-                printf("(%d, %d) ", c1, c2);
-            }
-        }
-        printf("\n");
-    }
-    printf("\n");
-#endif
 }

 void RBBITest::TestProperties() {
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@ -39,7 +39,9 @@

 #   Temp debugging tests
 #
-
+<word>
+<data>•
+•</data>

 ## FILTERED BREAK TESTS