From 62dd66a13d051e0ac66933f4a7933fc93bdbfd49 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 22 Mar 2018 17:31:00 +0000 Subject: [PATCH] ICU-13194 RBBI safe rules, work in progress. X-SVN-Rev: 41135 --- icu4c/source/common/rbbi_cache.cpp | 17 ++- icu4c/source/common/rbbirb.cpp | 36 +++---- icu4c/source/common/rbbitblb.cpp | 139 ++++++++++++++++++++++--- icu4c/source/common/rbbitblb.h | 2 + icu4c/source/test/intltest/rbbitst.cpp | 41 ++++++++ icu4c/source/test/testdata/rbbitst.txt | 8 +- 6 files changed, 201 insertions(+), 42 deletions(-) diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 44c66fbea36..33f03d90013 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -356,12 +356,16 @@ UBool RuleBasedBreakIterator::BreakCache::populateNear(int32_t position, UErrorC int32_t ruleStatusIndex = 0; // TODO: check for position == length of text. Although may still need to back up to get rule status. if (position > 20) { - int32_t backupPos = fBI->handlePrevious(position); + int32_t backupPos = fBI->handleSafePrevious(position); fBI->fPosition = backupPos; - aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary. + aBoundary = fBI->handleNext(); // Ignore dictionary, just finding a rule based boundary. + if (aBoundary == backupPos + 1) { // TODO: + 1 is wrong for supplementals. + // Safe rules work on pairs. +1 from start pos may be a false match. + aBoundary = fBI->handleNext(); + } ruleStatusIndex = fBI->fRuleStatusIndex; } - reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. + reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point. } // Fill in boundaries between existing cache content and the new requested position. @@ -485,14 +489,17 @@ UBool RuleBasedBreakIterator::BreakCache::populatePreceding(UErrorCode &status) if (backupPosition <= 0) { backupPosition = 0; } else { - backupPosition = fBI->handlePrevious(backupPosition); + backupPosition = fBI->handleSafePrevious(backupPosition); } if (backupPosition == UBRK_DONE || backupPosition == 0) { position = 0; positionStatusIdx = 0; } else { fBI->fPosition = backupPosition; // TODO: pass starting position in a clearer way. - position = fBI->handleNext(); + position = fBI->handleNext(); // TODO: supplementals don't work with the +1. + if (position == backupPosition + 1) { + position = fBI->handleNext(); // Safe rules identify safe pairs. + }; positionStatusIdx = fBI->fRuleStatusIndex; } diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index ac629c6222c..043cef25f15 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -154,13 +154,15 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { // int32_t headerSize = align8(sizeof(RBBIDataHeader)); int32_t forwardTableSize = align8(fForwardTables->getTableSize()); - int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); + int32_t reverseTableSize = align8(fForwardTables->getSafeTableSize()); + int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); // TODO: remove hand-written rules. int32_t trieSize = align8(fSetBuilder->getTrieSize()); int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar)); int32_t totalSize = headerSize - + forwardTableSize + + forwardTableSize + + reverseTableSize + safeRevTableSize + statusTableSize + trieSize + rulesSize; @@ -180,27 +182,18 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { data->fLength = totalSize; data->fCatCount = fSetBuilder->getNumCharCategories(); - // Only save the forward table and the safe reverse table, - // because these are the only ones used at run-time. - // - // For the moment, we still build the other tables if they are present in the rule source files, - // for backwards compatibility. Old rule files need to work, and this is the simplest approach. - // - // Additional backwards compatibility consideration: if no safe rules are provided, consider the - // reverse rules to actually be the safe reverse rules. - data->fFTable = headerSize; data->fFTableLen = forwardTableSize; - // Do not save Reverse Table. - data->fRTable = data->fFTable + forwardTableSize; - data->fRTableLen = 0; + data->fRTable = data->fFTable + data->fFTableLen; + data->fRTableLen = reverseTableSize; // Do not save the Safe Forward table. - data->fSFTable = data->fRTable + 0; + data->fSFTable = data->fRTable + data->fRTableLen; data->fSFTableLen = 0; - data->fSRTable = data->fSFTable + 0; + // Hand written reverse rules. TODO: remove, once synthesized ones are working. + data->fSRTable = data->fSFTable + data->fSFTableLen; data->fSRTableLen = safeRevTableSize; U_ASSERT(safeRevTableSize > 0); @@ -214,6 +207,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); fForwardTables->exportTable((uint8_t *)data + data->fFTable); + fForwardTables->exportSafeTable((uint8_t *)data + data->fRTable); fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); @@ -297,22 +291,24 @@ RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) { if (fForwardTables == nullptr || fSafeRevTables == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; - delete fForwardTables; fForwardTables = nullptr; - delete fSafeRevTables; fSafeRevTables = nullptr; return nullptr; } fForwardTables->build(); - fForwardTables->buildSafe(status); fSafeRevTables->build(); + optimizeTables(); + fForwardTables->buildSafe(status); + + if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; #ifdef RBBI_DEBUG if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) { + fForwardTables fForwardTables->printRuleStatusTable(); + fForwardTables->printSafeTable(); } #endif - optimizeTables(); fSetBuilder->buildTrie(); // diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index ee2d172c185..dd84b4c27b7 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -38,10 +38,8 @@ RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UEr } // fDStates is UVector fDStates = new UVector(status); - // SafeTable is UVector. Contents owned by the UVector. - fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status); - if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) { - status = U_MEMORY_ALLOCATION_ERROR;; + if (U_SUCCESS(status) && fDStates == nullptr ) { + status = U_MEMORY_ALLOCATION_ERROR; } } @@ -190,8 +188,6 @@ void RBBITableBuilder::build() { // for all tables. Merge the ones from this table into the global set. // mergeRuleStatusVals(); - - if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();}; } @@ -1316,10 +1312,10 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) { if (wantedEndState == endState) { int32_t pair = c1 << 16 | c2; safePairs.addElement(pair, status); - // printf("(%d, %d) ", c1, c2); + printf("(%d, %d) ", c1, c2); } } - //printf("\n"); + printf("\n"); } // Populate the initial safe table. @@ -1329,21 +1325,25 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) { // Row 1 is the start sate. // Row 2 and beyond are other states, initially one per char class, but // after initial construction, many of the states will be combined, compacting the table.) + // The String holds the nextState data only. The four leading fields of a row, fAccepting, + // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building. + + U_ASSERT(fSafeTable == nullptr); fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status); for (int32_t row=0; rowaddElement(new UnicodeString(numCharClasses+4, 0, numCharClasses+4), status); + fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status); } // From the start state, each input char class transitions to the state for that input. UnicodeString &startState = *(UnicodeString *)fSafeTable->elementAt(1); for (int32_t charClass=0; charClass < numCharClasses; ++charClass) { - // Note: +2 for the start & stop state; +4 for header columns in state table. - startState.setCharAt(charClass+4, charClass+2); + // Note: +2 for the start & stop state. + startState.setCharAt(charClass, charClass+2); } // Initially make every other state table row look like the start state row, for (int32_t row=2; rowelementAt(1); + UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(row); rowState = startState; // UnicodeString assignment, copies contents. } @@ -1355,13 +1355,85 @@ void RBBITableBuilder::buildSafe(UErrorCode &status) { int32_t c2 = pair & 0x0000ffff; UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(c2 + 2); - rowState.setCharAt(c1 + 4, 0); + rowState.setCharAt(c1, 0); } // Merge similar states. } + +//----------------------------------------------------------------------------- +// +// getSafeTableSize() Calculate the size of the runtime form of this +// safe state table. +// +//----------------------------------------------------------------------------- +int32_t RBBITableBuilder::getSafeTableSize() const { + int32_t size = 0; + int32_t numRows; + int32_t numCols; + int32_t rowSize; + + if (fSafeTable == nullptr) { + return 0; + } + + size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table. + + numRows = fSafeTable->size(); + numCols = fRB->fSetBuilder->getNumCharCategories(); + + rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols; + size += numRows * rowSize; + return size; +} + + +//----------------------------------------------------------------------------- +// +// exportTable() export the state transition table in the format required +// by the runtime engine. getTableSize() bytes of memory +// must be available at the output address "where". +// +//----------------------------------------------------------------------------- +void RBBITableBuilder::exportSafeTable(void *where) { + RBBIStateTable *table = (RBBIStateTable *)where; + uint32_t state; + int col; + + if (U_FAILURE(*fStatus) || fSafeTable == nullptr) { + return; + } + + int32_t catCount = fRB->fSetBuilder->getNumCharCategories(); + if (catCount > 0x7fff || + fSafeTable->size() > 0x7fff) { + *fStatus = U_BRK_INTERNAL_ERROR; + return; + } + + table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount; + table->fNumStates = fSafeTable->size(); + table->fFlags = 0; + table->fReserved = 0; + + for (state=0; statefNumStates; state++) { + UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state); + RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); + row->fAccepting = 0; + row->fLookAhead = 0; + row->fTagIdx = 0; + row->fReserved = 0; + for (col=0; colfNextState[col] = rowString->charAt(col); + } + } +} + + + + //----------------------------------------------------------------------------- // // printSet Debug function. Print the contents of a UVector @@ -1415,6 +1487,47 @@ void RBBITableBuilder::printStates() { #endif +//----------------------------------------------------------------------------- +// +// printSafeTable Debug Function. Dump the fully constructed safe table. +// +//----------------------------------------------------------------------------- +#ifdef RBBI_DEBUG +void RBBITableBuilder::printSafeTable() { + int c; // input "character" + int n; // state number + + RBBIDebugPrintf(" Safe Reverse Table \n"); + if (fSafeTable == nullptr) { + RBBIDebugPrintf(" --- nullptr ---\n"); + return; + } + RBBIDebugPrintf("state | i n p u t s y m b o l s \n"); + RBBIDebugPrintf(" | Acc LA Tag"); + for (c=0; cfSetBuilder->getNumCharCategories(); c++) { + RBBIDebugPrintf(" %2d", c); + } + RBBIDebugPrintf("\n"); + RBBIDebugPrintf(" |---------------"); + for (c=0; cfSetBuilder->getNumCharCategories(); c++) { + RBBIDebugPrintf("---"); + } + RBBIDebugPrintf("\n"); + + for (n=0; nsize(); n++) { + UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n); + RBBIDebugPrintf(" %3d | " , n); + RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags + for (c=0; cfSetBuilder->getNumCharCategories(); c++) { + RBBIDebugPrintf(" %2d", rowString->charAt(c)); + } + RBBIDebugPrintf("\n"); + } + RBBIDebugPrintf("\n\n"); +} +#endif + + //----------------------------------------------------------------------------- // diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index 104d544b172..6b5bb86196d 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -123,11 +123,13 @@ public: void printPosSets(RBBINode *n /* = NULL*/); void printStates(); void printRuleStatusTable(); + void printSafeTable(); #else #define printSet(s) #define printPosSets(n) #define printStates() #define printRuleStatusTable() + #define printSafeTable() #endif private: diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 02785167d76..23ecf10e927 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -4544,6 +4544,47 @@ void RBBITest::TestBug13447() { // for tracing without a lot of unwanted extra stuff happening. // void RBBITest::TestDebug(void) { + UErrorCode status = U_ZERO_ERROR; + LocalPointer bi ((RuleBasedBreakIterator *) + BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status); + const UnicodeString &rules = bi->getRules(); + UParseError pe; + LocalPointer newbi(new RuleBasedBreakIterator(rules, pe, status)); + assertSuccess(WHERE, status); + +#if 0 + bi->dumpTables(); + + RBBIDataWrapper *dw = bi->fData; + const RBBIStateTable *fwtbl = dw->fForwardTable; + int32_t numCharClasses = dw->fHeader->fCatCount; + printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); + + for (int32_t c1=0; c1fNumStates; ++startState) { + RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * startState)); + int32_t s2 = row->fNextState[c1]; + row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * s2)); + endState = row->fNextState[c2]; + if (wantedEndState < 0) { + wantedEndState = endState; + } else { + if (wantedEndState != endState) { + break; + } + } + } + if (wantedEndState == endState) { + printf("(%d, %d) ", c1, c2); + } + } + printf("\n"); + } + printf("\n"); +#endif } void RBBITest::TestProperties() { diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 761b3e01b5b..38a69fd9742 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1370,7 +1370,7 @@ Bangkok)• !!forward; Hello\ World; -!!reverse; +!!safe_reverse; .*; •Hello World• @@ -1379,7 +1379,7 @@ Bangkok)• !!quoted_literals_only; !!forward; Hello\ World; -!!reverse; +!!safe_reverse; .*; @@ -1387,7 +1387,7 @@ Bangkok)• !!quoted_literals_only; !!forward; 'Hello World'; -!!reverse; +!!safe_reverse; .*; •Hello World• @@ -1399,7 +1399,7 @@ Bangkok)• !!forward; .; -!!reverse; +!!safe_reverse; .*; •a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•a•