diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 43ba58ba9e6..4182b6e0700 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -746,18 +746,68 @@ struct LookAheadResults { }; +// Wrapper functions to select the appropriate handleNext() or handleSafePrevious() +// instantiation, based on whether an 8 or 16 bit table is required. +// +// These Trie access functions will be inlined within the handleNext()/Previous() instantions. +static inline uint16_t TrieFunc8(const UCPTrie *trie, UChar32 c) { + return UCPTRIE_FAST_GET(trie, UCPTRIE_8, c); +} + +static inline uint16_t TrieFunc16(const UCPTrie *trie, UChar32 c) { + return UCPTRIE_FAST_GET(trie, UCPTRIE_16, c); +} + +int32_t RuleBasedBreakIterator::handleNext() { + const RBBIStateTable *statetable = fData->fForwardTable; + bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; + if (statetable->fFlags & RBBI_8BITS_ROWS) { + if (use8BitsTrie) { + return handleNext(); + } else { + return handleNext(); + } + } else { + if (use8BitsTrie) { + return handleNext(); + } else { + return handleNext(); + } + } +} + +int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { + const RBBIStateTable *statetable = fData->fReverseTable; + bool use8BitsTrie = ucptrie_getValueWidth(fData->fTrie) == UCPTRIE_VALUE_BITS_8; + if (statetable->fFlags & RBBI_8BITS_ROWS) { + if (use8BitsTrie) { + return handleSafePrevious(fromPosition); + } else { + return handleSafePrevious(fromPosition); + } + } else { + if (use8BitsTrie) { + return handleSafePrevious(fromPosition); + } else { + return handleSafePrevious(fromPosition); + } + } +} + + //----------------------------------------------------------------------------------- // // handleNext() // Run the state machine to find a boundary // //----------------------------------------------------------------------------------- +template int32_t RuleBasedBreakIterator::handleNext() { int32_t state; uint16_t category = 0; RBBIRunMode mode; - RBBIStateTableRow *row; + RowType *row; UChar32 c; LookAheadResults lookAheadMatches; int32_t result = 0; @@ -789,7 +839,7 @@ int32_t RuleBasedBreakIterator::handleNext() { // Set the initial state for the state machine state = START_STATE; - row = (RBBIStateTableRow *) + row = (RowType *) //(statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); @@ -825,20 +875,17 @@ int32_t RuleBasedBreakIterator::handleNext() { if (mode == RBBI_RUN) { // look up the current character's character category, which tells us // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. - // - category = UTRIE2_GET16(fData->fTrie, c); + category = trieFunc(fData->fTrie, c); // Check the dictionary bit in the character's category. // Counter is only used by dictionary based iteration. // Chars that need to be handled by a dictionary have a flag bit set // in their category values. // - if ((category & 0x4000) != 0) { + if ((category & dictMask) != 0) { fDictionaryCharCount++; // And off the dictionary flag bit. - category &= ~0x4000; + category &= ~dictMask; } } @@ -860,7 +907,7 @@ int32_t RuleBasedBreakIterator::handleNext() { // fNextState is a variable-length array. U_ASSERT(categoryfHeader->fCatCount); state = row->fNextState[category]; /*Not accessing beyond memory*/ - row = (RBBIStateTableRow *) + row = (RowType *) // (statetable->fTableData + (statetable->fRowLen * state)); (tableData + tableRowLen * state); @@ -948,10 +995,12 @@ int32_t RuleBasedBreakIterator::handleNext() { // because the safe table does not require as many options. // //----------------------------------------------------------------------------------- +template int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { + int32_t state; uint16_t category = 0; - RBBIStateTableRow *row; + RowType *row; UChar32 c; int32_t result = 0; @@ -971,7 +1020,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // Set the initial state for the state machine c = UTEXT_PREVIOUS32(&fText); state = START_STATE; - row = (RBBIStateTableRow *) + row = (RowType *) (stateTable->fTableData + (stateTable->fRowLen * state)); // loop until we reach the start of the text or transition to state 0 @@ -980,12 +1029,10 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // look up the current character's character category, which tells us // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. // - // And off the dictionary flag bit. For reverse iteration it is not used. - category = UTRIE2_GET16(fData->fTrie, c); - category &= ~0x4000; + // Off the dictionary flag bit. For reverse iteration it is not used. + category = trieFunc(fData->fTrie, c); + category &= ~dictMask; #ifdef RBBI_DEBUG if (gTrace) { @@ -1004,7 +1051,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { // fNextState is a variable-length array. U_ASSERT(categoryfHeader->fCatCount); state = row->fNextState[category]; /*Not accessing beyond memory*/ - row = (RBBIStateTableRow *) + row = (RowType *) (stateTable->fTableData + (stateTable->fRowLen * state)); if (state == STOP_STATE) { @@ -1024,6 +1071,7 @@ int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { return result; } + //------------------------------------------------------------------------------- // // getRuleStatus() Return the break rule tag associated with the current diff --git a/icu4c/source/common/rbbi_cache.cpp b/icu4c/source/common/rbbi_cache.cpp index 4f9e83360a2..4ea9e3e28b2 100644 --- a/icu4c/source/common/rbbi_cache.cpp +++ b/icu4c/source/common/rbbi_cache.cpp @@ -119,6 +119,8 @@ UBool RuleBasedBreakIterator::DictionaryCache::preceding(int32_t fromPos, int32_ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPos, int32_t endPos, int32_t firstRuleStatus, int32_t otherRuleStatus) { + uint32_t dictMask = ucptrie_getValueWidth(fBI->fData->fTrie) == UCPTRIE_VALUE_BITS_8 ? + kDictBitFor8BitsTrie : kDictBit; if ((endPos - startPos) <= 1) { return; } @@ -142,13 +144,13 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo utext_setNativeIndex(text, rangeStart); UChar32 c = utext_current32(text); - category = UTRIE2_GET16(fBI->fData->fTrie, c); + category = ucptrie_get(fBI->fData->fTrie, c); while(U_SUCCESS(status)) { - while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & 0x4000) == 0) { + while((current = (int32_t)UTEXT_GETNATIVEINDEX(text)) < rangeEnd && (category & dictMask) == 0) { utext_next32(text); // TODO: cleaner loop structure. c = utext_current32(text); - category = UTRIE2_GET16(fBI->fData->fTrie, c); + category = ucptrie_get(fBI->fData->fTrie, c); } if (current >= rangeEnd) { break; @@ -166,7 +168,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo // Reload the loop variables for the next go-round c = utext_current32(text); - category = UTRIE2_GET16(fBI->fData->fTrie, c); + category = ucptrie_get(fBI->fData->fTrie, c); } // If we found breaks, ensure that the first and last entries are diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index 1d4c9e5895f..7b624d90df2 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -11,10 +11,10 @@ #if !UCONFIG_NO_BREAK_ITERATION +#include "unicode/ucptrie.h" #include "unicode/utypes.h" #include "rbbidata.h" #include "rbbirb.h" -#include "utrie2.h" #include "udatamem.h" #include "cmemory.h" #include "cstring.h" @@ -110,17 +110,24 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable); } - fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS, - (uint8_t *)data + fHeader->fTrie, - fHeader->fTrieLen, - NULL, // *actual length - &status); + fTrie = ucptrie_openFromBinary(UCPTRIE_TYPE_FAST, + UCPTRIE_VALUE_BITS_ANY, + (uint8_t *)data + fHeader->fTrie, + fHeader->fTrieLen, + nullptr, // *actual length + &status); if (U_FAILURE(status)) { return; } - fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource); - fRuleString.setTo(TRUE, fRuleSource, -1); + UCPTrieValueWidth width = ucptrie_getValueWidth(fTrie); + if (!(width == UCPTRIE_VALUE_BITS_8 || width == UCPTRIE_VALUE_BITS_16)) { + status = U_INVALID_FORMAT_ERROR; + return; + } + + fRuleSource = ((char *)data + fHeader->fRuleSource); + fRuleString = UnicodeString::fromUTF8(StringPiece(fRuleSource, fHeader->fRuleSourceLen)); U_ASSERT(data->fRuleSourceLen > 0); fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable); @@ -142,8 +149,8 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) { //----------------------------------------------------------------------------- RBBIDataWrapper::~RBBIDataWrapper() { U_ASSERT(fRefCount == 0); - utrie2_close(fTrie); - fTrie = NULL; + ucptrie_close(fTrie); + fTrie = nullptr; if (fUDataMem) { udata_close(fUDataMem); } else if (!fDontFreeData) { @@ -225,6 +232,11 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab RBBIDebugPrintf(" %s\n", heading); + RBBIDebugPrintf("Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n", + table->fFlags, + table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F", + table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F", + table->fFlags & RBBI_8BITS_ROWS ? "T" : "F"); RBBIDebugPrintf("State | Acc LA TagIx"); for (c=0; cfCatCount; c++) {RBBIDebugPrintf("%3d ", c);} RBBIDebugPrintf("\n------|---------------"); for (c=0;cfCatCount; c++) { @@ -236,12 +248,20 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab RBBIDebugPrintf(" N U L L T A B L E\n\n"); return; } + UBool use8Bits = table->fFlags & RBBI_8BITS_ROWS; for (s=0; sfNumStates; s++) { RBBIStateTableRow *row = (RBBIStateTableRow *) (table->fTableData + (table->fRowLen * s)); - RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx); - for (c=0; cfCatCount; c++) { - RBBIDebugPrintf("%3d ", row->fNextState[c]); + if (use8Bits) { + RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r8.fAccepting, row->r8.fLookAhead, row->r8.fTagIdx); + for (c=0; cfCatCount; c++) { + RBBIDebugPrintf("%3d ", row->r8.fNextState[c]); + } + } else { + RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->r16.fAccepting, row->r16.fLookAhead, row->r16.fTagIdx); + for (c=0; cfCatCount; c++) { + RBBIDebugPrintf("%3d ", row->r16.fNextState[c]); + } } RBBIDebugPrintf("\n"); } @@ -377,35 +397,64 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD // int32_t topSize = offsetof(RBBIStateTable, fTableData); - // Forward state table. + // Forward state table. tableStartOffset = ds->readUInt32(rbbiDH->fFTable); tableLength = ds->readUInt32(rbbiDH->fFTableLen); if (tableLength > 0) { - ds->swapArray32(ds, inBytes+tableStartOffset, topSize, + RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset); + UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS; + + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, outBytes+tableStartOffset, status); - ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, - outBytes+tableStartOffset+topSize, status); + + // Swap the state table if the table is in 16 bits. + if (use8Bits) { + if (outBytes != inBytes) { + uprv_memmove(outBytes+tableStartOffset+topSize, + inBytes+tableStartOffset+topSize, + tableLength-topSize); + } + } else { + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, + outBytes+tableStartOffset+topSize, status); + } } - + // Reverse state table. Same layout as forward table, above. tableStartOffset = ds->readUInt32(rbbiDH->fRTable); tableLength = ds->readUInt32(rbbiDH->fRTableLen); if (tableLength > 0) { - ds->swapArray32(ds, inBytes+tableStartOffset, topSize, + RBBIStateTable *rbbiST = (RBBIStateTable *)(inBytes+tableStartOffset); + UBool use8Bits = ds->readUInt32(rbbiST->fFlags) & RBBI_8BITS_ROWS; + + ds->swapArray32(ds, inBytes+tableStartOffset, topSize, outBytes+tableStartOffset, status); - ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, - outBytes+tableStartOffset+topSize, status); + + // Swap the state table if the table is in 16 bits. + if (use8Bits) { + if (outBytes != inBytes) { + uprv_memmove(outBytes+tableStartOffset+topSize, + inBytes+tableStartOffset+topSize, + tableLength-topSize); + } + } else { + ds->swapArray16(ds, inBytes+tableStartOffset+topSize, tableLength-topSize, + outBytes+tableStartOffset+topSize, status); + } } // Trie table for character categories - utrie2_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), - outBytes+ds->readUInt32(rbbiDH->fTrie), status); + ucptrie_swap(ds, inBytes+ds->readUInt32(rbbiDH->fTrie), ds->readUInt32(rbbiDH->fTrieLen), + outBytes+ds->readUInt32(rbbiDH->fTrie), status); - // Source Rules Text. It's UChar data - ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen), - outBytes+ds->readUInt32(rbbiDH->fRuleSource), status); + // Source Rules Text. It's UTF8 data + if (outBytes != inBytes) { + uprv_memmove(outBytes+ds->readUInt32(rbbiDH->fRuleSource), + inBytes+ds->readUInt32(rbbiDH->fRuleSource), + ds->readUInt32(rbbiDH->fRuleSourceLen)); + } // Table of rule status values. It's all int_32 values ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen), diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index 7b9b8d82526..2458f902faa 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -49,16 +49,17 @@ ubrk_swap(const UDataSwapper *ds, #ifdef __cplusplus +#include "unicode/ucptrie.h" #include "unicode/uobject.h" #include "unicode/unistr.h" #include "unicode/uversion.h" #include "umutex.h" -#include "utrie2.h" + U_NAMESPACE_BEGIN // The current RBBI data format version. -static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {5, 0, 0, 0}; +static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0}; /* * The following structs map exactly onto the raw data from ICU common data file. @@ -94,25 +95,25 @@ struct RBBIDataHeader { -struct RBBIStateTableRow { - int16_t fAccepting; /* Non-zero if this row is for an accepting state. */ +template +struct RBBIStateTableRowT { + ST fAccepting; /* Non-zero if this row is for an accepting state. */ /* Value 0: not an accepting state. */ /* -1: Unconditional Accepting state. */ /* positive: Look-ahead match has completed. */ /* Actual boundary position happened earlier */ /* Value here == fLookAhead in earlier */ /* state, at actual boundary pos. */ - int16_t fLookAhead; /* Non-zero if this row is for a state that */ + ST fLookAhead; /* Non-zero if this row is for a state that */ /* corresponds to a '/' in the rule source. */ /* Value is the same as the fAccepting */ /* value for the rule (which will appear */ /* in a different state. */ - int16_t fTagIdx; /* Non-zero if this row covers a {tagged} position */ + ST fTagIdx; /* Non-zero if this row covers a {tagged} position */ /* from a rule. Value is the index in the */ /* StatusTable of the set of matching */ /* tags (rule status values) */ - int16_t fReserved; - uint16_t fNextState[1]; /* Next State, indexed by char category. */ + UT fNextState[1]; /* Next State, indexed by char category. */ /* Variable-length array declared with length 1 */ /* to disable bounds checkers. */ /* Array Size is actually fData->fHeader->fCatCount*/ @@ -120,12 +121,18 @@ struct RBBIStateTableRow { /* before changing anything here. */ }; +typedef RBBIStateTableRowT RBBIStateTableRow8; +typedef RBBIStateTableRowT RBBIStateTableRow16; + +union RBBIStateTableRow { + RBBIStateTableRow16 r16; + RBBIStateTableRow8 r8; +}; struct RBBIStateTable { uint32_t fNumStates; /* Number of states. */ uint32_t fRowLen; /* Length of a state table row, in bytes. */ uint32_t fFlags; /* Option Flags for this state table */ - uint32_t fReserved; /* reserved */ char fTableData[1]; /* First RBBIStateTableRow begins here. */ /* Variable-length array declared with length 1 */ /* to disable bounds checkers. */ @@ -133,10 +140,9 @@ struct RBBIStateTable { /* arithmetic for indexing variable length rows.) */ }; -typedef enum { - RBBI_LOOKAHEAD_HARD_BREAK = 1, - RBBI_BOF_REQUIRED = 2 -} RBBIStateTableFlags; +constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1; +constexpr uint32_t RBBI_BOF_REQUIRED = 2; +constexpr uint32_t RBBI_8BITS_ROWS = 4; /* */ @@ -170,13 +176,13 @@ public: const RBBIDataHeader *fHeader; const RBBIStateTable *fForwardTable; const RBBIStateTable *fReverseTable; - const UChar *fRuleSource; + const char *fRuleSource; const int32_t *fRuleStatusTable; /* number of int32_t values in the rule status table. Used to sanity check indexing */ int32_t fStatusMaxIdx; - UTrie2 *fTrie; + UCPTrie *fTrie; private: u_atomic_int32_t fRefCount; diff --git a/icu4c/source/common/rbbirb.cpp b/icu4c/source/common/rbbirb.cpp index 68ded32e1d0..9c507527b8d 100644 --- a/icu4c/source/common/rbbirb.cpp +++ b/icu4c/source/common/rbbirb.cpp @@ -22,6 +22,7 @@ #include "unicode/uniset.h" #include "unicode/uchar.h" #include "unicode/uchriter.h" +#include "unicode/ustring.h" #include "unicode/parsepos.h" #include "unicode/parseerr.h" @@ -154,7 +155,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { int32_t reverseTableSize = align8(fForwardTable->getSafeTableSize()); int32_t trieSize = align8(fSetBuilder->getTrieSize()); int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); - int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar)); + + int32_t rulesLengthInUTF8 = 0; + u_strToUTF8WithSub(0, 0, &rulesLengthInUTF8, + fStrippedRules.getBuffer(), fStrippedRules.length(), + 0xfffd, nullptr, fStatus); + *fStatus = U_ZERO_ERROR; + + int32_t rulesSize = align8((rulesLengthInUTF8+1)); int32_t totalSize = headerSize + forwardTableSize @@ -197,11 +205,11 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { data->fRTableLen = reverseTableSize; data->fTrie = data->fRTable + data->fRTableLen; - data->fTrieLen = fSetBuilder->getTrieSize(); - data->fStatusTable = data->fTrie + trieSize; + data->fTrieLen = trieSize; + data->fStatusTable = data->fTrie + data->fTrieLen; data->fStatusTableLen= statusTableSize; data->fRuleSource = data->fStatusTable + statusTableSize; - data->fRuleSourceLen = fStrippedRules.length() * sizeof(UChar); + data->fRuleSourceLen = rulesLengthInUTF8; uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); @@ -214,7 +222,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() { ruleStatusTable[i] = fRuleStatusVals->elementAti(i); } - fStrippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); + u_strToUTF8WithSub((char *)data+data->fRuleSource, rulesSize, &rulesLengthInUTF8, + fStrippedRules.getBuffer(), fStrippedRules.length(), + 0xfffd, nullptr, fStatus); + if (U_FAILURE(*fStatus)) { + return NULL; + } return data; } diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index 4eb324b4b90..9c406af6713 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -829,16 +829,14 @@ static const UChar chRParen = 0x29; UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) { UnicodeString strippedRules; int32_t rulesLength = rules.length(); - bool skippingSpaces = false; for (int32_t idx=0; idxfStatus; fRangeList = 0; - fTrie = 0; + fMutableTrie = nullptr; + fTrie = nullptr; fTrieSize = 0; fGroupCount = 0; fSawBOF = FALSE; @@ -79,7 +80,8 @@ RBBISetBuilder::~RBBISetBuilder() delete r; } - utrie2_close(fTrie); + ucptrie_close(fTrie); + umutablecptrie_close(fMutableTrie); } @@ -255,17 +257,23 @@ void RBBISetBuilder::buildRanges() { void RBBISetBuilder::buildTrie() { RangeDescriptor *rlRange; - fTrie = utrie2_open(0, // Initial value for all code points. + fMutableTrie = umutablecptrie_open( + 0, // Initial value for all code points. 0, // Error value for out-of-range input. fStatus); + bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie; for (rlRange = fRangeList; rlRange!=0 && U_SUCCESS(*fStatus); rlRange=rlRange->fNext) { - utrie2_setRange32(fTrie, - rlRange->fStartChar, // Range start - rlRange->fEndChar, // Range end (inclusive) - rlRange->fNum, // value for range - TRUE, // Overwrite previously written values - fStatus); + uint32_t value = rlRange->fNum; + if (use8Bits && ((value & RuleBasedBreakIterator::kDictBit) != 0)) { + U_ASSERT((value & RuleBasedBreakIterator::kDictBitFor8BitsTrie) == 0); + value = RuleBasedBreakIterator::kDictBitFor8BitsTrie | (value & ~RuleBasedBreakIterator::kDictBit); + } + umutablecptrie_setRange(fMutableTrie, + rlRange->fStartChar, // Range start + rlRange->fEndChar, // Range end (inclusive) + value, // value for range + fStatus); } } @@ -274,8 +282,8 @@ void RBBISetBuilder::mergeCategories(IntPair categories) { U_ASSERT(categories.first >= 1); U_ASSERT(categories.second > categories.first); for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) { - int32_t rangeNum = rd->fNum & ~DICT_BIT; - int32_t rangeDict = rd->fNum & DICT_BIT; + int32_t rangeNum = rd->fNum & ~RuleBasedBreakIterator::kDictBit; + int32_t rangeDict = rd->fNum & RuleBasedBreakIterator::kDictBit; if (rangeNum == categories.second) { rd->fNum = categories.first | rangeDict; } else if (rangeNum > categories.second) { @@ -295,15 +303,18 @@ int32_t RBBISetBuilder::getTrieSize() { if (U_FAILURE(*fStatus)) { return 0; } - utrie2_freeze(fTrie, UTRIE2_16_VALUE_BITS, fStatus); - fTrieSize = utrie2_serialize(fTrie, - NULL, // Buffer - 0, // Capacity - fStatus); - if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { - *fStatus = U_ZERO_ERROR; + if (fTrie == nullptr) { + bool use8Bits = getNumCharCategories() <= kMaxCharCategoriesFor8BitsTrie; + fTrie = umutablecptrie_buildImmutable( + fMutableTrie, + UCPTRIE_TYPE_FAST, + use8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16, + fStatus); + fTrieSize = ucptrie_toBinary(fTrie, nullptr, 0, fStatus); + if (*fStatus == U_BUFFER_OVERFLOW_ERROR) { + *fStatus = U_ZERO_ERROR; + } } - // RBBIDebugPrintf("Trie table size is %d\n", trieSize); return fTrieSize; } @@ -316,9 +327,9 @@ int32_t RBBISetBuilder::getTrieSize() { // //----------------------------------------------------------------------------------- void RBBISetBuilder::serializeTrie(uint8_t *where) { - utrie2_serialize(fTrie, - where, // Buffer - fTrieSize, // Capacity + ucptrie_toBinary(fTrie, + where, // Buffer + fTrieSize, // Capacity fStatus); } @@ -467,7 +478,7 @@ void RBBISetBuilder::printRangeGroups() { lastPrintedGroupNum = groupNum; RBBIDebugPrintf("%2i ", groupNum); - if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" ");} + if (rlRange->fNum & RuleBasedBreakIterator::kDictBit) { RBBIDebugPrintf(" ");} for (i=0; ifIncludesSets->size(); i++) { RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i); @@ -669,7 +680,7 @@ void RangeDescriptor::setDictionaryFlag() { if (varRef && varRef->fType == RBBINode::varRef) { const UnicodeString *setName = &varRef->fText; if (setName->compare(dictionary, -1) == 0) { - fNum |= RBBISetBuilder::DICT_BIT; + fNum |= RuleBasedBreakIterator::kDictBit; break; } } diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h index ed6a76b1214..cc031a2924d 100644 --- a/icu4c/source/common/rbbisetb.h +++ b/icu4c/source/common/rbbisetb.h @@ -16,9 +16,10 @@ #if !UCONFIG_NO_BREAK_ITERATION +#include "unicode/ucptrie.h" +#include "unicode/umutablecptrie.h" #include "unicode/uobject.h" #include "rbbirb.h" -#include "utrie2.h" #include "uvector.h" U_NAMESPACE_BEGIN @@ -101,8 +102,6 @@ public: */ void mergeCategories(IntPair categories); - static constexpr int32_t DICT_BIT = 0x4000; - #ifdef RBBI_DEBUG void printSets(); void printRanges(); @@ -121,8 +120,9 @@ private: RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors - UTrie2 *fTrie; // The mapping TRIE that is the end result of processing - uint32_t fTrieSize; // the Unicode Sets. + UMutableCPTrie *fMutableTrie; // The mapping TRIE that is the end result of processing + UCPTrie *fTrie; // the Unicode Sets. + uint32_t fTrieSize; // Groups correspond to character categories - // groups of ranges that are in the same original UnicodeSets. diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 960ef7ec822..ce6e637f813 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -28,6 +28,8 @@ U_NAMESPACE_BEGIN +const int32_t kMaxStateFor8BitsTable = 255; + RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) : fRB(rb), fTree(*rootNode), @@ -1335,11 +1337,18 @@ int32_t RBBITableBuilder::getTableSize() const { numRows = fDStates->size(); numCols = fRB->fSetBuilder->getNumCharCategories(); - rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols; + if (use8BitsForTable()) { + rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols; + } else { + rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols; + } size += numRows * rowSize; return size; } +bool RBBITableBuilder::use8BitsForTable() const { + return fDStates->size() <= kMaxStateFor8BitsTable; +} //----------------------------------------------------------------------------- // @@ -1364,27 +1373,44 @@ void RBBITableBuilder::exportTable(void *where) { return; } - table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount; table->fNumStates = fDStates->size(); table->fFlags = 0; + if (use8BitsForTable()) { + table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; + table->fFlags |= RBBI_8BITS_ROWS; + } else { + table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount; + } if (fRB->fLookAheadHardBreak) { table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; } if (fRB->fSetBuilder->sawBOF()) { table->fFlags |= RBBI_BOF_REQUIRED; } - table->fReserved = 0; for (state=0; statefNumStates; state++) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state); RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); - U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); - U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); - row->fAccepting = (int16_t)sd->fAccepting; - row->fLookAhead = (int16_t)sd->fLookAhead; - row->fTagIdx = (int16_t)sd->fTagsIdx; - for (col=0; colfNextState[col] = (uint16_t)sd->fDtran->elementAti(col); + if (use8BitsForTable()) { + U_ASSERT (-128 < sd->fAccepting && sd->fAccepting <= 127); + U_ASSERT (-128 < sd->fLookAhead && sd->fLookAhead <= 127); + U_ASSERT (-128 < sd->fTagsIdx && sd->fTagsIdx <= 127); + row->r8.fAccepting = (int8_t)sd->fAccepting; + row->r8.fLookAhead = (int8_t)sd->fLookAhead; + row->r8.fTagIdx = (int8_t)sd->fTagsIdx; + for (col=0; colfDtran->elementAti(col) <= kMaxStateFor8BitsTable); + row->r8.fNextState[col] = sd->fDtran->elementAti(col); + } + } else { + U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767); + U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767); + row->r16.fAccepting = (int16_t)sd->fAccepting; + row->r16.fLookAhead = (int16_t)sd->fLookAhead; + row->r16.fTagIdx = (int16_t)sd->fTagsIdx; + for (col=0; colr16.fNextState[col] = sd->fDtran->elementAti(col); + } } } } @@ -1520,11 +1546,18 @@ int32_t RBBITableBuilder::getSafeTableSize() const { numRows = fSafeTable->size(); numCols = fRB->fSetBuilder->getNumCharCategories(); - rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols; + if (use8BitsForSafeTable()) { + rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols; + } else { + rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols; + } size += numRows * rowSize; return size; } +bool RBBITableBuilder::use8BitsForSafeTable() const { + return fSafeTable->size() <= kMaxStateFor8BitsTable; +} //----------------------------------------------------------------------------- // @@ -1549,20 +1582,33 @@ void RBBITableBuilder::exportSafeTable(void *where) { return; } - table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount; table->fNumStates = fSafeTable->size(); table->fFlags = 0; - table->fReserved = 0; + if (use8BitsForSafeTable()) { + table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; + table->fFlags |= RBBI_8BITS_ROWS; + } else { + table->fRowLen = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount; + } for (state=0; statefNumStates; state++) { UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state); RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen); - row->fAccepting = 0; - row->fLookAhead = 0; - row->fTagIdx = 0; - row->fReserved = 0; - for (col=0; colfNextState[col] = rowString->charAt(col); + if (use8BitsForSafeTable()) { + row->r8.fAccepting = 0; + row->r8.fLookAhead = 0; + row->r8.fTagIdx = 0; + for (col=0; colcharAt(col) <= kMaxStateFor8BitsTable); + row->r8.fNextState[col] = rowString->charAt(col); + } + } else { + row->r16.fAccepting = 0; + row->r16.fLookAhead = 0; + row->r16.fTagIdx = 0; + for (col=0; colr16.fNextState[col] = rowString->charAt(col); + } } } } diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index c2b574fe1b8..9bd73a56f83 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -53,6 +53,9 @@ public: */ void exportTable(void *where); + /** Use 8 bits to encode the forward table */ + bool use8BitsForTable() const; + /** * Find duplicate (redundant) character classes. Begin looking with categories.first. * Duplicate, if found are returned in the categories parameter. @@ -85,6 +88,8 @@ public: */ void exportSafeTable(void *where); + /** Use 8 bits to encode the safe reverse table */ + bool use8BitsForSafeTable() const; private: void calcNullable(RBBINode *n); diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 7825f603a51..843144064ab 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -32,6 +32,8 @@ #include "unicode/parseerr.h" #include "unicode/schriter.h" +struct UCPTrie; + U_NAMESPACE_BEGIN /** @internal */ @@ -659,6 +661,28 @@ private: */ int32_t handleNext(); + /* + * Templatized version of handleNext() and handleSafePrevious(). + * + * There will be exactly four instantiations, two each for 8 and 16 bit tables, + * two each for 8 and 16 bit trie. + * Having separate instantiations for the table types keeps conditional tests of + * the table type out of the inner loops, at the expense of replicated code. + * + * The template parameter for the Trie access function is a value, not a type. + * Doing it this way, the compiler will inline the Trie function in the + * expanded functions. (Both the 8 and 16 bit access functions have the same type + * signature) + */ + + typedef uint16_t (*PTrieFunc)(const UCPTrie *, UChar32); + + template + int32_t handleSafePrevious(int32_t fromPosition); + + template + int32_t handleNext(); + /** * This function returns the appropriate LanguageBreakEngine for a @@ -682,6 +706,16 @@ private: */ void dumpTables(); + /** + * Bit for dictionary based category + */ + static constexpr int32_t kDictBit = 0x4000; + + /** + * Bit for dictionary based category in 8bits trie + */ + static constexpr int32_t kDictBitFor8BitsTrie = 0x0080; + #endif /* U_HIDE_INTERNAL_API */ }; diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp index ecc10e8ea65..fdb496e6c70 100644 --- a/icu4c/source/test/intltest/rbbiapts.cpp +++ b/icu4c/source/test/intltest/rbbiapts.cpp @@ -1030,7 +1030,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) { parseError.offset = 0; LocalUDataMemoryPointer data(udata_open(U_ICUDATA_BRKITR, "brk", dataFile, &status)); uint32_t length; - const UChar *builtSource; + const char *builtSource; const uint8_t *rbbiRules; const uint8_t *builtRules; @@ -1040,7 +1040,7 @@ void RBBIAPITest::RoundtripRule(const char *dataFile) { } builtRules = (const uint8_t *)udata_getMemory(data.getAlias()); - builtSource = (const UChar *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource); + builtSource = (const char *)(builtRules + ((RBBIDataHeader*)builtRules)->fRuleSource); LocalPointer brkItr (new RuleBasedBreakIterator(builtSource, parseError, status)); if (U_FAILURE(status)) { errln("%s:%d createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 1cac149dd8e..484b58994bc 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -128,6 +128,11 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(TestReverse); TESTCASE_AUTO(TestBug13692); TESTCASE_AUTO(TestDebugRules); + TESTCASE_AUTO(Test8BitsTrieWith8BitStateTable); + TESTCASE_AUTO(Test8BitsTrieWith16BitStateTable); + TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable); + TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable); + TESTCASE_AUTO(TestTable_8_16_Bits); #if U_ENABLE_TRACING TESTCASE_AUTO(TestTraceCreateCharacter); @@ -4621,7 +4626,7 @@ void RBBITest::TestBug12677() { RuleBasedBreakIterator bi(rules, pe, status); assertSuccess(WHERE, status); UnicodeString rtRules = bi.getRules(); - assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules); + assertEquals(WHERE, UnicodeString(u"!!forward;$x=[ab#];'#''?';"), rtRules); } @@ -4635,6 +4640,7 @@ void RBBITest::TestTableRedundancies() { RBBIDataWrapper *dw = bi->fData; const RBBIStateTable *fwtbl = dw->fForwardTable; + UBool in8Bits = fwtbl->fFlags & RBBI_8BITS_ROWS; int32_t numCharClasses = dw->fHeader->fCatCount; // printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates); @@ -4645,7 +4651,7 @@ void RBBITest::TestTableRedundancies() { UnicodeString s; for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) { RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); - s.append(row->fNextState[column]); + s.append(in8Bits ? row->r8.fNextState[column] : row->r16.fNextState[column]); } columns.push_back(s); } @@ -4665,12 +4671,22 @@ void RBBITest::TestTableRedundancies() { for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) { UnicodeString s; RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r)); - assertTrue(WHERE, row->fAccepting >= -1); - s.append(row->fAccepting + 1); // values of -1 are expected. - s.append(row->fLookAhead); - s.append(row->fTagIdx); - for (int32_t column = 0; column < numCharClasses; column++) { - s.append(row->fNextState[column]); + if (in8Bits) { + assertTrue(WHERE, row->r8.fAccepting >= -1); + s.append(row->r8.fAccepting + 1); // values of -1 are expected. + s.append(row->r8.fLookAhead); + s.append(row->r8.fTagIdx); + for (int32_t column = 0; column < numCharClasses; column++) { + s.append(row->r8.fNextState[column]); + } + } else { + assertTrue(WHERE, row->r16.fAccepting >= -1); + s.append(row->r16.fAccepting + 1); // values of -1 are expected. + s.append(row->r16.fLookAhead); + s.append(row->r16.fTagIdx); + for (int32_t column = 0; column < numCharClasses; column++) { + s.append(row->r16.fNextState[column]); + } } rows.push_back(s); } @@ -4743,12 +4759,14 @@ void RBBITest::TestReverse(std::unique_ptrbi) { RBBIDataWrapper *data = bi->fData; int32_t categoryCount = data->fHeader->fCatCount; - UTrie2 *trie = data->fTrie; + UCPTrie *trie = data->fTrie; + bool use8BitsTrie = ucptrie_getValueWidth(trie) == UCPTRIE_VALUE_BITS_8; + uint32_t dictBit = use8BitsTrie ? 0x0080 : 0x4000; std::vector strings(categoryCount, UnicodeString()); for (int cp=0; cp<0x1fff0; ++cp) { - int cat = utrie2_get32(trie, cp); - cat &= ~0x4000; // And off the dictionary bit from the category. + int cat = ucptrie_get(trie, cp); + cat &= ~dictBit; // And off the dictionary bit from the category. assertTrue(WHERE, cat < categoryCount && cat >= 0); if (cat < 0 || cat >= categoryCount) return; strings[cat].append(cp); @@ -4886,6 +4904,182 @@ void RBBITest::TestDebugRules() { #endif } +void RBBITest::testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits) { + UCPTrieValueWidth expectedTrieWidth = expectedTrieWidthIn8Bits ? UCPTRIE_VALUE_BITS_8 : UCPTRIE_VALUE_BITS_16; + int32_t expectedStateRowBits = expectedStateRowIn8Bits ? RBBI_8BITS_ROWS : 0; + // Text are duplicate characters from U+4E00 to U+4FFF + UnicodeString text; + for (UChar c = 0x4e00; c < 0x5000; c++) { + text.append(c).append(c); + } + // Generate rule which will caused length+4 character classes and + // length+3 states + UnicodeString rules(u"!!quoted_literals_only;"); + for (UChar c = 0x4e00; c < 0x4e00 + numChar; c++) { + rules.append(u'\'').append(c).append(c).append(u"';"); + } + rules.append(u".;"); + UErrorCode status = U_ZERO_ERROR; + UParseError parseError; + RuleBasedBreakIterator bi(rules, parseError, status); + + assertEquals(WHERE, numChar + 4, bi.fData->fHeader->fCatCount); + assertEquals(WHERE, numChar + 3, bi.fData->fForwardTable->fNumStates); + assertEquals(WHERE, expectedTrieWidth, ucptrie_getValueWidth(bi.fData->fTrie)); + assertEquals(WHERE, expectedStateRowBits, bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS); + assertEquals(WHERE, expectedStateRowBits, bi.fData->fReverseTable->fFlags & RBBI_8BITS_ROWS); + + bi.setText(text); + + int32_t pos; + int32_t i = 0; + while ((pos = bi.next()) > 0) { + // The first numChar should not break between the pair + if (i++ < numChar) { + assertEquals(WHERE, i * 2, pos); + } else { + // After the first numChar next(), break on each character. + assertEquals(WHERE, i + numChar, pos); + } + } + while ((pos = bi.previous()) > 0) { + // The first numChar should not break between the pair + if (--i < numChar) { + assertEquals(WHERE, i * 2, pos); + } else { + // After the first numChar next(), break on each character. + assertEquals(WHERE, i + numChar, pos); + } + } +} + +void RBBITest::Test8BitsTrieWith8BitStateTable() { + testTrieStateTable(123, true /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */); +} + +void RBBITest::Test16BitsTrieWith8BitStateTable() { + testTrieStateTable(124, false /* expectedTrieWidthIn8Bits */, true /* expectedStateRowIn8Bits */); +} + +void RBBITest::Test16BitsTrieWith16BitStateTable() { + testTrieStateTable(255, false /* expectedTrieWidthIn8Bits */, false /* expectedStateRowIn8Bits */); +} + +void RBBITest::Test8BitsTrieWith16BitStateTable() { + // Test UCPTRIE_VALUE_BITS_8 with 16 bits rows. Use a different approach to + // create state table in 16 bits. + + // Generate 510 'a' as text + UnicodeString text; + for (int32_t i = 0; i < 510; i++) { + text.append(u'a'); + } + + UnicodeString rules(u"!!quoted_literals_only;'"); + // 254 'a' in the rule will cause 256 states + for (int32_t i = 0; i < 254; i++) { + rules.append(u'a'); + } + rules.append(u"';.;"); + + UErrorCode status = U_ZERO_ERROR; + UParseError parseError; + LocalPointer bi(new RuleBasedBreakIterator(rules, parseError, status)); + + assertEquals(WHERE, 256, bi->fData->fForwardTable->fNumStates); + assertEquals(WHERE, UCPTRIE_VALUE_BITS_8, ucptrie_getValueWidth(bi->fData->fTrie)); + assertEquals(WHERE, + false, RBBI_8BITS_ROWS == (bi->fData->fForwardTable->fFlags & RBBI_8BITS_ROWS)); + bi->setText(text); + + // break positions: + // 254, 508, 509, ... 510 + assertEquals("next()", 254, bi->next()); + int32_t i = 0; + int32_t pos; + while ((pos = bi->next()) > 0) { + assertEquals(WHERE, 508 + i , pos); + i++; + } + i = 0; + while ((pos = bi->previous()) > 0) { + i++; + if (pos >= 508) { + assertEquals(WHERE, 510 - i , pos); + } else { + assertEquals(WHERE, 254 , pos); + } + } +} + +// Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and +// that there are no problems with rules at the size that transitions between the two. +// +// A rule that matches a literal string, like 'abcdefghij', will require one state and +// one character class per character in the string. So we can make a rule to tickle the +// boundaries by using literal strings of various lengths. +// +// For both the number of states and the number of character classes, the eight bit format +// only has 7 bits available, allowing for 128 values. For both, a few values are reserved, +// leaving 120 something available. This test runs the string over the range of 120 - 130, +// which allows some margin for changes to the number of values reserved by the rule builder +// without breaking the test. + +void RBBITest::TestTable_8_16_Bits() { + + // testStr serves as both the source of the rule string (truncated to the desired length) + // and as test data to check matching behavior. A break rule consisting of the first 120 + // characters of testStr will match the first 120 chars of the full-length testStr. + UnicodeString testStr; + for (UChar c=0x3000; c<0x3200; ++c) { + testStr.append(c); + } + + const int32_t startLength = 120; // The shortest rule string to test. + const int32_t endLength = 260; // The longest rule string to test + const int32_t increment = this->quick ? endLength - startLength : 1; + + for (int32_t ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) { + UParseError parseError; + UErrorCode status = U_ZERO_ERROR; + + UnicodeString ruleString{u"!!quoted_literals_only; '#';"}; + ruleString.findAndReplace(UnicodeString(u"#"), UnicodeString(testStr, 0, ruleLen)); + RuleBasedBreakIterator bi(ruleString, parseError, status); + if (!assertSuccess(WHERE, status)) { + errln(ruleString); + break; + } + // bi.dumpTables(); + + // Verify that the break iterator is functioning - that the first boundary found + // in testStr is at the length of the rule string. + bi.setText(testStr); + assertEquals(WHERE, ruleLen, bi.next()); + + // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache + // of previously detected boundaries, thus forcing the engine to run the safe reverse rules. + bi.setText(testStr); + int32_t result = bi.preceding(ruleLen); + assertEquals(WHERE, 0, result); + + // Verify that the range of rule lengths being tested cover the transations + // from 8 to 16 bit data. + bool has8BitRowData = bi.fData->fForwardTable->fFlags & RBBI_8BITS_ROWS; + bool has8BitsTrie = ucptrie_getValueWidth(bi.fData->fTrie) == UCPTRIE_VALUE_BITS_8; + + if (ruleLen == startLength) { + assertEquals(WHERE, true, has8BitRowData); + assertEquals(WHERE, true, has8BitsTrie); + } + if (ruleLen == endLength) { + assertEquals(WHERE, false, has8BitRowData); + assertEquals(WHERE, false, has8BitsTrie); + } + } +} + + #if U_ENABLE_TRACING static std::vector gData; static std::vector gEntryFn; diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 8f667e5e74d..59adf767723 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -86,6 +86,11 @@ public: void TestDebug(); void TestProperties(); + void Test8BitsTrieWith8BitStateTable(); + void Test8BitsTrieWith16BitStateTable(); + void Test16BitsTrieWith8BitStateTable(); + void Test16BitsTrieWith16BitStateTable(); + void TestTable_8_16_Bits(); #if U_ENABLE_TRACING void TestTraceCreateCharacter(); @@ -133,6 +138,9 @@ private: // Test parameters, from the test framework and test invocation. const char* fTestParams; + // Helper functions to test different trie bit sizes and state table bit sizes. + void testTrieStateTable(int32_t numChar, bool expectedTrieWidthIn8Bits, bool expectedStateRowIn8Bits); + #if U_ENABLE_TRACING void assertTestTraceResult(int32_t fnNumber, const char* expectedData); #endif diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java index 9dc032d8cd2..4e66170f066 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java @@ -12,10 +12,12 @@ package com.ibm.icu.impl; import java.io.DataOutputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import com.ibm.icu.impl.ICUBinary.Authenticate; import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.util.CodePointTrie; /** *

Internal class used for Rule Based Break Iterators.

@@ -43,14 +45,10 @@ public final class RBBIDataWrapper { * Option Flags for this state table. */ public int fFlags; - /** - * Option Flags for this state table. - */ - public int fReserved; /** * Linear array of next state values, accessed as short[state, char_class] */ - public short[] fTable; + public char[] fTable; public RBBIStateTable() { } @@ -59,16 +57,29 @@ public final class RBBIDataWrapper { if (length == 0) { return null; } - if (length < 16) { + if (length < 12) { throw new IOException("Invalid RBBI state table length."); } RBBIStateTable This = new RBBIStateTable(); This.fNumStates = bytes.getInt(); This.fRowLen = bytes.getInt(); This.fFlags = bytes.getInt(); - This.fReserved = bytes.getInt(); - int lengthOfShorts = length - 16; // length in bytes. - This.fTable = ICUBinary.getShorts(bytes, lengthOfShorts / 2, lengthOfShorts & 1); + int lengthOfTable = length - 12; // length in bytes. + boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS; + if (use8Bits) { + This.fTable = new char[lengthOfTable]; + for (int i = 0; i < lengthOfTable; i++) { + byte b = bytes.get(); + if (i % This.fRowLen < NEXTSTATES) { + This.fTable[i] = (char) b; // Treat b as signed. + } else { + This.fTable[i] = (char)(0xff & b); // Treat b as unsigned. + } + } + ICUBinary.skipBytes(bytes, lengthOfTable & 1); + } else { + This.fTable = ICUBinary.getChars(bytes, lengthOfTable / 2, lengthOfTable & 1); + } return This; } @@ -76,13 +87,20 @@ public final class RBBIDataWrapper { bytes.writeInt(fNumStates); bytes.writeInt(fRowLen); bytes.writeInt(fFlags); - bytes.writeInt(fReserved); - int tableLen = fRowLen * fNumStates / 2; // fRowLen is bytes. - for (int i = 0; i < tableLen; i++) { - bytes.writeShort(fTable[i]); + if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) { + int tableLen = fRowLen * fNumStates; // fRowLen is bytes. + for (int i = 0; i < tableLen; i++) { + byte b = (byte)(fTable[i] & 0x00ff); + bytes.writeByte(b); + } + } else { + int tableLen = fRowLen * fNumStates / 2; // fRowLen is bytes. + for (int i = 0; i < tableLen; i++) { + bytes.writeChar(fTable[i]); + } } - int bytesWritten = 16 + fRowLen * fNumStates; // total bytes written, - // including 16 for the header. + int bytesWritten = 12 + fRowLen * fNumStates; // total bytes written, + // including 12 for the header. while (bytesWritten % 8 != 0) { bytes.writeByte(0); ++bytesWritten; @@ -105,7 +123,6 @@ public final class RBBIDataWrapper { if (fNumStates != otherST.fNumStates) return false; if (fRowLen != otherST.fRowLen) return false; if (fFlags != otherST.fFlags) return false; - if (fReserved != otherST.fReserved) return false; return Arrays.equals(fTable, otherST.fTable); } } @@ -134,12 +151,12 @@ public final class RBBIDataWrapper { public RBBIStateTable fRTable; - public Trie2 fTrie; + public CodePointTrie fTrie; public String fRuleSource; public int fStatusTable[]; public static final int DATA_FORMAT = 0x42726b20; // "Brk " - public static final int FORMAT_VERSION = 0x05000000; // 4.0.0.0 + public static final int FORMAT_VERSION = 0x06000000; // 6.0.0.0 private static final class IsAcceptable implements Authenticate { @Override @@ -186,20 +203,20 @@ public final class RBBIDataWrapper { * offset to the "tagIndex" field in a state table row. */ public final static int TAGIDX = 2; - /** - * offset to the reserved field in a state table row. - */ - public final static int RESERVED = 3; /** * offset to the start of the next states array in a state table row. */ - public final static int NEXTSTATES = 4; + public final static int NEXTSTATES = 3; // Bit selectors for the "FLAGS" field of the state table header // enum RBBIStateTableFlags in the C version. // public final static int RBBI_LOOKAHEAD_HARD_BREAK = 1; public final static int RBBI_BOF_REQUIRED = 2; + public final static int RBBI_8BITS_ROWS = 4; + + public final static int DICT_BIT = 0x4000; + public final static int DICT_BIT_FOR_8BITS_TRIE = 0x0080; /** * Data Header. A struct-like class with the fields from the RBBI data file header. @@ -243,7 +260,7 @@ public final class RBBIDataWrapper { * array index of the start of the state table row for that state. */ public int getRowIndex(int state){ - return state * (fHeader.fCatCount + 4); + return state * (fHeader.fCatCount + NEXTSTATES); } RBBIDataWrapper() { @@ -330,7 +347,10 @@ public final class RBBIDataWrapper { // as we don't go more than 100 bytes past the // past the end of the TRIE. - This.fTrie = Trie2.createFromSerialized(bytes); // Deserialize the TRIE, leaving buffer + This.fTrie = CodePointTrie.fromBinary( + CodePointTrie.Type.FAST, + null, + bytes); // Deserialize the TRIE, leaving buffer // at an unknown position, preceding the // padding between TRIE and following section. @@ -359,8 +379,8 @@ public final class RBBIDataWrapper { } ICUBinary.skipBytes(bytes, This.fHeader.fRuleSource - pos); pos = This.fHeader.fRuleSource; - This.fRuleSource = ICUBinary.getString( - bytes, This.fHeader.fRuleSourceLen / 2, This.fHeader.fRuleSourceLen & 1); + This.fRuleSource = new String( + ICUBinary.getBytes(bytes, This.fHeader.fRuleSourceLen, 0), StandardCharsets.UTF_8); if (RuleBasedBreakIterator.fDebugEnv!=null && RuleBasedBreakIterator.fDebugEnv.indexOf("data")>=0) { This.dump(System.out); @@ -396,6 +416,15 @@ public final class RBBIDataWrapper { return dest.toString(); } + static public String charToString(char n, int width) { + StringBuilder dest = new StringBuilder(width); + dest.append(n); + while (dest.length() < width) { + dest.insert(0, ' '); + } + return dest.toString(); + } + /** Fixed width int-to-string conversion. */ static public String intToHexString(int n, int width) { StringBuilder dest = new StringBuilder(width); @@ -408,11 +437,11 @@ public final class RBBIDataWrapper { /** Dump a state table. (A full set of RBBI rules has 4 state tables.) */ private void dumpTable(java.io.PrintStream out, RBBIStateTable table) { - if (table == null || table.fTable.length == 0) { + if (table == null || (table.fTable.length == 0)) { out.println(" -- null -- "); } else { - int n; - int state; + char n; + char state; StringBuilder header = new StringBuilder(" Row Acc Look Tag"); for (n=0; n fHeader.fCatCount) { out.println("Error, bad category " + Integer.toHexString(category) + " for char " + Integer.toHexString(char32)); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java index 293856471f4..dfe3d2adddd 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleBuilder.java @@ -11,6 +11,7 @@ package com.ibm.icu.text; import java.io.DataOutputStream; import java.io.IOException; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -182,7 +183,9 @@ class RBBIRuleBuilder { int reverseTableSize = align8(fForwardTable.getSafeTableSize()); int trieSize = align8(fSetBuilder.getTrieSize()); int statusTableSize = align8(fRuleStatusVals.size() * 4); - int rulesSize = align8((strippedRules.length()) * 2); + + byte[] strippedRulesUTF8 = strippedRules.getBytes(StandardCharsets.UTF_8); + int rulesSize = align8(strippedRulesUTF8.length + 1); int totalSize = headerSize + forwardTableSize @@ -202,7 +205,7 @@ class RBBIRuleBuilder { header[RBBIDataWrapper.DH_MAGIC] = 0xb1a0; header[RBBIDataWrapper.DH_FORMATVERSION] = RBBIDataWrapper.FORMAT_VERSION; header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections. - header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount. + header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen @@ -214,11 +217,11 @@ class RBBIRuleBuilder { + header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE] - + header[RBBIDataWrapper.DH_TRIELEN]; + + trieSize; header[RBBIDataWrapper.DH_STATUSTABLELEN] = statusTableSize; // fStatusTableLen header[RBBIDataWrapper.DH_RULESOURCE] = header[RBBIDataWrapper.DH_STATUSTABLE] + statusTableSize; - header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRules.length() * 2; + header[RBBIDataWrapper.DH_RULESOURCELEN] = strippedRulesUTF8.length; for (i = 0; i < header.length; i++) { dos.writeInt(header[i]); outputPos += 4; @@ -257,8 +260,9 @@ class RBBIRuleBuilder { // Write out the stripped rules (rules with extra spaces removed // These go last in the data area, even though they are not last in the header. Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]); - dos.writeChars(strippedRules); - outputPos += strippedRules.length() * 2; + dos.write(strippedRulesUTF8, 0, strippedRulesUTF8.length); + dos.write(0); // Null termination + outputPos += strippedRulesUTF8.length + 1; while (outputPos % 8 != 0) { // pad to an 8 byte boundary dos.write(0); outputPos += 1; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java index 3bcc30d7100..9045e1f3912 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBIRuleScanner.java @@ -697,16 +697,14 @@ class RBBIRuleScanner { static String stripRules(String rules) { StringBuilder strippedRules = new StringBuilder(); int rulesLength = rules.length(); - boolean skippingSpaces = false; for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) { int cp = rules.codePointAt(idx); boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE); - if (skippingSpaces && whiteSpace) { + if (whiteSpace) { continue; } strippedRules.appendCodePoint(cp); - skippingSpaces = whiteSpace; } return strippedRules.toString(); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java index 17b0b10fc85..e7189a58c80 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBISetBuilder.java @@ -8,15 +8,16 @@ */ package com.ibm.icu.text; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import com.ibm.icu.impl.Assert; -import com.ibm.icu.impl.Trie2Writable; -import com.ibm.icu.impl.Trie2_16; import com.ibm.icu.text.RBBIRuleBuilder.IntPair; +import com.ibm.icu.util.CodePointTrie; +import com.ibm.icu.util.MutableCodePointTrie; // // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules @@ -125,9 +126,9 @@ class RBBISetBuilder { RBBIRuleBuilder fRB; // The RBBI Rule Compiler that owns us. RangeDescriptor fRangeList; // Head of the linked list of RangeDescriptors - Trie2Writable fTrie; // The mapping TRIE that is the end result of processing + MutableCodePointTrie fTrie; // The mapping TRIE that is the end result of processing // the Unicode Sets. - Trie2_16 fFrozenTrie; + CodePointTrie fFrozenTrie; // Groups correspond to character categories - // groups of ranges that are in the same original UnicodeSets. @@ -140,6 +141,7 @@ class RBBISetBuilder { boolean fSawBOF; static final int DICT_BIT = 0x4000; + static final int DICT_BIT_FOR_8BITS_TRIE = 0x0080; //------------------------------------------------------------------------ @@ -286,22 +288,30 @@ class RBBISetBuilder { } + private static final int MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE = 127; + /** * Build the Trie table for mapping UChar32 values to the corresponding * range group number. */ void buildTrie() { + boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE; RangeDescriptor rlRange; - fTrie = new Trie2Writable(0, // Initial value for all code points. - 0); // Error value for out-of-range input. + fTrie = new MutableCodePointTrie(0, // Initial value for all code points. + 0); // Error value for out-of-range input. for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) { + int value = rlRange.fNum; + if (use8Bits && ((value & DICT_BIT) != 0)) { + assert((value & DICT_BIT_FOR_8BITS_TRIE) == 0); + // switch to the bit from DICT_BIT to DICT_BIT_FOR_8BITS_TRIE + value = DICT_BIT_FOR_8BITS_TRIE | (value & ~DICT_BIT); + } fTrie.setRange( rlRange.fStartChar, // Range start rlRange.fEndChar, // Range end (inclusive) - rlRange.fNum, // value for range - true // Overwrite previously written values + value // value for range ); } } @@ -326,17 +336,31 @@ class RBBISetBuilder { --fGroupCount; } + //----------------------------------------------------------------------------------- + // + // freezeTrieIfNotYet() Ensure the trie is frozen. Shared code by getTrieSize + // and serializeTrie. + // + //----------------------------------------------------------------------------------- + void freezeTrieIfNotYet() { + if (fFrozenTrie == null) { + boolean use8Bits = getNumCharCategories() <= MAX_CHAR_CATEGORIES_FOR_8BITS_TRIE; + fFrozenTrie = fTrie.buildImmutable(CodePointTrie.Type.FAST, + use8Bits ? + CodePointTrie.ValueWidth.BITS_8 : + CodePointTrie.ValueWidth.BITS_16); + fTrie = null; + } + } + //----------------------------------------------------------------------------------- // // getTrieSize() Return the size that will be required to serialize the Trie. // //----------------------------------------------------------------------------------- int getTrieSize() { - if (fFrozenTrie == null) { - fFrozenTrie = fTrie.toTrie2_16(); - fTrie = null; - } - return fFrozenTrie.getSerializedLength(); + freezeTrieIfNotYet(); + return fFrozenTrie.toBinary(new ByteArrayOutputStream()); } @@ -346,11 +370,8 @@ class RBBISetBuilder { // //----------------------------------------------------------------------------------- void serializeTrie(OutputStream os) throws IOException { - if (fFrozenTrie == null) { - fFrozenTrie = fTrie.toTrie2_16(); - fTrie = null; - } - fFrozenTrie.serialize(os); + freezeTrieIfNotYet(); + fFrozenTrie.toBinary(os); } //------------------------------------------------------------------------ diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java index 4e1a1b7f6b3..8d9fdeed3ac 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java @@ -74,6 +74,8 @@ class RBBITableBuilder { /** Synthesized safe table, a List of row arrays. */ private List fSafeTable; + private static final int MAX_STATE_FOR_8BITS_TABLE = 255; + /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ int[] fLookAheadRuleMap; @@ -1097,10 +1099,11 @@ class RBBITableBuilder { if (fRB.fTreeRoots[fRootIx] == null) { return 0; } - int size = 16; // The header of 4 ints, with no rows to the table. + int size = 12; // The header of 4 ints, with no rows to the table. int numRows = fDStates.size(); int numCols = fRB.fSetBuilder.getNumCharCategories(); - int rowSize = 8 + 2*numCols; + boolean use8Bits = numRows <= MAX_STATE_FOR_8BITS_TABLE; + int rowSize = (use8Bits ? 1 : 2 ) * (RBBIDataWrapper.NEXTSTATES + numCols); size += numRows * rowSize; size = (size + 7) & ~7; // round up to a multiple of 8 bytes return size; @@ -1125,13 +1128,20 @@ class RBBITableBuilder { Assert.assrt(fRB.fSetBuilder.getNumCharCategories() < 0x7fff && fDStates.size() < 0x7fff); table.fNumStates = fDStates.size(); + boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE; // Size of table size in shorts. - // the "4" is the size of struct RBBIStateTableRow, the row header part only. - int rowLen = 4 + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts. - int tableSize = (getTableSize() - 16) / 2; // fTable length in shorts. - table.fTable = new short[tableSize]; - table.fRowLen = rowLen * 2; // Row length in bytes. + int rowLen = RBBIDataWrapper.NEXTSTATES + fRB.fSetBuilder.getNumCharCategories(); // Row Length in shorts. + int tableSize; + if (use8Bits) { + tableSize = (getTableSize() - 12); // fTable length in bytes. + table.fTable = new char[tableSize]; + table.fRowLen = rowLen; // Row length in bytes. + } else { + tableSize = (getTableSize() - 12) / 2; // fTable length in shorts. + table.fTable = new char[tableSize]; + table.fRowLen = rowLen * 2; // Row length in bytes. + } if (fRB.fLookAheadHardBreak) { table.fFlags |= RBBIDataWrapper.RBBI_LOOKAHEAD_HARD_BREAK; @@ -1139,18 +1149,29 @@ class RBBITableBuilder { if (fRB.fSetBuilder.sawBOF()) { table.fFlags |= RBBIDataWrapper.RBBI_BOF_REQUIRED; } + if (use8Bits) { + table.fFlags |= RBBIDataWrapper.RBBI_8BITS_ROWS; + } int numCharCategories = fRB.fSetBuilder.getNumCharCategories(); for (state=0; state= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { @@ -927,7 +928,7 @@ public class RuleBasedBreakIterator extends BreakIterator { } int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING]; - if (completedRule > 0) { + if (completedRule > 0 && completedRule != 0xffff) { // Lookahead match is completed int lookaheadResult = fLookAheadMatches.getPosition(completedRule); if (lookaheadResult >= 0) { @@ -937,13 +938,14 @@ public class RuleBasedBreakIterator extends BreakIterator { } } + // If we are at the position of the '/' in a look-ahead (hard break) rule; // record the current position, to be returned later, if the full rule matches. // TODO: Move this check before the previous check of fAccepting. // This would enable hard-break rules with no following context. // But there are line break test failures when trying this. Investigate. // Issue ICU-20837 - int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; + int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD]; if (rule != 0) { int pos = text.getIndex(); if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) { @@ -996,14 +998,17 @@ public class RuleBasedBreakIterator extends BreakIterator { * @internal */ private int handleSafePrevious(int fromPosition) { - int state; + char state; short category = 0; int result = 0; // caches for quicker access CharacterIterator text = fText; - Trie2 trie = fRData.fTrie; - short[] stateTable = fRData.fRTable.fTable; + CodePointTrie trie = fRData.fTrie; + char[] stateTable = fRData.fRTable.fTable; + int flagsState = fRData.fRTable.fFlags; + int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? + RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT; CISetIndex32(text, fromPosition); if (TRACE) { @@ -1029,7 +1034,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // // And off the dictionary flag bit. For reverse iteration it is not used. category = (short) trie.get(c); - category &= ~0x4000; + category &= ~dictMask; if (TRACE) { System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5)); System.out.print(RBBIDataWrapper.intToHexString(c, 10)); @@ -1209,6 +1214,8 @@ public class RuleBasedBreakIterator extends BreakIterator { int category; int current; int foundBreakCount = 0; + int dictMask = fRData.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8 ? + RBBIDataWrapper.DICT_BIT_FOR_8BITS_TRIE : RBBIDataWrapper.DICT_BIT; // Loop through the text, looking for ranges of dictionary characters. // For each span, find the appropriate break engine, and ask it to find @@ -1219,7 +1226,7 @@ public class RuleBasedBreakIterator extends BreakIterator { category = (short)fRData.fTrie.get(c); while(true) { - while((current = fText.getIndex()) < rangeEnd && (category & 0x4000) == 0) { + while((current = fText.getIndex()) < rangeEnd && (category & dictMask) == 0) { c = CharacterIteration.next32(fText); // pre-increment category = (short)fRData.fTrie.get(c); } diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 50fcaa746d3..e126ae2a1d2 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f8b27a3e77ffbe4468ede22ae8a4ca85df993472d6bdea432505a148ff33f23 -size 13149606 +oid sha256:8ed7db50765b06c8a35f48048543c5c9a2c2e19993f752bd71a15e6ac89aa3b3 +size 13141781 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index af7908c2900..8fd2b94f214 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58c1ec5386cba3b6660c3bf8c22ce74c343d3101354f157533d13ec1099e1379 -size 94524 +oid sha256:6d2882ccb44134313ff0365eb24776d4e859fa9dd223f10d608d65fdfd7f23d9 +size 94529 diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar index 6ca6a1b4804..e4207f027e0 100644 --- a/icu4j/main/shared/data/testdata.jar +++ b/icu4j/main/shared/data/testdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c39bb717c3e95c47c14f49507e8a7866e89bdb3588f021693c3909dda64f4dcb -size 723466 +oid sha256:e032f823e0ba2fd99f784fe400675049c126e091158a285955c71aa5e2c6036b +size 723481 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index 353d5151509..f790db886ae 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -29,8 +29,10 @@ import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.RBBIDataWrapper; import com.ibm.icu.text.BreakIterator; import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.util.CodePointTrie; import com.ibm.icu.util.ULocale; + @RunWith(JUnit4.class) public class RBBITest extends TestFmwk { public RBBITest() { @@ -562,7 +564,7 @@ public class RBBITest extends TestFmwk { RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rules); String rtRules = bi.toString(); // getRules() in C++ - assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules); + assertEquals("Break Iterator rule stripping test", "!!forward;$x=[ab#];'#''?';", rtRules); } @Test @@ -582,7 +584,7 @@ public class RBBITest extends TestFmwk { StringBuilder s = new StringBuilder(); for (int r = 1; r < fwtbl.fNumStates; r++) { int row = dw.getRowIndex(r); - short tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column]; + char tableVal = fwtbl.fTable[row + RBBIDataWrapper.NEXTSTATES + column]; s.append((char)tableVal); } columns.add(s.toString()); @@ -602,13 +604,12 @@ public class RBBITest extends TestFmwk { for (int r=0; r= -1); s.append(fwtbl.fTable[row + RBBIDataWrapper.ACCEPTING]); s.append(fwtbl.fTable[row + RBBIDataWrapper.LOOKAHEAD]); s.append(fwtbl.fTable[row + RBBIDataWrapper.TAGIDX]); for (int column=0; column 0) { + assertEquals("next()", 508 + i , pos); + i++; + } + i = 0; + while ((pos = bi.previous()) > 0) { + i++; + if (pos >= 508) { + assertEquals("previous()", 510 - i , pos); + } else { + assertEquals("previous()", 254 , pos); + } + } + } + + /** + * Test that both compact (8 bit) and full sized (16 bit) rbbi tables work, and + * that there are no problems with rules at the size that transitions between the two. + * + * A rule that matches a literal string, like 'abcdefghij', will require one state and + * one character class per character in the string. So we can make a rule to tickle the + * boundaries by using literal strings of various lengths. + * + * For both the number of states and the number of character classes, the eight bit format + * only has 7 bits available, allowing for 128 values. For both, a few values are reserved, + * leaving 120 something available. This test runs the string over the range of 120 - 130, + * which allows some margin for changes to the number of values reserved by the rule builder + * without breaking the test. + */ + @Test + public void TestTable_8_16_Bits() { + // testStr serves as both the source of the rule string (truncated to the desired length) + // and as test data to check matching behavior. A break rule consisting of the first 120 + // characters of testStr will match the first 120 chars of the full-length testStr. + StringBuilder builder = new StringBuilder(0x200); + for (char c=0x3000; c<0x3200; ++c) { + builder.append(c); + } + String testStr = builder.toString(); + + int startLength = 120; // The shortest rule string to test. + int endLength = 260; // The longest rule string to test + int increment = 1; + for (int ruleLen=startLength; ruleLen <= endLength; ruleLen += increment) { + String ruleString = (new String("!!quoted_literals_only; '#';")) + .replace("#", testStr.substring(0, ruleLen)); + RuleBasedBreakIterator bi = new RuleBasedBreakIterator(ruleString); + + // Verify that the break iterator is functioning - that the first boundary found + // in testStr is at the length of the rule string. + bi.setText(testStr); + assertEquals("The first boundary found in testStr should be at the length of the rule string", + ruleLen, bi.next()); + + // Reverse iteration. Do a setText() first, to flush the break iterator's internal cache + // of previously detected boundaries, thus forcing the engine to run the safe reverse rules. + bi.setText(testStr); + int result = bi.preceding(ruleLen); + assertEquals("Reverse iteration should find the boundary at 0", 0, result); + + // Verify that the range of rule lengths being tested cover the transations + // from 8 to 16 bit data. + RBBIDataWrapper dw = bi.fRData; + RBBIDataWrapper.RBBIStateTable fwtbl = dw.fFTable; + + boolean has8BitRowData = (fwtbl.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) != 0; + boolean has8BitsTrie = dw.fTrie.getValueWidth() == CodePointTrie.ValueWidth.BITS_8; + if (ruleLen == startLength) { + assertTrue("State table should be in 8 bits", has8BitRowData); + assertTrue("Trie should be in 8 bits", has8BitsTrie); + } + if (ruleLen == endLength) { + assertFalse("State table should be in 16 bits", has8BitRowData); + assertFalse("Trie should be in 16 bits", has8BitsTrie); + } + } + } }