diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 369e400630d..9b7e70c3cf4 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -68,10 +68,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode init(status); fData = new RBBIDataWrapper(data, status); // status checked in constructor if (U_FAILURE(status)) {return;} - if(fData == 0) { + if(fData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } + if (fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + if (fLookAheadMatches == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + } } // @@ -98,10 +106,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, } fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); if (U_FAILURE(status)) {return;} - if(fData == 0) { + if(fData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } + if (fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + if (fLookAheadMatches == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + } } @@ -117,10 +133,18 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &sta init(status); fData = new RBBIDataWrapper(udm, status); // status checked in constructor if (U_FAILURE(status)) {return;} - if(fData == 0) { + if(fData == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } + if (fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + if (fLookAheadMatches == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + } } @@ -188,30 +212,34 @@ RuleBasedBreakIterator::~RuleBasedBreakIterator() { // fCharIter was adopted from the outside. delete fCharIter; } - fCharIter = NULL; + fCharIter = nullptr; utext_close(&fText); - if (fData != NULL) { + if (fData != nullptr) { fData->removeReference(); - fData = NULL; + fData = nullptr; } delete fBreakCache; - fBreakCache = NULL; + fBreakCache = nullptr; delete fDictionaryCache; - fDictionaryCache = NULL; + fDictionaryCache = nullptr; delete fLanguageBreakEngines; - fLanguageBreakEngines = NULL; + fLanguageBreakEngines = nullptr; delete fUnhandledBreakEngine; - fUnhandledBreakEngine = NULL; + fUnhandledBreakEngine = nullptr; + + uprv_free(fLookAheadMatches); + fLookAheadMatches = nullptr; } /** * Assignment operator. Sets this iterator to have the same behavior, * and iterate over the same text, as the one passed in. + * TODO: needs better handling of memory allocation errors. */ RuleBasedBreakIterator& RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { @@ -252,6 +280,14 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { fData = that.fData->addReference(); } + uprv_free(fLookAheadMatches); + fLookAheadMatches = nullptr; + if (fData && fData->fForwardTable->fLookAheadResultsSize > 0) { + fLookAheadMatches = static_cast( + uprv_malloc(fData->fForwardTable->fLookAheadResultsSize * sizeof(int32_t))); + } + + fPosition = that.fPosition; fRuleStatusIndex = that.fRuleStatusIndex; fDone = that.fDone; @@ -275,16 +311,17 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { // //----------------------------------------------------------------------------- void RuleBasedBreakIterator::init(UErrorCode &status) { - fCharIter = NULL; - fData = NULL; + fCharIter = nullptr; + fData = nullptr; fPosition = 0; fRuleStatusIndex = 0; fDone = false; fDictionaryCharCount = 0; - fLanguageBreakEngines = NULL; - fUnhandledBreakEngine = NULL; - fBreakCache = NULL; - fDictionaryCache = NULL; + fLanguageBreakEngines = nullptr; + fUnhandledBreakEngine = nullptr; + fBreakCache = nullptr; + fDictionaryCache = nullptr; + fLookAheadMatches = nullptr; // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. // fText = UTEXT_INITIALIZER; @@ -700,52 +737,6 @@ enum RBBIRunMode { }; -// Map from look-ahead break states (corresponds to rules) to boundary positions. -// Allows multiple lookahead break rules to be in flight at the same time. -// -// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers -// in the state table be sequential, then we can just index an array. And the -// table could also tell us in advance how big that array needs to be. -// -// Before ICU 57 there was just a single simple variable for a look-ahead match that -// was in progress. Two rules at once did not work. - -static const int32_t kMaxLookaheads = 8; -struct LookAheadResults { - int32_t fUsedSlotLimit; - int32_t fPositions[8]; - uint16_t fKeys[8]; - - LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {} - - int32_t getPosition(uint16_t key) { - for (int32_t i=0; i= kMaxLookaheads) { - UPRV_UNREACHABLE; - } - fKeys[i] = key; - fPositions[i] = position; - U_ASSERT(fUsedSlotLimit == i); - fUsedSlotLimit = i + 1; - } -}; - - // Wrapper functions to select the appropriate handleNext() or handleSafePrevious() // instantiation, based on whether an 8 or 16 bit table is required. // @@ -809,7 +800,6 @@ int32_t RuleBasedBreakIterator::handleNext() { RowType *row; UChar32 c; - LookAheadResults lookAheadMatches; int32_t result = 0; int32_t initialPosition = 0; const RBBIStateTable *statetable = fData->fForwardTable; @@ -912,7 +902,8 @@ int32_t RuleBasedBreakIterator::handleNext() { fRuleStatusIndex = row->fTagsIdx; // Remember the break status (tag) values. } else if (accepting > ACCEPTING_UNCONDITIONAL) { // Lookahead match is completed. - int32_t lookaheadResult = lookAheadMatches.getPosition(accepting); + U_ASSERT(accepting < fData->fForwardTable->fLookAheadResultsSize); + int32_t lookaheadResult = fLookAheadMatches[accepting]; if (lookaheadResult >= 0) { fRuleStatusIndex = row->fTagsIdx; fPosition = lookaheadResult; @@ -927,9 +918,11 @@ int32_t RuleBasedBreakIterator::handleNext() { // But there are line break test failures when trying this. Investigate. // Issue ICU-20837 uint16_t rule = row->fLookAhead; - if (rule != 0) { + U_ASSERT(rule == 0 || rule > ACCEPTING_UNCONDITIONAL); + U_ASSERT(rule == 0 || rule < fData->fForwardTable->fLookAheadResultsSize); + if (rule > ACCEPTING_UNCONDITIONAL) { int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText); - lookAheadMatches.setPosition(rule, pos); + fLookAheadMatches[rule] = pos; } if (state == STOP_STATE) { diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index 970fa8197bd..193acafc442 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -230,14 +230,16 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab uint32_t c; uint32_t s; - RBBIDebugPrintf(" %s\n", heading); + RBBIDebugPrintf("%s\n", heading); - RBBIDebugPrintf("Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n", + RBBIDebugPrintf(" fDictCategoriesStart: %d\n", table->fDictCategoriesStart); + RBBIDebugPrintf(" fLookAheadResultsSize: %d\n", table->fLookAheadResultsSize); + RBBIDebugPrintf(" Flags: %4x RBBI_LOOKAHEAD_HARD_BREAK=%s RBBI_BOF_REQUIRED=%s RBBI_8BITS_ROWS=%s\n", table->fFlags, table->fFlags & RBBI_LOOKAHEAD_HARD_BREAK ? "T" : "F", table->fFlags & RBBI_BOF_REQUIRED ? "T" : "F", table->fFlags & RBBI_8BITS_ROWS ? "T" : "F"); - RBBIDebugPrintf("State | Acc LA TagIx"); + RBBIDebugPrintf("\nState | Acc LA TagIx"); for (c=0; cfCatCount; c++) {RBBIDebugPrintf("%3d ", c);} RBBIDebugPrintf("\n------|---------------"); for (c=0;cfCatCount; c++) { RBBIDebugPrintf("----"); diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index efbd4bea112..0c078a82f86 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -137,6 +137,8 @@ struct RBBIStateTable { uint32_t fDictCategoriesStart; // Char category number of the first dictionary // char class, or the the largest category number + 1 // if there are no dictionary categories. + uint32_t fLookAheadResultsSize; // Size of run-time array required for holding + // look-ahead results. Indexed by row.fLookAhead. uint32_t fFlags; // Option Flags for this state table. char fTableData[1]; // First RBBIStateTableRow begins here. // Variable-length array declared with length 1 diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index 09a6aaa0189..2f12d60d608 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -715,12 +715,6 @@ void RBBITableBuilder::mapLookAheadRules() { } fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1); - // Counter used when assigning lookahead rule numbers. - // Contains the last look-ahead number already in use. - // The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved - // for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. - int32_t laSlotsInUse = ACCEPTING_UNCONDITIONAL; - for (int32_t n=0; nsize(); n++) { RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n); int32_t laSlotForState = 0; @@ -756,7 +750,7 @@ void RBBITableBuilder::mapLookAheadRules() { } if (laSlotForState == 0) { - laSlotForState = ++laSlotsInUse; + laSlotForState = ++fLASlotsInUse; } // For each look ahead node covered by this state, @@ -1386,6 +1380,7 @@ void RBBITableBuilder::exportTable(void *where) { table->fNumStates = fDStates->size(); table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart(); + table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1; table->fFlags = 0; if (use8BitsForTable()) { table->fRowLen = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount; diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index bab4ff595bf..fe3db8d7bf1 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -20,6 +20,7 @@ #include "unicode/uobject.h" #include "unicode/rbbi.h" +#include "rbbidata.h" #include "rbbirb.h" #include "rbbinode.h" @@ -184,9 +185,15 @@ private: /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ UVector32 *fLookAheadRuleMap = nullptr; + /* Counter used when assigning lookahead rule numbers. + * Contains the last look-ahead number already in use. + * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved + * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */ + int32_t fLASlotsInUse = ACCEPTING_UNCONDITIONAL; - RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class - RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class + + RBBITableBuilder(const RBBITableBuilder &other) = delete; // forbid copying of this class + RBBITableBuilder &operator=(const RBBITableBuilder &other) = delete; // forbid copying of this class }; // diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h index 2ebe931b1c0..cc8b362c9cc 100644 --- a/icu4c/source/common/unicode/rbbi.h +++ b/icu4c/source/common/unicode/rbbi.h @@ -142,6 +142,11 @@ private: */ UBool fDone; + /** + * Array of look-ahead tentative results. + */ + int32_t *fLookAheadMatches; + //======================================================================= // constructors //======================================================================= diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 84c6cd782b3..c3d12881d42 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -133,6 +133,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha TESTCASE_AUTO(Test16BitsTrieWith8BitStateTable); TESTCASE_AUTO(Test16BitsTrieWith16BitStateTable); TESTCASE_AUTO(TestTable_8_16_Bits); + TESTCASE_AUTO(TestBug13590); #if U_ENABLE_TRACING TESTCASE_AUTO(TestTraceCreateCharacter); @@ -5078,6 +5079,63 @@ void RBBITest::TestTable_8_16_Bits() { } } +/* Test handling of a large number of look-ahead rules. + * The number of rules in the test exceeds the implementation limits prior to the + * improvements introduced with #13590. + * + * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ... + * The text being matched is sequential, "ABCDEFGHI..." + * + * The upshot is that the look-ahead rules all match on their preceding context, + * and consequently must save a potential result, but then fail to match on their + * trailing context, so that they don't actually cause a boundary. + * + * Additionally, add a ".*" rule, so there are no boundaries unless a + * look-ahead hard-break rule forces one. + */ +void RBBITest::TestBug13590() { + UnicodeString rules {u"!!quoted_literals_only; !!chain; .*;\n"}; + + const int NUM_LOOKAHEAD_RULES = 50; + const char16_t STARTING_CHAR = u'\u5000'; + char16_t firstChar; + for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) { + firstChar = STARTING_CHAR + ruleNum*2; + rules.append(u'\'') .append(firstChar) .append(firstChar+1) .append(u'\'') + .append(u' ') .append(u'/') .append(u' ') + .append(u'\'') .append(firstChar+2) .append(firstChar+4) .append(u'\'') + .append(u';') .append(u'\n'); + } + + // Change the last rule added from the form "UV / WY" to "UV / WX". + // Changes the rule so that it will match - all 4 chars are in ascending sequence. + rules.findAndReplace(UnicodeString(firstChar+4), UnicodeString(firstChar+3)); + + UErrorCode status = U_ZERO_ERROR; + UParseError parseError; + RuleBasedBreakIterator bi(rules, parseError, status); + if (!assertSuccess(WHERE, status)) { + errln(rules); + return; + } + // bi.dumpTables(); + + UnicodeString testString; + for (char16_t c = STARTING_CHAR-200; c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) { + testString.append(c); + } + bi.setText(testString); + + int breaksFound = 0; + while (bi.next() != UBRK_DONE) { + ++breaksFound; + } + + // Two matches are expected, one from the last rule that was explicitly modified, + // and one at the end of the text. + assertEquals(WHERE, 2, breaksFound); +} + #if U_ENABLE_TRACING static std::vector gData; diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 59adf767723..da144114af8 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -91,6 +91,7 @@ public: void Test16BitsTrieWith8BitStateTable(); void Test16BitsTrieWith16BitStateTable(); void TestTable_8_16_Bits(); + void TestBug13590(); #if U_ENABLE_TRACING void TestTraceCreateCharacter(); diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java index fd81e6ecff5..0db534dc658 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/RBBIDataWrapper.java @@ -46,6 +46,11 @@ public final class RBBIDataWrapper { * or the the largest category number + 1 if there are no dictionary categories. */ public int fDictCategoriesStart; + /** + * Size of run-time array required for holding + * look-ahead results. Indexed by row.fLookAhead. + */ + public int fLookAheadResultsSize; /** * Option Flags for this state table. */ @@ -54,7 +59,7 @@ public final class RBBIDataWrapper { * Length in bytes of the state table header, of all the int32 fields * preceding fTable in the serialized form. */ - public static int fHeaderSize = 16; + public static int fHeaderSize = 20; /** * Linear array of next state values, accessed as short[state, char_class] */ @@ -74,6 +79,7 @@ public final class RBBIDataWrapper { This.fNumStates = bytes.getInt(); This.fRowLen = bytes.getInt(); This.fDictCategoriesStart = bytes.getInt(); + This.fLookAheadResultsSize = bytes.getInt(); This.fFlags = bytes.getInt(); int lengthOfTable = length - fHeaderSize; // length in bytes. boolean use8Bits = (This.fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS; @@ -94,6 +100,7 @@ public final class RBBIDataWrapper { bytes.writeInt(fNumStates); bytes.writeInt(fRowLen); bytes.writeInt(fDictCategoriesStart); + bytes.writeInt(fLookAheadResultsSize); bytes.writeInt(fFlags); if ((fFlags & RBBIDataWrapper.RBBI_8BITS_ROWS) == RBBIDataWrapper.RBBI_8BITS_ROWS) { int tableLen = fRowLen * fNumStates; // fRowLen is bytes. @@ -131,6 +138,7 @@ public final class RBBIDataWrapper { if (fNumStates != otherST.fNumStates) return false; if (fRowLen != otherST.fRowLen) return false; if (fDictCategoriesStart != otherST.fDictCategoriesStart) return false; + if (fLookAheadResultsSize != otherST.fLookAheadResultsSize) return false; if (fFlags != otherST.fFlags) return false; return Arrays.equals(fTable, otherST.fTable); } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java index f9811155325..39a815470b0 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RBBITableBuilder.java @@ -79,6 +79,12 @@ class RBBITableBuilder { /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */ int[] fLookAheadRuleMap; + /** Counter used when assigning lookahead rule numbers. + * Contains the last look-ahead number already in use. + * The first look-ahead number is 2; Number 1 (ACCEPTING_UNCONDITIONAL) is reserved + * for non-lookahead accepting states. See the declarations of RBBIStateTableRowT. */ + int fLASlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL; + //----------------------------------------------------------------------------- // // Constructor for RBBITableBuilder. @@ -617,12 +623,6 @@ class RBBITableBuilder { void mapLookAheadRules() { fLookAheadRuleMap = new int[fRB.fScanner.numRules() + 1]; - // Counter used when assigning lookahead rule numbers. - // Contains the last look-ahead number already in use. - // The first look-ahead number is 2; - // Number 1 is reserved for non-lookahead accepting states. - int laSlotsInUse = RBBIDataWrapper.ACCEPTING_UNCONDITIONAL; - for (RBBIStateDescriptor sd: fDStates) { int laSlotForState = 0; @@ -656,7 +656,7 @@ class RBBITableBuilder { } if (laSlotForState == 0) { - laSlotForState = ++laSlotsInUse; + laSlotForState = ++fLASlotsInUse; } // For each look ahead node covered by this state, @@ -1139,6 +1139,8 @@ class RBBITableBuilder { fDStates.size() < 0x7fff); table.fNumStates = fDStates.size(); table.fDictCategoriesStart = fRB.fSetBuilder.getDictCategoriesStart(); + table.fLookAheadResultsSize = + fLASlotsInUse == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1; boolean use8Bits = table.fNumStates <= MAX_STATE_FOR_8BITS_TABLE; // Size of table size in shorts. diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 8206eadd314..7ddea90048c 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -70,6 +70,7 @@ public class RuleBasedBreakIterator extends BreakIterator { public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException { RuleBasedBreakIterator This = new RuleBasedBreakIterator(); This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is)); + This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; return This; } @@ -93,6 +94,7 @@ public class RuleBasedBreakIterator extends BreakIterator { public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes) throws IOException { RuleBasedBreakIterator This = new RuleBasedBreakIterator(); This.fRData = RBBIDataWrapper.get(bytes); + This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize]; return This; } @@ -107,6 +109,7 @@ public class RuleBasedBreakIterator extends BreakIterator { ByteArrayOutputStream ruleOS = new ByteArrayOutputStream(); compileRules(rules, ruleOS); fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray())); + fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; } catch (IOException e) { ///CLOVER:OFF // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler, @@ -138,7 +141,7 @@ public class RuleBasedBreakIterator extends BreakIterator { synchronized (gAllBreakEngines) { result.fBreakEngines = new ArrayList<>(gAllBreakEngines); } - result.fLookAheadMatches = new LookAheadResults(); + result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize]; result.fBreakCache = result.new BreakCache(fBreakCache); result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache); return result; @@ -251,6 +254,11 @@ public class RuleBasedBreakIterator extends BreakIterator { */ private boolean fDone; + /** + * Array of look-ahead tentative results. + */ + private int[] fLookAheadMatches; + /** * Cache of previously determined boundary positions. */ @@ -740,53 +748,6 @@ public class RuleBasedBreakIterator extends BreakIterator { } // end synchronized(gAllBreakEngines) } - private static final int kMaxLookaheads = 8; - private static class LookAheadResults { - int fUsedSlotLimit; - int[] fPositions; - int[] fKeys; - - LookAheadResults() { - fUsedSlotLimit= 0; - fPositions = new int[kMaxLookaheads]; - fKeys = new int[kMaxLookaheads]; - } - - int getPosition(int key) { - for (int i=0; i= kMaxLookaheads) { - assert(false); - i = kMaxLookaheads - 1; - } - fKeys[i] = key; - fPositions[i] = position; - assert(fUsedSlotLimit == i); - fUsedSlotLimit = i + 1; - } - - void reset() { - fUsedSlotLimit = 0; - } - }; - private LookAheadResults fLookAheadMatches = new LookAheadResults(); - - /** * The State Machine Engine for moving forward is here. * This function is the heart of the RBBI run time engine. @@ -854,7 +815,6 @@ public class RuleBasedBreakIterator extends BreakIterator { System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6)); } } - fLookAheadMatches.reset(); // loop until we reach the end of the text or transition to state 0 while (state != STOP_STATE) { @@ -921,7 +881,7 @@ public class RuleBasedBreakIterator extends BreakIterator { fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) { // Lookahead match is completed - int lookaheadResult = fLookAheadMatches.getPosition(accepting); + int lookaheadResult = fLookAheadMatches[accepting]; if (lookaheadResult >= 0) { fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX]; fPosition = lookaheadResult; @@ -944,7 +904,7 @@ public class RuleBasedBreakIterator extends BreakIterator { // We want the beginning of it. pos--; } - fLookAheadMatches.setPosition(rule, pos); + fLookAheadMatches[rule] = pos; } diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar index 158ef449747..30d733b303e 100644 --- a/icu4j/main/shared/data/icudata.jar +++ b/icu4j/main/shared/data/icudata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53e4c3251f31233ffcfe3ff4229ea43d81422a3fa071ee774ed835e5e969d22c -size 13142859 +oid sha256:73ba0e9a3520637ead2b6a0579902b3622e6611c3b9a6b22b835576a990982bf +size 13147665 diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar index f80547f225e..059da08944f 100644 --- a/icu4j/main/shared/data/icutzdata.jar +++ b/icu4j/main/shared/data/icutzdata.jar @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72b712d8d19a5aa8d1cb36f070337010c29595c63d917cf81e3213a5ea5be2e7 +oid sha256:2f0c964909d165506be2914219f625245cf9e9e26ebfad0c78843f23b9740375 size 94529 diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java index 504236095a5..8edaf497860 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java @@ -852,4 +852,57 @@ public class RBBITest extends TestFmwk { } } } + + /* Test handling of a large number of look-ahead rules. + * The number of rules in the test exceeds the implementation limits prior to the + * improvements introduced with #13590. + * + * The test look-ahead rules have the form "AB / CE"; "CD / EG"; ... + * The text being matched is sequential, "ABCDEFGHI..." + * + * The upshot is that the look-ahead rules all match on their preceding context, + * and consequently must save a potential result, but then fail to match on their + * trailing context, so that they don't actually cause a boundary. + * + * Additionally, add a ".*" rule, so there are no boundaries unless a + * look-ahead hard-break rule forces one. + */ + @Test + public void TestBug13590() { + StringBuilder rules = new StringBuilder("!!quoted_literals_only; !!chain; .*;\n"); + + int NUM_LOOKAHEAD_RULES = 50; + char STARTING_CHAR = '\u5000'; + char firstChar = 0; + for (int ruleNum = 0; ruleNum < NUM_LOOKAHEAD_RULES; ++ruleNum) { + firstChar = (char) (STARTING_CHAR + ruleNum*2); + rules.append('\'') .append(firstChar) .append((char)(firstChar+1)) .append('\'') + .append(' ') .append('/') .append(' ') + .append('\'') .append((char)(firstChar+2)) .append((char)(firstChar+4)) .append('\'') + .append(';') .append('\n'); + } + + // Change the last rule added from the form "UV / WY" to "UV / WX". + // Changes the rule so that it will match - all 4 chars are in ascending sequence. + String rulesStr = rules.toString().replace((char)(firstChar+4), (char)(firstChar+3)); + + RuleBasedBreakIterator bi = new RuleBasedBreakIterator(rulesStr); + // bi.dump(System.out); + + StringBuilder testString = new StringBuilder(); + for (char c = (char) (STARTING_CHAR-200); c < STARTING_CHAR + NUM_LOOKAHEAD_RULES*4; ++c) { + testString.append(c); + } + bi.setText(testString); + + int breaksFound = 0; + while (bi.next() != BreakIterator.DONE) { + ++breaksFound; + } + + // Two matches are expected, one from the last rule that was explicitly modified, + // and one at the end of the text. + assertEquals("Wrong number of breaks found", 2, breaksFound); + } + }