diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index f9c0d03473d..db44897d64f 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -413,6 +413,7 @@ int32_t RuleBasedBreakIterator::previous(void) { // break position before the current position (we back our internal // iterator up one step to prevent handlePrevious() from returning // the current position), but not necessarily the last one before + // where we started int32_t start = current(); @@ -653,7 +654,10 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { // otherwise, we can use following() on the position before the specified // one and return true if the position we get back is the one the user // specified - return following(offset - 1) == offset; + fText->setIndex(offset); + int32_t backOne = fText->move32(-1, CharacterIterator::kCurrent); + UBool result = following(backOne) == offset; + return result; } /** @@ -684,84 +688,97 @@ int32_t RuleBasedBreakIterator::handleNext() { } int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { + int32_t state; + int16_t category = 0; + RBBIStateTableRow *row; + UChar32 c; + int32_t lookaheadStatus = 0; + int32_t lookaheadTagIdx = 0; + int32_t result = 0; + int32_t initialPosition = 0; + int32_t lookaheadResult = 0; + int32_t endCount = 0; + UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; + if (fTrace) { RBBIDebugPuts("Handle Next pos char state category"); } // No matter what, handleNext alway correctly sets the break tag value. fLastStatusIndexValid = TRUE; + fLastRuleStatusIndex = 0; // if we're already at the end of the text, return DONE. if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) { - fLastRuleStatusIndex = 0; return BreakIterator::DONE; } - int32_t initialPosition = fText->getIndex(); - int32_t result = initialPosition; - int32_t lookaheadResult = 0; + // Set up the starting char. 
+ initialPosition = fText->getIndex(); + result = initialPosition; + c = fText->current32(); - // Initialize the state machine. Begin in state 1 - int32_t state = START_STATE; - int16_t category; - UChar32 c = fText->current32(); - RBBIStateTableRow *row; - int32_t lookaheadStatus = 0; - int32_t lookaheadTagIdx = 0; + // Set the initial state for the state machine + state = START_STATE; + row = (RBBIStateTableRow *) + (statetable->fTableData + (statetable->fRowLen * state)); + category = (statetable->fFlags & RBBI_BOF_REQUIRED)? 2 : 3; - fLastRuleStatusIndex = 0; - - row = (RBBIStateTableRow *) // Point to starting row of state table. - (statetable->fTableData + (statetable->fRowLen * state)); - - // Character Category fetch for starting character. - // See comments on character category code within loop, below. - UTRIE_GET16(&fData->fTrie, c, category); - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - category &= ~0x4000; - } // loop until we reach the end of the text or transition to state 0 + // for (;;) { if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) { // Reached end of input string. // Note: CharacterIterator::DONE is 0xffff, which is also a legal // character value. Check for DONE first, because it's quicker, // but also need to check fText->hasNext() to be certain. - - if (lookaheadResult > result) { - // We ran off the end of the string with a pending look-ahead match. - // Treat this as if the look-ahead condition had been met, and return - // the match at the / position from the look-ahead rule. - result = lookaheadResult; - fLastRuleStatusIndex = lookaheadTagIdx; - lookaheadStatus = 0; - } else if (result == initialPosition) { - // Ran off end, no match found. - // move forward one - fText->setIndex(initialPosition); - fText->next32(); - fText->getIndex(); + if (endCount++ >= 1) { + // We have already run the loop one last time with the + // character set to the psueudo {eof} value. 
Now it is time + // to unconditionally bail out. + if (lookaheadResult > result) { + // We ran off the end of the string with a pending look-ahead match. + // Treat this as if the look-ahead condition had been met, and return + // the match at the / position from the look-ahead rule. + result = lookaheadResult; + fLastRuleStatusIndex = lookaheadTagIdx; + lookaheadStatus = 0; + } else if (result == initialPosition) { + // Ran off end, no match found. + // move forward one + fText->setIndex(initialPosition); + fText->next32(); + } + break; } - break; + // Run the loop one last time with the fake end-of-input character category. + category = 1; } - // look up the current character's character category, which tells us - // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. - // - UTRIE_GET16(&fData->fTrie, c, category); - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators (subclasses). - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~0x4000; + // Get the char category. An incoming category of 1 or 2 means that + // we are preset for doing the beginning or end of input, and + // that we shouldn't get a category from an actual text input character. + // + if (category >= 3) { + // look up the current character's character category, which tells us + // which column in the state table to look at. + // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, + // not the size of the character going in, which is a UChar32. + // + UTRIE_GET16(&fData->fTrie, c, category); + + // Check the dictionary bit in the character's category. 
+ // Counter is only used by dictionary based iterators (subclasses). + // Chars that need to be handled by a dictionary have a flag bit set + // in their category values. + // + if ((category & 0x4000) != 0) { + fDictionaryCharCount++; + // And off the dictionary flag bit. + category &= ~0x4000; + } } #ifdef RBBI_DEBUG @@ -776,38 +793,43 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { } #endif - // look up a state transition in the state table + // State Transition - move machine to its next state + // state = row->fNextState[category]; row = (RBBIStateTableRow *) (statetable->fTableData + (statetable->fRowLen * state)); - // Get the next character. Doing it here positions the iterator - // to the correct position for recording matches in the code that - // follows. - c = fText->next32(); + // Advance to the next character. + // If this is a beginning-of-input loop iteration, don't advance + // the input position. The next iteration will be processing the + // first real input character. + if (category != 2) { + c = fText->next32(); + } + category = 3; // Flag that we aren't at the start of input. + // Exact category doesn't matter, so long as it's >=3. + if (row->fAccepting == -1) { - // Match found, common case, could have lookahead so we move on to check it + // Match found, common case. result = fText->getIndex(); - /// added fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. } if (row->fLookAhead != 0) { if (lookaheadStatus != 0 && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. Set the result accordingly, but only - // if no other rule has matched further in the mean time. + // Lookahead match is completed. 
result = lookaheadResult; fLastRuleStatusIndex = lookaheadTagIdx; lookaheadStatus = 0; - /// i think we have to back up to read the lookahead character again - /// fText->setIndex(lookaheadResult); - /// TODO: this is a simple hack since reverse rules only have simple - /// lookahead rules that we can definitely break out from. - /// we need to make the lookahead rules not chain eventually. - /// return result; - /// this is going to be the longest match again + // TODO: make a standalone hard break in a rule work. + if (lookAheadHardBreak) { + fText->setIndex(result); + return result; + } + // Look-ahead completed, but other rules may match further. Continue on + // TODO: junk this feature? I don't think it's used anywhwere. goto continueOn; } @@ -819,13 +841,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) { } - if (row->fAccepting == 0) { - // No match, nothing of interest happening, common case. - goto continueOn; + if (row->fAccepting != 0) { + // Because this is an accepting state, any in-progress look-ahead match + // is no longer relavant. Clear out the pending lookahead status. + lookaheadStatus = 0; // clear out any pending look-ahead match. } - lookaheadStatus = 0; // clear out any pending look-ahead matches. - continueOn: if (state == STOP_STATE) { // This is the normal exit from the lookup state machine. @@ -833,6 +854,7 @@ continueOn: // longer match is possible, no matter what characters follow. break; } + } // The state machine is done. Check whether it found a match... 
@@ -878,14 +900,33 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) { int32_t lookaheadResult = 0; int32_t lookaheadTagIdx = 0; UChar32 c = fText->current32(); + UBool doingBOF = (fData->fReverseTable->fFlags & RBBI_BOF_REQUIRED) != 0; RBBIStateTableRow *row; + + // + // Initial (startup) state table row + // row = (RBBIStateTableRow *) (this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen)); - UTRIE_GET16(&fData->fTrie, c, category); - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - category &= ~0x4000; + + // + // Initial char category (state table column). + // If this table requires a beginning-of-input test, + // hardwire to column 2 + // otherwise do the normal char category thing. + // + if (doingBOF) { + // The rules included a test for being at the start {bof}, which + // requires that we start the run with an extra iteration + // of the state machine with the reserved character category of 2. + category = 2; + } else { + UTRIE_GET16(&fData->fTrie, c, category); + if ((category & 0x4000) != 0) { + fDictionaryCharCount++; + category &= ~0x4000; + } } if (fTrace) { @@ -898,15 +939,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) { break; } - UTRIE_GET16(&fData->fTrie, c, category); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - category &= ~0x4000; - } #ifdef RBBI_DEBUG if (fTrace) { @@ -970,7 +1002,20 @@ continueOn: } // then advance one character backwards - c = fText->previous32(); + if (doingBOF == FALSE) { + c = fText->previous32(); + } + doingBOF = FALSE; + UTRIE_GET16(&fData->fTrie, c, category); + U_ASSERT(category>=2); + + // Check the dictionary bit in the character's category. + // Counter is only used by dictionary based iterators. 
+ // + if ((category & 0x4000) != 0) { + fDictionaryCharCount++; + category &= ~0x4000; + } } // Note: the result postion isn't what is returned to the user by previous(), @@ -996,67 +1041,100 @@ continueOn: // //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) { - if (fText == NULL || statetable == NULL) { - return 0; - } - // break tag is no longer valid after icu switched to exact backwards - // positioning. - fLastStatusIndexValid = FALSE; - if (statetable == NULL) { - return fText->setToStart(); - } - - int32_t state = START_STATE; - int32_t category; - UBool hasPassedStartText = !fText->hasPrevious(); - UChar32 c = fText->previous32(); - // previous character - int32_t result = fText->getIndex(); - int32_t lookaheadStatus = 0; - int32_t lookaheadResult = 0; - UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; - - RBBIStateTableRow *row; - - row = (RBBIStateTableRow *) - (statetable->fTableData + (state * statetable->fRowLen)); - UTRIE_GET16(&fData->fTrie, c, category); - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - category &= ~0x4000; - } + int32_t state; + int16_t category = 0; + RBBIStateTableRow *row; + UChar32 c; + int32_t lookaheadStatus = 0; + int32_t result = 0; + int32_t initialPosition = 0; + int32_t lookaheadResult = 0; + int32_t endCount = 0; + UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0; if (fTrace) { - RBBIDebugPuts("Handle Prev pos char state category"); + RBBIDebugPuts("Handle Previous pos char state category"); } - // loop until we reach the beginning of the text or transition to state 0 + // handlePrevious() never gets the rule status. + // Flag the status as invalid; if the user ever asks for status, we will need + // to back up, then re-find the break position using handleNext(), which does + // get the status value. 
+ fLastStatusIndexValid = FALSE; + fLastRuleStatusIndex = 0; + + // if we're already at the start of the text, return DONE. + if (fText == NULL || fData == NULL || fText->hasPrevious() == FALSE) { + return BreakIterator::DONE; + } + + // Set up the starting char. + initialPosition = fText->getIndex(); + result = initialPosition; + c = fText->previous32(); + + // Set the initial state for the state machine + state = START_STATE; + row = (RBBIStateTableRow *) + (statetable->fTableData + (statetable->fRowLen * state)); + category = (statetable->fFlags & RBBI_BOF_REQUIRED)? 2 : 3; + + + // loop until we reach the start of the text or transition to state 0 + // for (;;) { - if (hasPassedStartText) { - // Ran off the beginning of text. - if (*(int32_t *)fData->fHeader->fFormatVersion == 1) { - // This is the old (ICU 3.2 and earlier) format data. - // No explicit support for matching {eof}. Did have hack, though... - if (row->fLookAhead != 0 && lookaheadResult == 0) { - result = 0; + if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) { + // Reached end of input string. + // Note: CharacterIterator::DONE is 0xffff, which is also a legal + // character value. Check for DONE first, because it's quicker, + // but also need to check fText->hasNext() to be certain. + if (endCount++ >= 1 || + *(int32_t *)fData->fHeader->fFormatVersion == 1 ) { + // We have already run the loop one last time with the + // character set to the psueudo {eof} value. Now it is time + // to unconditionally bail out. + // (Or we have an old format binary rule file that does not support {eof}.) + if (lookaheadResult < result) { + // We ran off the end of the string with a pending look-ahead match. + // Treat this as if the look-ahead condition had been met, and return + // the match at the / position from the look-ahead rule. + result = lookaheadResult; + lookaheadStatus = 0; + } else if (result == initialPosition) { + // Ran off start, no match found. 
+ // move one index one (towards the start, since we are doing a previous()) + fText->setIndex(initialPosition); + fText->previous32(); // TODO: shouldn't be necessary. We're already at beginning. Check. } break; } - // Newer data format, with support for {eof}. - // end of input is hardwired by rule builder as category/column 1. + // Run the loop one last time with the fake end-of-input character category. category = 1; - } else { - // Not at {eof}. - // look up the current character's category (the table column) - UTRIE_GET16(&fData->fTrie, c, category); } - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators. // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - category &= ~0x4000; + // Get the char category. An incoming category of 1 or 2 means that + // we are preset for doing the beginning or end of input, and + // that we shouldn't get a category from an actual text input character. + // + if (category >= 3) { + // look up the current character's character category, which tells us + // which column in the state table to look at. + // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, + // not the size of the character going in, which is a UChar32. + // + UTRIE_GET16(&fData->fTrie, c, category); + + // Check the dictionary bit in the character's category. + // Counter is only used by dictionary based iterators (subclasses). + // Chars that need to be handled by a dictionary have a flag bit set + // in their category values. + // + if ((category & 0x4000) != 0) { + fDictionaryCharCount++; + // And off the dictionary flag bit. 
+ category &= ~0x4000; + } } #ifdef RBBI_DEBUG @@ -1071,84 +1149,85 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) } #endif - // look up a state transition in the backwards state table + // State Transition - move machine to its next state + // state = row->fNextState[category]; row = (RBBIStateTableRow *) - (statetable->fTableData + (state * statetable->fRowLen)); + (statetable->fTableData + (statetable->fRowLen * state)); if (row->fAccepting == -1) { - // Match found, common case, could have lookahead so we move on to check it + // Match found, common case. result = fText->getIndex(); } if (row->fLookAhead != 0) { if (lookaheadStatus != 0 && row->fAccepting == lookaheadStatus) { - // Lookahead match is completed. Set the result accordingly, but only - // if no other rule has matched further in the mean time. + // Lookahead match is completed. result = lookaheadResult; lookaheadStatus = 0; - /// i think we have to back up to read the lookahead character again - /// fText->setIndex(lookaheadResult); - /// TODO: this is a simple hack since reverse rules only have simple - /// lookahead rules that we can definitely break out from. - /// we need to make the lookahead rules not chain eventually. - /// return result; - /// this is going to be the longest match again - - /// syn wee todo hard coded for line breaks stuff - /// needs to provide a tag in rules to ensure a stop. - + // TODO: make a standalone hard break in a rule work. if (lookAheadHardBreak) { fText->setIndex(result); return result; } - fText->setIndex(result); - + // Look-ahead completed, but other rules may match further. Continue on + // TODO: junk this feature? I don't think it's used anywhwere. goto continueOn; } - int32_t r = fText->getIndex(); - lookaheadResult = r; - lookaheadStatus = row->fLookAhead; - goto continueOn; - } - - // not lookahead - if (row->fAccepting == 0) { - // No match, nothing of interest happening, common case. 
+ int32_t r = fText->getIndex(); + lookaheadResult = r; + lookaheadStatus = row->fLookAhead; goto continueOn; } - // This is a plain (non-look-ahead) accepting state - if (!lookAheadHardBreak) { - lookaheadStatus = 0; // clear out any pending look-ahead matches. - // But only if not doing the lookAheadHardBreak option, - // which needs to force a break no matter what is going - // on with the rest of the match, i.e. we can't abandon - // a partially completed look-ahead match because some - // other rule matched further than the '/' position - // in the look-ahead match. + if (row->fAccepting != 0) { + // Because this is an accepting state, any in-progress look-ahead match + // is no longer relavant. Clear out the pending lookahead status. + lookaheadStatus = 0; // clear out any pending look-ahead match. } continueOn: if (state == STOP_STATE) { + // This is the normal exit from the lookup state machine. + // We have advanced through the string until it is certain that no + // longer match is possible, no matter what characters follow. break; } - if (hasPassedStartText) { - break; + // Move (backwards) to the next character to process. + // If this is a beginning-of-input loop iteration, don't advance + // the input position. The next iteration will be processing the + // first real input character. + if (category != 2) { + c = fText->previous32(); } + category = 3; // Flag that this is no longer the first loop iteration. + // Exact category doesn't matter, so long as it's >=3. + - // Advance one character backwards - hasPassedStartText = !fText->hasPrevious(); - c = fText->previous32(); } + // The state machine is done. Check whether it found a match... + // If the iterator failed to advance in the match engine, force it ahead by one. + // (This really indicates a defect in the break rules. They should always match + // at least one character.) 
+ if (result == initialPosition) { + result = fText->setIndex(initialPosition); + fText ->previous32(); + result = fText->getIndex(); + } + + // Leave the iterator at our result position. fText->setIndex(result); - + #ifdef RBBI_DEBUG + if (fTrace) { + RBBIDebugPrintf("result = %d\n\n", result); + } + #endif return result; } diff --git a/icu4c/source/common/rbbidata.h b/icu4c/source/common/rbbidata.h index ceb7c1a8e6c..7ba1db05d65 100644 --- a/icu4c/source/common/rbbidata.h +++ b/icu4c/source/common/rbbidata.h @@ -131,7 +131,8 @@ struct RBBIStateTable { }; typedef enum { - RBBI_LOOKAHEAD_HARD_BREAK = 1 + RBBI_LOOKAHEAD_HARD_BREAK = 1, + RBBI_BOF_REQUIRED = 2 } RBBIStateTableFlags; diff --git a/icu4c/source/common/rbbiscan.cpp b/icu4c/source/common/rbbiscan.cpp index 321fd942d37..8b3b6f21e00 100644 --- a/icu4c/source/common/rbbiscan.cpp +++ b/icu4c/source/common/rbbiscan.cpp @@ -1148,7 +1148,7 @@ void RBBIRuleScanner::scanSet() { // Verify that the set contains at least one code point. // - if (uset->charAt(0) == -1) { + if (uset->isEmpty()) { // This set is empty. // Make it an error, because it almost certainly is not what the user wanted. // Also, avoids having to think about corner cases in the tree manipulation code diff --git a/icu4c/source/common/rbbisetb.cpp b/icu4c/source/common/rbbisetb.cpp index eceffb6bac3..0bbe8e920ee 100644 --- a/icu4c/source/common/rbbisetb.cpp +++ b/icu4c/source/common/rbbisetb.cpp @@ -94,6 +94,7 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb) fTrie = 0; fTrieSize = 0; fGroupCount = 0; + fSawBOF = FALSE; } @@ -224,7 +225,8 @@ void RBBISetBuilder::build() { // // Numbering: # 0 (state table column 0) is unused. // # 1 is reserved - table column 1 is for end-of-input - // # 2 is the first range list. + // # 2 is reserved - table column 2 is for beginning-in-input + // # 3 is the first range list. 
// RangeDescriptor *rlSearchRange; for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) { @@ -236,20 +238,26 @@ void RBBISetBuilder::build() { } if (rlRange->fNum == 0) { fGroupCount ++; - rlRange->fNum = fGroupCount+1; + rlRange->fNum = fGroupCount+2; rlRange->setDictionaryFlag(); - addValToSets(rlRange->fIncludesSets, fGroupCount+1); + addValToSets(rlRange->fIncludesSets, fGroupCount+2); } } // Handle input sets that contain the special string {eof}. // Column 1 of the state table is reserved for EOF on input. - // Add this column value (1) to the equivalent expression + // Column 2 is reserved for before-the-start-input. + // (This column can be optimized away later if there are no rule + // references to {bof}.) + // Add this column value (1 or 2) to the equivalent expression // subtree for each UnicodeSet that contains the string {eof} - // Because EOF is not a character in the normal sense, it doesn't - // affect the computation of ranges or TRIE. + // Because {bof} and {eof} are not characters in the normal sense, + // they don't affect the computation of ranges or TRIE. 
static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0}; + static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0}; + UnicodeString eofString(eofUString); + UnicodeString bofString(bofUString); for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni); if (usetNode==NULL) { @@ -259,6 +267,10 @@ void RBBISetBuilder::build() { if (inputSet->contains(eofString)) { addValToSet(usetNode, 1); } + if (inputSet->contains(bofString)) { + addValToSet(usetNode, 2); + fSawBOF = TRUE; + } } @@ -367,10 +379,19 @@ void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) { // //------------------------------------------------------------------------ int32_t RBBISetBuilder::getNumCharCategories() const { - return fGroupCount + 2; + return fGroupCount + 3; } +//------------------------------------------------------------------------ +// +// sawBOF +// +//------------------------------------------------------------------------ +UBool RBBISetBuilder::sawBOF() const { + return fSawBOF; +} + //------------------------------------------------------------------------ // diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h index ec0a0dc855c..c8bc1df7786 100644 --- a/icu4c/source/common/rbbisetb.h +++ b/icu4c/source/common/rbbisetb.h @@ -82,11 +82,13 @@ public: void addValToSets(UVector *sets, uint32_t val); void addValToSet (RBBINode *usetNode, uint32_t val); int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the - // runtime state machine, which are the same as - // columns in the DFA state table + // runtime state machine, which are the same as + // columns in the DFA state table int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. void serializeTrie(uint8_t *where); // write out the serialized Trie. 
UChar32 getFirstChar(int32_t val) const; + UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo + // character were encountered. #ifdef RBBI_DEBUG void printSets(); void printRanges(); @@ -116,6 +118,8 @@ private: // column 2 is for group 0. Funny counting. int32_t fGroupCount; + UBool fSawBOF; + RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class }; diff --git a/icu4c/source/common/rbbitblb.cpp b/icu4c/source/common/rbbitblb.cpp index e4c02f8d115..ba25a81903d 100644 --- a/icu4c/source/common/rbbitblb.cpp +++ b/icu4c/source/common/rbbitblb.cpp @@ -80,6 +80,22 @@ void RBBITableBuilder::build() { fTree->printTree(TRUE); } + // + // If the rules contained any references to {bof} + // add a {bof} to the + // tree. Means that all matches must start out with the + // {bof} fake character. + // + if (fRB->fSetBuilder->sawBOF()) { + RBBINode *bofTop = new RBBINode(RBBINode::opCat); + RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar); + bofTop->fLeftChild = bofLeaf; + bofTop->fRightChild = fTree; + bofLeaf->fParent = bofTop; + bofLeaf->fVal = 2; // Reserved value for {bof}. + fTree = bofTop; + } + // // Add a unique right-end marker to the expression. // Appears as a cat-node, left child being the original tree, @@ -126,6 +142,13 @@ void RBBITableBuilder::build() { calcChainedFollowPos(fTree); } + // + // BOF (start of input) test fixup. + // + if (fRB->fSetBuilder->sawBOF()) { + bofFixup(fTree); + } + // // Build the DFA state transition tables. // @@ -349,8 +372,15 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { return; } - // Get all nodes that can be the start a match, which is FirstPosition(root) - UVector *matchStartNodes = tree->fFirstPosSet; + // Get all nodes that can be the start a match, which is FirstPosition() + // of the portion of the tree corresponding to user-written rules. 
+ // See the tree description in bofFixup(). + RBBINode *userRuleRoot = tree; + if (fRB->fSetBuilder->sawBOF()) { + userRuleRoot = tree->fLeftChild->fRightChild; + } + U_ASSERT(userRuleRoot != NULL); + UVector *matchStartNodes = userRuleRoot->fFirstPosSet; // Iteratate over all leaf nodes, @@ -417,6 +447,62 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) { } +//----------------------------------------------------------------------------- +// +// bofFixup. Fixup for state tables that include {bof} beginning of input testing. +// Do a swizzle similar to chaining, modifying the followPos set of +// the bofNode to include the followPos nodes from other {bof} nodes +// scattered through the tree. +// +// This function has much in common with calcChainedFollowPos(). +// +//----------------------------------------------------------------------------- +void RBBITableBuilder::bofFixup(RBBINode *tree) { + + if (U_FAILURE(*fStatus)) { + return; + } + + // The parse tree looks like this ... + // fTree root ---> <cat> + // / \ + // <cat> <#end node> + // / \ + // <bofNode> rest + // of tree + // + // We will be adding things to the followPos set of the <bofNode> + // + RBBINode *bofNode = fTree->fLeftChild->fLeftChild; + U_ASSERT(bofNode->fType == RBBINode::leafChar); + U_ASSERT(bofNode->fVal == 2); + + // Get all nodes that can start a match of the user-written rules + // (excluding the fake bofNode) + // We want the nodes that can start a match in the + // part labeled "rest of tree" + // + UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet; + + RBBINode *startNode; + int startNodeIx; + for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) { + startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx); + if (startNode->fType != RBBINode::leafChar) { + continue; + } + + if (startNode->fVal == bofNode->fVal) { + // We found a leaf node corresponding to a {bof} that was + // explicitly written into a rule. 
+ // Add everything from the followPos set of this node to the + // followPos set of the fake bofNode at the start of the tree. + // + setAdd(bofNode->fFollowPos, startNode->fFollowPos); + } + } +} + //----------------------------------------------------------------------------- // // buildStateTable() Determine the set of runtime DFA states and the @@ -958,6 +1044,9 @@ void RBBITableBuilder::exportTable(void *where) { if (fRB->fLookAheadHardBreak) { table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK; } + if (fRB->fSetBuilder->sawBOF()) { + table->fFlags |= RBBI_BOF_REQUIRED; + } table->fReserved = 0; for (state=0; state<table->fNumStates; state++) { diff --git a/icu4c/source/common/rbbitblb.h b/icu4c/source/common/rbbitblb.h index e4a2b890bd1..b1db601e984 100644 --- a/icu4c/source/common/rbbitblb.h +++ b/icu4c/source/common/rbbitblb.h @@ -4,7 +4,7 @@ /* ********************************************************************** -* Copyright (c) 2002-2004, International Business Machines +* Copyright (c) 2002-2005, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -51,6 +51,7 @@ private: void calcLastPos(RBBINode *n); void calcFollowPos(RBBINode *n); void calcChainedFollowPos(RBBINode *n); + void bofFixup(RBBINode *n); void buildStateTable(); void flagAcceptingStates(); void flagLookAheadStates();