ICU-4773 add beginning of input test to rbbi rules

X-SVN-Rev: 18589
This commit is contained in:
Andy Heninger 2005-09-27 00:03:32 +00:00
parent 3b26cf3000
commit f327cc7af2
7 changed files with 388 additions and 193 deletions

View file

@ -413,6 +413,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
// break position before the current position (we back our internal
// iterator up one step to prevent handlePrevious() from returning
// the current position), but not necessarily the last one before
// where we started
int32_t start = current();
@ -653,7 +654,10 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
// otherwise, we can use following() on the position before the specified
// one and return true if the position we get back is the one the user
// specified
return following(offset - 1) == offset;
fText->setIndex(offset);
int32_t backOne = fText->move32(-1, CharacterIterator::kCurrent);
UBool result = following(backOne) == offset;
return result;
}
/**
@ -684,84 +688,97 @@ int32_t RuleBasedBreakIterator::handleNext() {
}
int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
int32_t state;
int16_t category = 0;
RBBIStateTableRow *row;
UChar32 c;
int32_t lookaheadStatus = 0;
int32_t lookaheadTagIdx = 0;
int32_t result = 0;
int32_t initialPosition = 0;
int32_t lookaheadResult = 0;
int32_t endCount = 0;
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
if (fTrace) {
RBBIDebugPuts("Handle Next pos char state category");
}
// No matter what, handleNext always correctly sets the break tag value.
fLastStatusIndexValid = TRUE;
fLastRuleStatusIndex = 0;
// if we're already at the end of the text, return DONE.
if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) {
fLastRuleStatusIndex = 0;
return BreakIterator::DONE;
}
int32_t initialPosition = fText->getIndex();
int32_t result = initialPosition;
int32_t lookaheadResult = 0;
// Set up the starting char.
initialPosition = fText->getIndex();
result = initialPosition;
c = fText->current32();
// Initialize the state machine. Begin in state 1
int32_t state = START_STATE;
int16_t category;
UChar32 c = fText->current32();
RBBIStateTableRow *row;
int32_t lookaheadStatus = 0;
int32_t lookaheadTagIdx = 0;
// Set the initial state for the state machine
state = START_STATE;
row = (RBBIStateTableRow *)
(statetable->fTableData + (statetable->fRowLen * state));
category = (statetable->fFlags & RBBI_BOF_REQUIRED)? 2 : 3;
fLastRuleStatusIndex = 0;
row = (RBBIStateTableRow *) // Point to starting row of state table.
(statetable->fTableData + (statetable->fRowLen * state));
// Character Category fetch for starting character.
// See comments on character category code within loop, below.
UTRIE_GET16(&fData->fTrie, c, category);
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
}
// loop until we reach the end of the text or transition to state 0
//
for (;;) {
if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
// Reached end of input string.
// Note: CharacterIterator::DONE is 0xffff, which is also a legal
// character value. Check for DONE first, because it's quicker,
// but also need to check fText->hasNext() to be certain.
if (lookaheadResult > result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
} else if (result == initialPosition) {
// Ran off end, no match found.
// move forward one
fText->setIndex(initialPosition);
fText->next32();
fText->getIndex();
if (endCount++ >= 1) {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
if (lookaheadResult > result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
} else if (result == initialPosition) {
// Ran off end, no match found.
// move forward one
fText->setIndex(initialPosition);
fText->next32();
}
break;
}
break;
// Run the loop one last time with the fake end-of-input character category.
category = 1;
}
// look up the current character's character category, which tells us
// which column in the state table to look at.
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
UTRIE_GET16(&fData->fTrie, c, category);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
// Get the char category. An incoming category of 1 or 2 means that
// we are preset for doing the beginning or end of input, and
// that we shouldn't get a category from an actual text input character.
//
if (category >= 3) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
UTRIE_GET16(&fData->fTrie, c, category);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
}
#ifdef RBBI_DEBUG
@ -776,38 +793,43 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
}
#endif
// look up a state transition in the state table
// State Transition - move machine to its next state
//
state = row->fNextState[category];
row = (RBBIStateTableRow *)
(statetable->fTableData + (statetable->fRowLen * state));
// Get the next character. Doing it here positions the iterator
// to the correct position for recording matches in the code that
// follows.
c = fText->next32();
// Advance to the next character.
// If this is a beginning-of-input loop iteration, don't advance
// the input position. The next iteration will be processing the
// first real input character.
if (category != 2) {
c = fText->next32();
}
category = 3; // Flag that we aren't at the start of input.
// Exact category doesn't matter, so long as it's >=3.
if (row->fAccepting == -1) {
// Match found, common case, could have lookahead so we move on to check it
// Match found, common case.
result = fText->getIndex();
/// added
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
}
if (row->fLookAhead != 0) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
// Lookahead match is completed.
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
/// i think we have to back up to read the lookahead character again
/// fText->setIndex(lookaheadResult);
/// TODO: this is a simple hack since reverse rules only have simple
/// lookahead rules that we can definitely break out from.
/// we need to make the lookahead rules not chain eventually.
/// return result;
/// this is going to be the longest match again
// TODO: make a standalone hard break in a rule work.
if (lookAheadHardBreak) {
fText->setIndex(result);
return result;
}
// Look-ahead completed, but other rules may match further. Continue on
// TODO: junk this feature? I don't think it's used anywhere.
goto continueOn;
}
@ -819,13 +841,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
}
if (row->fAccepting == 0) {
// No match, nothing of interest happening, common case.
goto continueOn;
if (row->fAccepting != 0) {
// Because this is an accepting state, any in-progress look-ahead match
// is no longer relevant. Clear out the pending lookahead status.
lookaheadStatus = 0; // clear out any pending look-ahead match.
}
lookaheadStatus = 0; // clear out any pending look-ahead matches.
continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
@ -833,6 +854,7 @@ continueOn:
// longer match is possible, no matter what characters follow.
break;
}
}
// The state machine is done. Check whether it found a match...
@ -878,14 +900,33 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
int32_t lookaheadResult = 0;
int32_t lookaheadTagIdx = 0;
UChar32 c = fText->current32();
UBool doingBOF = (fData->fReverseTable->fFlags & RBBI_BOF_REQUIRED) != 0;
RBBIStateTableRow *row;
//
// Initial (startup) state tble row
//
row = (RBBIStateTableRow *)
(this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
UTRIE_GET16(&fData->fTrie, c, category);
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
//
// Initial char category (state table column).
// If this table required a beginning-of-input test,
// hardwire to column 2
// otherwise do the normal char category thing.
//
if (doingBOF) {
// The rules included a test for being at the start {bof}, which
// requires that we start the run with an extra iteration
// of the state machine with the reserved character category of 2.
category = 2;
} else {
UTRIE_GET16(&fData->fTrie, c, category);
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
}
}
if (fTrace) {
@ -898,15 +939,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
break;
}
UTRIE_GET16(&fData->fTrie, c, category);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
}
#ifdef RBBI_DEBUG
if (fTrace) {
@ -970,7 +1002,20 @@ continueOn:
}
// then advance one character backwards
c = fText->previous32();
if (doingBOF == FALSE) {
c = fText->previous32();
}
doingBOF = FALSE;
UTRIE_GET16(&fData->fTrie, c, category);
U_ASSERT(category>=2);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
}
}
// Note: the result position isn't what is returned to the user by previous(),
@ -996,67 +1041,100 @@ continueOn:
//
//-----------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
if (fText == NULL || statetable == NULL) {
return 0;
}
// break tag is no longer valid after icu switched to exact backwards
// positioning.
fLastStatusIndexValid = FALSE;
if (statetable == NULL) {
return fText->setToStart();
}
int32_t state = START_STATE;
int32_t category;
UBool hasPassedStartText = !fText->hasPrevious();
UChar32 c = fText->previous32();
// previous character
int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0;
int32_t lookaheadResult = 0;
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
RBBIStateTableRow *row;
row = (RBBIStateTableRow *)
(statetable->fTableData + (state * statetable->fRowLen));
UTRIE_GET16(&fData->fTrie, c, category);
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
}
int32_t state;
int16_t category = 0;
RBBIStateTableRow *row;
UChar32 c;
int32_t lookaheadStatus = 0;
int32_t result = 0;
int32_t initialPosition = 0;
int32_t lookaheadResult = 0;
int32_t endCount = 0;
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
if (fTrace) {
RBBIDebugPuts("Handle Prev pos char state category");
RBBIDebugPuts("Handle Previous pos char state category");
}
// loop until we reach the beginning of the text or transition to state 0
// handlePrevious() never gets the rule status.
// Flag the status as invalid; if the user ever asks for status, we will need
// to back up, then re-find the break position using handleNext(), which does
// get the status value.
fLastStatusIndexValid = FALSE;
fLastRuleStatusIndex = 0;
// if we're already at the start of the text, return DONE.
if (fText == NULL || fData == NULL || fText->hasPrevious() == FALSE) {
return BreakIterator::DONE;
}
// Set up the starting char.
initialPosition = fText->getIndex();
result = initialPosition;
c = fText->previous32();
// Set the initial state for the state machine
state = START_STATE;
row = (RBBIStateTableRow *)
(statetable->fTableData + (statetable->fRowLen * state));
category = (statetable->fFlags & RBBI_BOF_REQUIRED)? 2 : 3;
// loop until we reach the start of the text or transition to state 0
//
for (;;) {
if (hasPassedStartText) {
// Ran off the beginning of text.
if (*(int32_t *)fData->fHeader->fFormatVersion == 1) {
// This is the old (ICU 3.2 and earlier) format data.
// No explicit support for matching {eof}. Did have hack, though...
if (row->fLookAhead != 0 && lookaheadResult == 0) {
result = 0;
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
// Reached the start of the input text (the end of input for a reverse scan).
// Note: CharacterIterator::DONE is 0xffff, which is also a legal
// character value. Check for DONE first, because it's quicker,
// but also need to check fText->hasPrevious() to be certain.
if (endCount++ >= 1 ||
*(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
// We have already run the loop one last time with the
// character set to the pseudo {eof} value. Now it is time
// to unconditionally bail out.
// (Or we have an old format binary rule file that does not support {eof}.)
if (lookaheadResult < result) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
lookaheadStatus = 0;
} else if (result == initialPosition) {
// Ran off start, no match found.
// move one index one (towards the start, since we are doing a previous())
fText->setIndex(initialPosition);
fText->previous32(); // TODO: shouldn't be necessary. We're already at beginning. Check.
}
break;
}
// Newer data format, with support for {eof}.
// end of input is hardwired by rule builder as category/column 1.
// Run the loop one last time with the fake end-of-input character category.
category = 1;
} else {
// Not at {eof}.
// look up the current character's category (the table column)
UTRIE_GET16(&fData->fTrie, c, category);
}
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
// Get the char category. An incoming category of 1 or 2 means that
// we are preset for doing the beginning or end of input, and
// that we shouldn't get a category from an actual text input character.
//
if (category >= 3) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
// not the size of the character going in, which is a UChar32.
//
UTRIE_GET16(&fData->fTrie, c, category);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
}
#ifdef RBBI_DEBUG
@ -1071,84 +1149,85 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
}
#endif
// look up a state transition in the backwards state table
// State Transition - move machine to its next state
//
state = row->fNextState[category];
row = (RBBIStateTableRow *)
(statetable->fTableData + (state * statetable->fRowLen));
(statetable->fTableData + (statetable->fRowLen * state));
if (row->fAccepting == -1) {
// Match found, common case, could have lookahead so we move on to check it
// Match found, common case.
result = fText->getIndex();
}
if (row->fLookAhead != 0) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
// Lookahead match is completed.
result = lookaheadResult;
lookaheadStatus = 0;
/// i think we have to back up to read the lookahead character again
/// fText->setIndex(lookaheadResult);
/// TODO: this is a simple hack since reverse rules only have simple
/// lookahead rules that we can definitely break out from.
/// we need to make the lookahead rules not chain eventually.
/// return result;
/// this is going to be the longest match again
/// syn wee todo hard coded for line breaks stuff
/// needs to provide a tag in rules to ensure a stop.
// TODO: make a standalone hard break in a rule work.
if (lookAheadHardBreak) {
fText->setIndex(result);
return result;
}
fText->setIndex(result);
// Look-ahead completed, but other rules may match further. Continue on
// TODO: junk this feature? I don't think it's used anywhere.
goto continueOn;
}
int32_t r = fText->getIndex();
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
goto continueOn;
}
// not lookahead
if (row->fAccepting == 0) {
// No match, nothing of interest happening, common case.
int32_t r = fText->getIndex();
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
goto continueOn;
}
// This is a plain (non-look-ahead) accepting state
if (!lookAheadHardBreak) {
lookaheadStatus = 0; // clear out any pending look-ahead matches.
// But only if not doing the lookAheadHardBreak option,
// which needs to force a break no matter what is going
// on with the rest of the match, i.e. we can't abandon
// a partially completed look-ahead match because some
// other rule matched further than the '/' position
// in the look-ahead match.
if (row->fAccepting != 0) {
// Because this is an accepting state, any in-progress look-ahead match
// is no longer relevant. Clear out the pending lookahead status.
lookaheadStatus = 0; // clear out any pending look-ahead match.
}
continueOn:
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
// We have advanced through the string until it is certain that no
// longer match is possible, no matter what characters follow.
break;
}
if (hasPassedStartText) {
break;
// Move (backwards) to the next character to process.
// If this is a beginning-of-input loop iteration, don't advance
// the input position. The next iteration will be processing the
// first real input character.
if (category != 2) {
c = fText->previous32();
}
category = 3; // Flag that this is no longer the first loop iteration.
// Exact category doesn't matter, so long as it's >=3.
// Advance one character backwards
hasPassedStartText = !fText->hasPrevious();
c = fText->previous32();
}
// The state machine is done. Check whether it found a match...
// If the iterator failed to advance in the match engine, force it ahead by one.
// (This really indicates a defect in the break rules. They should always match
// at least one character.)
if (result == initialPosition) {
result = fText->setIndex(initialPosition);
fText ->previous32();
result = fText->getIndex();
}
// Leave the iterator at our result position.
fText->setIndex(result);
#ifdef RBBI_DEBUG
if (fTrace) {
RBBIDebugPrintf("result = %d\n\n", result);
}
#endif
return result;
}

View file

@ -131,7 +131,8 @@ struct RBBIStateTable {
};
// Bit flags stored in RBBIStateTable::fFlags. They are tested with '&' at
// run time, so each enumerator must occupy its own bit.
typedef enum {
    // Set by the table builder when the rule builder's fLookAheadHardBreak
    // option is on; the match engine then returns as soon as a look-ahead
    // match completes instead of continuing to look for a longer match.
    RBBI_LOOKAHEAD_HARD_BREAK = 1,

    // Set by the table builder when the rules referenced {bof}; the match
    // engine then starts with one extra state-machine iteration using the
    // reserved character category 2.
    RBBI_BOF_REQUIRED = 2
} RBBIStateTableFlags;

View file

@ -1148,7 +1148,7 @@ void RBBIRuleScanner::scanSet() {
// Verify that the set contains at least one code point.
//
if (uset->charAt(0) == -1) {
if (uset->isEmpty()) {
// This set is empty.
// Make it an error, because it almost certainly is not what the user wanted.
// Also, avoids having to think about corner cases in the tree manipulation code

View file

@ -94,6 +94,7 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
fTrie = 0;
fTrieSize = 0;
fGroupCount = 0;
fSawBOF = FALSE;
}
@ -224,7 +225,8 @@ void RBBISetBuilder::build() {
//
// Numbering: # 0 (state table column 0) is unused.
// # 1 is reserved - table column 1 is for end-of-input
// # 2 is the first range list.
// # 2 is reserved - table column 2 is for beginning-in-input
// # 3 is the first range list.
//
RangeDescriptor *rlSearchRange;
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
@ -236,20 +238,26 @@ void RBBISetBuilder::build() {
}
if (rlRange->fNum == 0) {
fGroupCount ++;
rlRange->fNum = fGroupCount+1;
rlRange->fNum = fGroupCount+2;
rlRange->setDictionaryFlag();
addValToSets(rlRange->fIncludesSets, fGroupCount+1);
addValToSets(rlRange->fIncludesSets, fGroupCount+2);
}
}
// Handle input sets that contain the special string {eof}.
// Column 1 of the state table is reserved for EOF on input.
// Add this column value (1) to the equivalent expression
// Column 2 is reserved for before-the-start-input.
// (This column can be optimized away later if there are no rule
// references to {bof}.)
// Add this column value (1 or 2) to the equivalent expression
// subtree for each UnicodeSet that contains the string {eof}
// Because EOF is not a character in the normal sense, it doesn't
// affect the computation of ranges or TRIE.
// Because {bof} and {eof} are not a characters in the normal sense,
// they doesn't affect the computation of ranges or TRIE.
static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
UnicodeString eofString(eofUString);
UnicodeString bofString(bofUString);
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
if (usetNode==NULL) {
@ -259,6 +267,10 @@ void RBBISetBuilder::build() {
if (inputSet->contains(eofString)) {
addValToSet(usetNode, 1);
}
if (inputSet->contains(bofString)) {
addValToSet(usetNode, 2);
fSawBOF = TRUE;
}
}
@ -367,10 +379,19 @@ void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
//
//------------------------------------------------------------------------
int32_t RBBISetBuilder::getNumCharCategories() const {
    // Three category numbers are reserved ahead of the range groups:
    //   0 - unused, 1 - end of input {eof}, 2 - beginning of input {bof}.
    // build() numbers the range groups starting at 3, so the total number of
    // categories (state table columns) is the group count plus three.
    return fGroupCount + 3;
}
//------------------------------------------------------------------------
//
// sawBOF
//
//------------------------------------------------------------------------
UBool RBBISetBuilder::sawBOF() const {
    // fSawBOF starts out FALSE in the constructor and is switched to TRUE by
    // build() when one of the input sets contains the "bof" string.
    return fSawBOF;
}
//------------------------------------------------------------------------
//

View file

@ -82,11 +82,13 @@ public:
void addValToSets(UVector *sets, uint32_t val);
void addValToSet (RBBINode *usetNode, uint32_t val);
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
// runtime state machine, which are the same as
// columns in the DFA state table
// runtime state machine, which are the same as
// columns in the DFA state table
int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
void serializeTrie(uint8_t *where); // write out the serialized Trie.
UChar32 getFirstChar(int32_t val) const;
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
// character were encountered.
#ifdef RBBI_DEBUG
void printSets();
void printRanges();
@ -116,6 +118,8 @@ private:
// column 2 is for group 0. Funny counting.
int32_t fGroupCount;
UBool fSawBOF;
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
};

View file

@ -80,6 +80,22 @@ void RBBITableBuilder::build() {
fTree->printTree(TRUE);
}
//
// If the rules contained any references to {bof}
// add a {bof} <cat> <former root of tree> to the
// tree. Means that all matches must start out with the
// {bof} fake character.
//
if (fRB->fSetBuilder->sawBOF()) {
RBBINode *bofTop = new RBBINode(RBBINode::opCat);
RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
bofTop->fLeftChild = bofLeaf;
bofTop->fRightChild = fTree;
bofLeaf->fParent = bofTop;
bofLeaf->fVal = 2; // Reserved value for {bof}.
fTree = bofTop;
}
//
// Add a unique right-end marker to the expression.
// Appears as a cat-node, left child being the original tree,
@ -126,6 +142,13 @@ void RBBITableBuilder::build() {
calcChainedFollowPos(fTree);
}
//
// BOF (start of input) test fixup.
//
if (fRB->fSetBuilder->sawBOF()) {
bofFixup(fTree);
}
//
// Build the DFA state transition tables.
//
@ -349,8 +372,15 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
return;
}
// Get all nodes that can be the start a match, which is FirstPosition(root)
UVector *matchStartNodes = tree->fFirstPosSet;
// Get all nodes that can be the start a match, which is FirstPosition()
// of the portion of the tree corresponding to user-written rules.
// See the tree description in bofFixup().
RBBINode *userRuleRoot = tree;
if (fRB->fSetBuilder->sawBOF()) {
userRuleRoot = tree->fLeftChild->fRightChild;
}
U_ASSERT(userRuleRoot != NULL);
UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
// Iteratate over all leaf nodes,
@ -417,6 +447,62 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
}
//-----------------------------------------------------------------------------
//
// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
// Do a swizzle similar to chaining, modifying the followPos set of
// the bofNode to include the followPos nodes from other {bof} nodes
// scattered through the tree.
//
// This function has much in common with calcChainedFollowPos().
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::bofFixup(RBBINode *tree) {
    // Nothing to do if an earlier build step has already failed.
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // Shape of the parse tree at this point:
    //
    //   fTree (root)  : <cat>  with children [ inner <cat> , <#end node> ]
    //   inner <cat>   :        with children [ <bofNode>   , rest of tree ]
    //
    // Note: the tree is reached through the fTree member; the 'tree'
    // parameter is not consulted.
    RBBINode *innerCat = fTree->fLeftChild;

    // The fake {bof} leaf that build() put in front of the user rules.
    // Its followPos set is the one being extended below.
    RBBINode *fakeBof = innerCat->fLeftChild;
    U_ASSERT(fakeBof->fType == RBBINode::leafChar);
    U_ASSERT(fakeBof->fVal == 2);

    // Nodes that can start a match of the "rest of tree" part, i.e. the
    // user-written rules without the fake {bof} leaf.
    UVector *firstPositions = innerCat->fRightChild->fFirstPosSet;

    for (int ix = 0; ix < firstPositions->size(); ++ix) {
        RBBINode *candidate = static_cast<RBBINode *>(firstPositions->elementAt(ix));

        // Only a leaf carrying the same value as the fake node is a {bof}
        // that was written explicitly into a rule.
        if (candidate->fType == RBBINode::leafChar && candidate->fVal == fakeBof->fVal) {
            // Whatever may follow the explicit {bof} may also follow the
            // fake one at the start of the tree.
            setAdd(fakeBof->fFollowPos, candidate->fFollowPos);
        }
    }
}
//-----------------------------------------------------------------------------
//
// buildStateTable() Determine the set of runtime DFA states and the
@ -958,6 +1044,9 @@ void RBBITableBuilder::exportTable(void *where) {
if (fRB->fLookAheadHardBreak) {
table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
}
if (fRB->fSetBuilder->sawBOF()) {
table->fFlags |= RBBI_BOF_REQUIRED;
}
table->fReserved = 0;
for (state=0; state<table->fNumStates; state++) {

View file

@ -4,7 +4,7 @@
/*
**********************************************************************
* Copyright (c) 2002-2004, International Business Machines
* Copyright (c) 2002-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
@ -51,6 +51,7 @@ private:
void calcLastPos(RBBINode *n);
void calcFollowPos(RBBINode *n);
void calcChainedFollowPos(RBBINode *n);
void bofFixup(RBBINode *n);
void buildStateTable();
void flagAcceptingStates();
void flagLookAheadStates();