mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-4773 add beginning of input test to rbbi rules
X-SVN-Rev: 18589
This commit is contained in:
parent
3b26cf3000
commit
f327cc7af2
7 changed files with 388 additions and 193 deletions
|
@ -413,6 +413,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
|
|||
// break position before the current position (we back our internal
|
||||
// iterator up one step to prevent handlePrevious() from returning
|
||||
// the current position), but not necessarily the last one before
|
||||
|
||||
// where we started
|
||||
|
||||
int32_t start = current();
|
||||
|
@ -653,7 +654,10 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
|
|||
// otherwise, we can use following() on the position before the specified
|
||||
// one and return true if the position we get back is the one the user
|
||||
// specified
|
||||
return following(offset - 1) == offset;
|
||||
fText->setIndex(offset);
|
||||
int32_t backOne = fText->move32(-1, CharacterIterator::kCurrent);
|
||||
UBool result = following(backOne) == offset;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -684,84 +688,97 @@ int32_t RuleBasedBreakIterator::handleNext() {
|
|||
}
|
||||
|
||||
int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
||||
int32_t state;
|
||||
int16_t category = 0;
|
||||
RBBIStateTableRow *row;
|
||||
UChar32 c;
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadTagIdx = 0;
|
||||
int32_t result = 0;
|
||||
int32_t initialPosition = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
int32_t endCount = 0;
|
||||
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
|
||||
if (fTrace) {
|
||||
RBBIDebugPuts("Handle Next pos char state category");
|
||||
}
|
||||
|
||||
// No matter what, handleNext alway correctly sets the break tag value.
|
||||
fLastStatusIndexValid = TRUE;
|
||||
fLastRuleStatusIndex = 0;
|
||||
|
||||
// if we're already at the end of the text, return DONE.
|
||||
if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) {
|
||||
fLastRuleStatusIndex = 0;
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
int32_t initialPosition = fText->getIndex();
|
||||
int32_t result = initialPosition;
|
||||
int32_t lookaheadResult = 0;
|
||||
// Set up the starting char.
|
||||
initialPosition = fText->getIndex();
|
||||
result = initialPosition;
|
||||
c = fText->current32();
|
||||
|
||||
// Initialize the state machine. Begin in state 1
|
||||
int32_t state = START_STATE;
|
||||
int16_t category;
|
||||
UChar32 c = fText->current32();
|
||||
RBBIStateTableRow *row;
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadTagIdx = 0;
|
||||
// Set the initial state for the state machine
|
||||
state = START_STATE;
|
||||
row = (RBBIStateTableRow *)
|
||||
(statetable->fTableData + (statetable->fRowLen * state));
|
||||
category = (statetable->fFlags & RBBI_BOF_REQUIRED)? 2 : 3;
|
||||
|
||||
fLastRuleStatusIndex = 0;
|
||||
|
||||
row = (RBBIStateTableRow *) // Point to starting row of state table.
|
||||
(statetable->fTableData + (statetable->fRowLen * state));
|
||||
|
||||
// Character Category fetch for starting character.
|
||||
// See comments on character category code within loop, below.
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
// loop until we reach the end of the text or transition to state 0
|
||||
//
|
||||
for (;;) {
|
||||
if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
|
||||
// Reached end of input string.
|
||||
// Note: CharacterIterator::DONE is 0xffff, which is also a legal
|
||||
// character value. Check for DONE first, because it's quicker,
|
||||
// but also need to check fText->hasNext() to be certain.
|
||||
|
||||
if (lookaheadResult > result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
// Ran off end, no match found.
|
||||
// move forward one
|
||||
fText->setIndex(initialPosition);
|
||||
fText->next32();
|
||||
fText->getIndex();
|
||||
if (endCount++ >= 1) {
|
||||
// We have already run the loop one last time with the
|
||||
// character set to the psueudo {eof} value. Now it is time
|
||||
// to unconditionally bail out.
|
||||
if (lookaheadResult > result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
// Ran off end, no match found.
|
||||
// move forward one
|
||||
fText->setIndex(initialPosition);
|
||||
fText->next32();
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
// Run the loop one last time with the fake end-of-input character category.
|
||||
category = 1;
|
||||
}
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~0x4000;
|
||||
// Get the char category. An incoming category of 1 or 2 means that
|
||||
// we are preset for doing the beginning or end of input, and
|
||||
// that we shouldn't get a category from an actual text input character.
|
||||
//
|
||||
if (category >= 3) {
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~0x4000;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
|
@ -776,38 +793,43 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
|||
}
|
||||
#endif
|
||||
|
||||
// look up a state transition in the state table
|
||||
// State Transition - move machine to its next state
|
||||
//
|
||||
state = row->fNextState[category];
|
||||
row = (RBBIStateTableRow *)
|
||||
(statetable->fTableData + (statetable->fRowLen * state));
|
||||
|
||||
// Get the next character. Doing it here positions the iterator
|
||||
// to the correct position for recording matches in the code that
|
||||
// follows.
|
||||
c = fText->next32();
|
||||
// Advance to the next character.
|
||||
// If this is a beginning-of-input loop iteration, don't advance
|
||||
// the input position. The next iteration will be processing the
|
||||
// first real input character.
|
||||
if (category != 2) {
|
||||
c = fText->next32();
|
||||
}
|
||||
category = 3; // Flag that we aren't at the start of input.
|
||||
// Exact category doesn't matter, so long as it's >=3.
|
||||
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, could have lookahead so we move on to check it
|
||||
// Match found, common case.
|
||||
result = fText->getIndex();
|
||||
/// added
|
||||
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
|
||||
}
|
||||
|
||||
if (row->fLookAhead != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& row->fAccepting == lookaheadStatus) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
// Lookahead match is completed.
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
lookaheadStatus = 0;
|
||||
/// i think we have to back up to read the lookahead character again
|
||||
/// fText->setIndex(lookaheadResult);
|
||||
/// TODO: this is a simple hack since reverse rules only have simple
|
||||
/// lookahead rules that we can definitely break out from.
|
||||
/// we need to make the lookahead rules not chain eventually.
|
||||
/// return result;
|
||||
/// this is going to be the longest match again
|
||||
// TODO: make a standalone hard break in a rule work.
|
||||
if (lookAheadHardBreak) {
|
||||
fText->setIndex(result);
|
||||
return result;
|
||||
}
|
||||
// Look-ahead completed, but other rules may match further. Continue on
|
||||
// TODO: junk this feature? I don't think it's used anywhwere.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
@ -819,13 +841,12 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
|
|||
}
|
||||
|
||||
|
||||
if (row->fAccepting == 0) {
|
||||
// No match, nothing of interest happening, common case.
|
||||
goto continueOn;
|
||||
if (row->fAccepting != 0) {
|
||||
// Because this is an accepting state, any in-progress look-ahead match
|
||||
// is no longer relavant. Clear out the pending lookahead status.
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead match.
|
||||
}
|
||||
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
// This is the normal exit from the lookup state machine.
|
||||
|
@ -833,6 +854,7 @@ continueOn:
|
|||
// longer match is possible, no matter what characters follow.
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
|
@ -878,14 +900,33 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
|||
int32_t lookaheadResult = 0;
|
||||
int32_t lookaheadTagIdx = 0;
|
||||
UChar32 c = fText->current32();
|
||||
UBool doingBOF = (fData->fReverseTable->fFlags & RBBI_BOF_REQUIRED) != 0;
|
||||
RBBIStateTableRow *row;
|
||||
|
||||
|
||||
//
|
||||
// Initial (startup) state tble row
|
||||
//
|
||||
row = (RBBIStateTableRow *)
|
||||
(this->fData->fReverseTable->fTableData + (state * fData->fReverseTable->fRowLen));
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
|
||||
//
|
||||
// Initial char category (state table column).
|
||||
// If this table required a beginning-of-input test,
|
||||
// hardwire to column 2
|
||||
// otherwise do the normal char category thing.
|
||||
//
|
||||
if (doingBOF) {
|
||||
// The rules included a test for being at the start {bof}, which
|
||||
// requires that we start the run with an extra iteration
|
||||
// of the state machine with the reserved character category of 2.
|
||||
category = 2;
|
||||
} else {
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
}
|
||||
|
||||
if (fTrace) {
|
||||
|
@ -898,15 +939,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
|
|||
break;
|
||||
}
|
||||
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fTrace) {
|
||||
|
@ -970,7 +1002,20 @@ continueOn:
|
|||
}
|
||||
|
||||
// then advance one character backwards
|
||||
c = fText->previous32();
|
||||
if (doingBOF == FALSE) {
|
||||
c = fText->previous32();
|
||||
}
|
||||
doingBOF = FALSE;
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
U_ASSERT(category>=2);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
}
|
||||
|
||||
// Note: the result postion isn't what is returned to the user by previous(),
|
||||
|
@ -996,67 +1041,100 @@ continueOn:
|
|||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable) {
|
||||
if (fText == NULL || statetable == NULL) {
|
||||
return 0;
|
||||
}
|
||||
// break tag is no longer valid after icu switched to exact backwards
|
||||
// positioning.
|
||||
fLastStatusIndexValid = FALSE;
|
||||
if (statetable == NULL) {
|
||||
return fText->setToStart();
|
||||
}
|
||||
|
||||
int32_t state = START_STATE;
|
||||
int32_t category;
|
||||
UBool hasPassedStartText = !fText->hasPrevious();
|
||||
UChar32 c = fText->previous32();
|
||||
// previous character
|
||||
int32_t result = fText->getIndex();
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
|
||||
RBBIStateTableRow *row;
|
||||
|
||||
row = (RBBIStateTableRow *)
|
||||
(statetable->fTableData + (state * statetable->fRowLen));
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
}
|
||||
int32_t state;
|
||||
int16_t category = 0;
|
||||
RBBIStateTableRow *row;
|
||||
UChar32 c;
|
||||
int32_t lookaheadStatus = 0;
|
||||
int32_t result = 0;
|
||||
int32_t initialPosition = 0;
|
||||
int32_t lookaheadResult = 0;
|
||||
int32_t endCount = 0;
|
||||
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
|
||||
|
||||
if (fTrace) {
|
||||
RBBIDebugPuts("Handle Prev pos char state category");
|
||||
RBBIDebugPuts("Handle Previous pos char state category");
|
||||
}
|
||||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
// handlePrevious() never gets the rule status.
|
||||
// Flag the status as invalid; if the user ever asks for status, we will need
|
||||
// to back up, then re-find the break position using handleNext(), which does
|
||||
// get the status value.
|
||||
fLastStatusIndexValid = FALSE;
|
||||
fLastRuleStatusIndex = 0;
|
||||
|
||||
// if we're already at the start of the text, return DONE.
|
||||
if (fText == NULL || fData == NULL || fText->hasPrevious() == FALSE) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// Set up the starting char.
|
||||
initialPosition = fText->getIndex();
|
||||
result = initialPosition;
|
||||
c = fText->previous32();
|
||||
|
||||
// Set the initial state for the state machine
|
||||
state = START_STATE;
|
||||
row = (RBBIStateTableRow *)
|
||||
(statetable->fTableData + (statetable->fRowLen * state));
|
||||
category = (statetable->fFlags & RBBI_BOF_REQUIRED)? 2 : 3;
|
||||
|
||||
|
||||
// loop until we reach the start of the text or transition to state 0
|
||||
//
|
||||
for (;;) {
|
||||
if (hasPassedStartText) {
|
||||
// Ran off the beginning of text.
|
||||
if (*(int32_t *)fData->fHeader->fFormatVersion == 1) {
|
||||
// This is the old (ICU 3.2 and earlier) format data.
|
||||
// No explicit support for matching {eof}. Did have hack, though...
|
||||
if (row->fLookAhead != 0 && lookaheadResult == 0) {
|
||||
result = 0;
|
||||
if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
|
||||
// Reached end of input string.
|
||||
// Note: CharacterIterator::DONE is 0xffff, which is also a legal
|
||||
// character value. Check for DONE first, because it's quicker,
|
||||
// but also need to check fText->hasNext() to be certain.
|
||||
if (endCount++ >= 1 ||
|
||||
*(int32_t *)fData->fHeader->fFormatVersion == 1 ) {
|
||||
// We have already run the loop one last time with the
|
||||
// character set to the psueudo {eof} value. Now it is time
|
||||
// to unconditionally bail out.
|
||||
// (Or we have an old format binary rule file that does not support {eof}.)
|
||||
if (lookaheadResult < result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
// Ran off start, no match found.
|
||||
// move one index one (towards the start, since we are doing a previous())
|
||||
fText->setIndex(initialPosition);
|
||||
fText->previous32(); // TODO: shouldn't be necessary. We're already at beginning. Check.
|
||||
}
|
||||
break;
|
||||
}
|
||||
// Newer data format, with support for {eof}.
|
||||
// end of input is hardwired by rule builder as category/column 1.
|
||||
// Run the loop one last time with the fake end-of-input character category.
|
||||
category = 1;
|
||||
} else {
|
||||
// Not at {eof}.
|
||||
// look up the current character's category (the table column)
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
}
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
category &= ~0x4000;
|
||||
// Get the char category. An incoming category of 1 or 2 means that
|
||||
// we are preset for doing the beginning or end of input, and
|
||||
// that we shouldn't get a category from an actual text input character.
|
||||
//
|
||||
if (category >= 3) {
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
|
||||
// Check the dictionary bit in the character's category.
|
||||
// Counter is only used by dictionary based iterators (subclasses).
|
||||
// Chars that need to be handled by a dictionary have a flag bit set
|
||||
// in their category values.
|
||||
//
|
||||
if ((category & 0x4000) != 0) {
|
||||
fDictionaryCharCount++;
|
||||
// And off the dictionary flag bit.
|
||||
category &= ~0x4000;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
|
@ -1071,84 +1149,85 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
|
|||
}
|
||||
#endif
|
||||
|
||||
// look up a state transition in the backwards state table
|
||||
// State Transition - move machine to its next state
|
||||
//
|
||||
state = row->fNextState[category];
|
||||
row = (RBBIStateTableRow *)
|
||||
(statetable->fTableData + (state * statetable->fRowLen));
|
||||
(statetable->fTableData + (statetable->fRowLen * state));
|
||||
|
||||
if (row->fAccepting == -1) {
|
||||
// Match found, common case, could have lookahead so we move on to check it
|
||||
// Match found, common case.
|
||||
result = fText->getIndex();
|
||||
}
|
||||
|
||||
if (row->fLookAhead != 0) {
|
||||
if (lookaheadStatus != 0
|
||||
&& row->fAccepting == lookaheadStatus) {
|
||||
// Lookahead match is completed. Set the result accordingly, but only
|
||||
// if no other rule has matched further in the mean time.
|
||||
// Lookahead match is completed.
|
||||
result = lookaheadResult;
|
||||
lookaheadStatus = 0;
|
||||
/// i think we have to back up to read the lookahead character again
|
||||
/// fText->setIndex(lookaheadResult);
|
||||
/// TODO: this is a simple hack since reverse rules only have simple
|
||||
/// lookahead rules that we can definitely break out from.
|
||||
/// we need to make the lookahead rules not chain eventually.
|
||||
/// return result;
|
||||
/// this is going to be the longest match again
|
||||
|
||||
/// syn wee todo hard coded for line breaks stuff
|
||||
/// needs to provide a tag in rules to ensure a stop.
|
||||
|
||||
// TODO: make a standalone hard break in a rule work.
|
||||
if (lookAheadHardBreak) {
|
||||
fText->setIndex(result);
|
||||
return result;
|
||||
}
|
||||
fText->setIndex(result);
|
||||
|
||||
// Look-ahead completed, but other rules may match further. Continue on
|
||||
// TODO: junk this feature? I don't think it's used anywhwere.
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
int32_t r = fText->getIndex();
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
// not lookahead
|
||||
if (row->fAccepting == 0) {
|
||||
// No match, nothing of interest happening, common case.
|
||||
int32_t r = fText->getIndex();
|
||||
lookaheadResult = r;
|
||||
lookaheadStatus = row->fLookAhead;
|
||||
goto continueOn;
|
||||
}
|
||||
|
||||
|
||||
// This is a plain (non-look-ahead) accepting state
|
||||
if (!lookAheadHardBreak) {
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead matches.
|
||||
// But only if not doing the lookAheadHardBreak option,
|
||||
// which needs to force a break no matter what is going
|
||||
// on with the rest of the match, i.e. we can't abandon
|
||||
// a partially completed look-ahead match because some
|
||||
// other rule matched further than the '/' position
|
||||
// in the look-ahead match.
|
||||
if (row->fAccepting != 0) {
|
||||
// Because this is an accepting state, any in-progress look-ahead match
|
||||
// is no longer relavant. Clear out the pending lookahead status.
|
||||
lookaheadStatus = 0; // clear out any pending look-ahead match.
|
||||
}
|
||||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
// This is the normal exit from the lookup state machine.
|
||||
// We have advanced through the string until it is certain that no
|
||||
// longer match is possible, no matter what characters follow.
|
||||
break;
|
||||
}
|
||||
|
||||
if (hasPassedStartText) {
|
||||
break;
|
||||
// Move (backwards) to the next character to process.
|
||||
// If this is a beginning-of-input loop iteration, don't advance
|
||||
// the input position. The next iteration will be processing the
|
||||
// first real input character.
|
||||
if (category != 2) {
|
||||
c = fText->previous32();
|
||||
}
|
||||
category = 3; // Flag that this is no longer the first loop iteration.
|
||||
// Exact category doesn't matter, so long as it's >=3.
|
||||
|
||||
|
||||
// Advance one character backwards
|
||||
hasPassedStartText = !fText->hasPrevious();
|
||||
c = fText->previous32();
|
||||
}
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
|
||||
// If the iterator failed to advance in the match engine, force it ahead by one.
|
||||
// (This really indicates a defect in the break rules. They should always match
|
||||
// at least one character.)
|
||||
if (result == initialPosition) {
|
||||
result = fText->setIndex(initialPosition);
|
||||
fText ->previous32();
|
||||
result = fText->getIndex();
|
||||
}
|
||||
|
||||
// Leave the iterator at our result position.
|
||||
fText->setIndex(result);
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (fTrace) {
|
||||
RBBIDebugPrintf("result = %d\n\n", result);
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -131,7 +131,8 @@ struct RBBIStateTable {
|
|||
};
|
||||
|
||||
typedef enum {
|
||||
RBBI_LOOKAHEAD_HARD_BREAK = 1
|
||||
RBBI_LOOKAHEAD_HARD_BREAK = 1,
|
||||
RBBI_BOF_REQUIRED = 2
|
||||
} RBBIStateTableFlags;
|
||||
|
||||
|
||||
|
|
|
@ -1148,7 +1148,7 @@ void RBBIRuleScanner::scanSet() {
|
|||
|
||||
// Verify that the set contains at least one code point.
|
||||
//
|
||||
if (uset->charAt(0) == -1) {
|
||||
if (uset->isEmpty()) {
|
||||
// This set is empty.
|
||||
// Make it an error, because it almost certainly is not what the user wanted.
|
||||
// Also, avoids having to think about corner cases in the tree manipulation code
|
||||
|
|
|
@ -94,6 +94,7 @@ RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
|
|||
fTrie = 0;
|
||||
fTrieSize = 0;
|
||||
fGroupCount = 0;
|
||||
fSawBOF = FALSE;
|
||||
}
|
||||
|
||||
|
||||
|
@ -224,7 +225,8 @@ void RBBISetBuilder::build() {
|
|||
//
|
||||
// Numbering: # 0 (state table column 0) is unused.
|
||||
// # 1 is reserved - table column 1 is for end-of-input
|
||||
// # 2 is the first range list.
|
||||
// # 2 is reserved - table column 2 is for beginning-in-input
|
||||
// # 3 is the first range list.
|
||||
//
|
||||
RangeDescriptor *rlSearchRange;
|
||||
for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
|
||||
|
@ -236,20 +238,26 @@ void RBBISetBuilder::build() {
|
|||
}
|
||||
if (rlRange->fNum == 0) {
|
||||
fGroupCount ++;
|
||||
rlRange->fNum = fGroupCount+1;
|
||||
rlRange->fNum = fGroupCount+2;
|
||||
rlRange->setDictionaryFlag();
|
||||
addValToSets(rlRange->fIncludesSets, fGroupCount+1);
|
||||
addValToSets(rlRange->fIncludesSets, fGroupCount+2);
|
||||
}
|
||||
}
|
||||
|
||||
// Handle input sets that contain the special string {eof}.
|
||||
// Column 1 of the state table is reserved for EOF on input.
|
||||
// Add this column value (1) to the equivalent expression
|
||||
// Column 2 is reserved for before-the-start-input.
|
||||
// (This column can be optimized away later if there are no rule
|
||||
// references to {bof}.)
|
||||
// Add this column value (1 or 2) to the equivalent expression
|
||||
// subtree for each UnicodeSet that contains the string {eof}
|
||||
// Because EOF is not a character in the normal sense, it doesn't
|
||||
// affect the computation of ranges or TRIE.
|
||||
// Because {bof} and {eof} are not a characters in the normal sense,
|
||||
// they doesn't affect the computation of ranges or TRIE.
|
||||
static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
|
||||
static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
|
||||
|
||||
UnicodeString eofString(eofUString);
|
||||
UnicodeString bofString(bofUString);
|
||||
for (ni=0; ; ni++) { // Loop over each of the UnicodeSets encountered in the input rules
|
||||
usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
|
||||
if (usetNode==NULL) {
|
||||
|
@ -259,6 +267,10 @@ void RBBISetBuilder::build() {
|
|||
if (inputSet->contains(eofString)) {
|
||||
addValToSet(usetNode, 1);
|
||||
}
|
||||
if (inputSet->contains(bofString)) {
|
||||
addValToSet(usetNode, 2);
|
||||
fSawBOF = TRUE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -367,10 +379,19 @@ void RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
|
|||
//
|
||||
//------------------------------------------------------------------------
|
||||
int32_t RBBISetBuilder::getNumCharCategories() const {
|
||||
return fGroupCount + 2;
|
||||
return fGroupCount + 3;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
// sawBOF
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
UBool RBBISetBuilder::sawBOF() const {
|
||||
return fSawBOF;
|
||||
}
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
@ -82,11 +82,13 @@ public:
|
|||
void addValToSets(UVector *sets, uint32_t val);
|
||||
void addValToSet (RBBINode *usetNode, uint32_t val);
|
||||
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
|
||||
// runtime state machine, which are the same as
|
||||
// columns in the DFA state table
|
||||
// runtime state machine, which are the same as
|
||||
// columns in the DFA state table
|
||||
int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
|
||||
void serializeTrie(uint8_t *where); // write out the serialized Trie.
|
||||
UChar32 getFirstChar(int32_t val) const;
|
||||
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
|
||||
// character were encountered.
|
||||
#ifdef RBBI_DEBUG
|
||||
void printSets();
|
||||
void printRanges();
|
||||
|
@ -116,6 +118,8 @@ private:
|
|||
// column 2 is for group 0. Funny counting.
|
||||
int32_t fGroupCount;
|
||||
|
||||
UBool fSawBOF;
|
||||
|
||||
RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
|
||||
RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
|
||||
};
|
||||
|
|
|
@ -80,6 +80,22 @@ void RBBITableBuilder::build() {
|
|||
fTree->printTree(TRUE);
|
||||
}
|
||||
|
||||
//
|
||||
// If the rules contained any references to {bof}
|
||||
// add a {bof} <cat> <former root of tree> to the
|
||||
// tree. Means that all matches must start out with the
|
||||
// {bof} fake character.
|
||||
//
|
||||
if (fRB->fSetBuilder->sawBOF()) {
|
||||
RBBINode *bofTop = new RBBINode(RBBINode::opCat);
|
||||
RBBINode *bofLeaf = new RBBINode(RBBINode::leafChar);
|
||||
bofTop->fLeftChild = bofLeaf;
|
||||
bofTop->fRightChild = fTree;
|
||||
bofLeaf->fParent = bofTop;
|
||||
bofLeaf->fVal = 2; // Reserved value for {bof}.
|
||||
fTree = bofTop;
|
||||
}
|
||||
|
||||
//
|
||||
// Add a unique right-end marker to the expression.
|
||||
// Appears as a cat-node, left child being the original tree,
|
||||
|
@ -126,6 +142,13 @@ void RBBITableBuilder::build() {
|
|||
calcChainedFollowPos(fTree);
|
||||
}
|
||||
|
||||
//
|
||||
// BOF (start of input) test fixup.
|
||||
//
|
||||
if (fRB->fSetBuilder->sawBOF()) {
|
||||
bofFixup(fTree);
|
||||
}
|
||||
|
||||
//
|
||||
// Build the DFA state transition tables.
|
||||
//
|
||||
|
@ -349,8 +372,15 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Get all nodes that can be the start a match, which is FirstPosition(root)
|
||||
UVector *matchStartNodes = tree->fFirstPosSet;
|
||||
// Get all nodes that can be the start a match, which is FirstPosition()
|
||||
// of the portion of the tree corresponding to user-written rules.
|
||||
// See the tree description in bofFixup().
|
||||
RBBINode *userRuleRoot = tree;
|
||||
if (fRB->fSetBuilder->sawBOF()) {
|
||||
userRuleRoot = tree->fLeftChild->fRightChild;
|
||||
}
|
||||
U_ASSERT(userRuleRoot != NULL);
|
||||
UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
|
||||
|
||||
|
||||
// Iteratate over all leaf nodes,
|
||||
|
@ -417,6 +447,62 @@ void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
|
|||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// bofFixup. Fixup for state tables that include {bof} beginning of input testing.
|
||||
// Do an swizzle similar to chaining, modifying the followPos set of
|
||||
// the bofNode to include the followPos nodes from other {bot} nodes
|
||||
// scattered through the tree.
|
||||
//
|
||||
// This function has much in common with calcChainedFollowPos().
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void RBBITableBuilder::bofFixup(RBBINode *tree) {
|
||||
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// The parse tree looks like this ...
|
||||
// fTree root ---> <cat>
|
||||
// / \
|
||||
// <cat> <#end node>
|
||||
// / \
|
||||
// <bofNode> rest
|
||||
// of tree
|
||||
//
|
||||
// We will be adding things to the followPos set of the <bofNode>
|
||||
//
|
||||
RBBINode *bofNode = fTree->fLeftChild->fLeftChild;
|
||||
U_ASSERT(bofNode->fType == RBBINode::leafChar);
|
||||
U_ASSERT(bofNode->fVal == 2);
|
||||
|
||||
// Get all nodes that can be the start a match of the user-written rules
|
||||
// (excluding the fake bofNode)
|
||||
// We want the nodes that can start a match in the
|
||||
// part labeled "rest of tree"
|
||||
//
|
||||
UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
|
||||
|
||||
RBBINode *startNode;
|
||||
int startNodeIx;
|
||||
for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
|
||||
startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
|
||||
if (startNode->fType != RBBINode::leafChar) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (startNode->fVal == bofNode->fVal) {
|
||||
// We found a leaf node corresponding to a {bof} that was
|
||||
// explicitly written into a rule.
|
||||
// Add everything from the followPos set of this node to the
|
||||
// followPos set of the fake bofNode at the start of the tree.
|
||||
//
|
||||
setAdd(bofNode->fFollowPos, startNode->fFollowPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// buildStateTable() Determine the set of runtime DFA states and the
|
||||
|
@ -958,6 +1044,9 @@ void RBBITableBuilder::exportTable(void *where) {
|
|||
if (fRB->fLookAheadHardBreak) {
|
||||
table->fFlags |= RBBI_LOOKAHEAD_HARD_BREAK;
|
||||
}
|
||||
if (fRB->fSetBuilder->sawBOF()) {
|
||||
table->fFlags |= RBBI_BOF_REQUIRED;
|
||||
}
|
||||
table->fReserved = 0;
|
||||
|
||||
for (state=0; state<table->fNumStates; state++) {
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2002-2004, International Business Machines
|
||||
* Copyright (c) 2002-2005, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
@ -51,6 +51,7 @@ private:
|
|||
void calcLastPos(RBBINode *n);
|
||||
void calcFollowPos(RBBINode *n);
|
||||
void calcChainedFollowPos(RBBINode *n);
|
||||
void bofFixup(RBBINode *n);
|
||||
void buildStateTable();
|
||||
void flagAcceptingStates();
|
||||
void flagLookAheadStates();
|
||||
|
|
Loading…
Add table
Reference in a new issue