mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 23:10:40 +00:00
ICU-2924 RBBI, minor tweaks to matching engine.
X-SVN-Rev: 13469
This commit is contained in:
parent
d52bbb8da6
commit
d466dc343d
1 changed file with 35 additions and 27 deletions
|
@@ -572,7 +572,7 @@ int32_t RuleBasedBreakIterator::current(void) const {
|
|||
// vectors through here. This method initializes the state machine to state 1
|
||||
// and advances through the text character by character until we reach the end
|
||||
// of the text or the state machine transitions to state 0. We update our return
|
||||
// value every time the state machine passes through a possible end state.
|
||||
// value every time the state machine passes through an accepting state.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handleNext(void) {
|
||||
|
@@ -584,17 +584,13 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
fLastBreakTagValid = TRUE;
|
||||
|
||||
// if we're already at the end of the text, return DONE.
|
||||
if (fText == NULL || fData == NULL || fText->getIndex() == fText->endIndex()) {
|
||||
if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) {
|
||||
fLastBreakTag = 0;
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// no matter what, we always advance at least one character forward
|
||||
int32_t temp = fText->getIndex();
|
||||
fText->next32();
|
||||
int32_t result = fText->getIndex();
|
||||
fText->setIndex(temp);
|
||||
|
||||
int32_t initialPosition = fText->getIndex();
|
||||
int32_t result = initialPosition;
|
||||
int32_t lookaheadResult = 0;
|
||||
|
||||
// Initialize the state machine. Begin in state 1
|
||||
|
@@ -621,15 +617,29 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
// loop until we reach the end of the text or transition to state 0
|
||||
for (;;) {
|
||||
if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
|
||||
// Note: CharacterIterator::DONE is 0xffff, which is also a legal
|
||||
// character value. Check for DONE first, because it's quicker,
|
||||
// but also need to check fText->hasNext() to be certain.
|
||||
// Reached end of input string.
|
||||
// Note: CharacterIterator::DONE is 0xffff, which is also a legal
|
||||
// character value. Check for DONE first, because it's quicker,
|
||||
// but also need to check fText->hasNext() to be certain.
|
||||
|
||||
if (lookaheadResult > result) {
|
||||
// We ran off the end of the string with a pending look-ahead match.
|
||||
// Treat this as if the look-ahead condition had been met, and return
|
||||
// the match at the / position from the look-ahead rule.
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
lookaheadStatus = 0;
|
||||
} else if (result == initialPosition) {
|
||||
// Ran off end, no match found.
|
||||
// Treat as a break at the end of the input string.
|
||||
result = fText->endIndex();
|
||||
}
|
||||
break;
|
||||
}
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in.
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
UTRIE_GET16(&fData->fTrie, c, category);
|
||||
|
||||
|
@@ -662,8 +672,6 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
// Get the next character. Doing it here positions the iterator
|
||||
// to the correct position for recording matches in the code that
|
||||
// follows.
|
||||
// TODO: 16 bit next, and a 16 bit TRIE lookup, with escape code
|
||||
// for non-BMP chars, would be faster.
|
||||
c = fText->next32();
|
||||
|
||||
if (row->fAccepting == 0 && row->fLookAhead == 0) {
|
||||
|
@@ -710,25 +718,25 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
|
|||
|
||||
continueOn:
|
||||
if (state == STOP_STATE) {
|
||||
// This is the normal exit from the lookup state machine.
|
||||
// We have advanced through the string until it is certain that no
|
||||
// longer match is possible, no matter what characters follow.
|
||||
break;
|
||||
}
|
||||
|
||||
// c = fText->next32();
|
||||
}
|
||||
|
||||
// if we've run off the end of the text, and the very last character took us into
|
||||
// a lookahead state, advance the break position to the lookahead position
|
||||
// (the theory here is that if there are no characters at all after the lookahead
|
||||
// position, that always matches the lookahead criteria)
|
||||
// TODO: is this really the right behavior?
|
||||
if (c == CharacterIterator::DONE &&
|
||||
fText->hasNext()==FALSE &&
|
||||
lookaheadResult == fText->endIndex()) {
|
||||
result = lookaheadResult;
|
||||
fLastBreakTag = lookaheadTag;
|
||||
// The state machine is done. Check whether it found a match...
|
||||
|
||||
// If the iterator failed to advance in the match engine, force it ahead by one.
|
||||
// (This really indicates a defect in the break rules. They should always match
|
||||
// at least one character.)
|
||||
if (result == initialPosition) {
|
||||
result = fText->setIndex(initialPosition);
|
||||
fText ->next32();
|
||||
result = fText->getIndex();
|
||||
}
|
||||
|
||||
|
||||
// Leave the iterator at our result position.
|
||||
fText->setIndex(result);
|
||||
if (fTrace) {
|
||||
RBBIDebugPrintf("result = %d\n\n", result);
|
||||
|
|
Loading…
Add table
Reference in a new issue