ICU-3043 multiple status values implementation

X-SVN-Rev: 14640
This commit is contained in:
Andy Heninger 2004-03-05 05:04:10 +00:00
parent eef53a9191
commit fdb386fb81
10 changed files with 414 additions and 115 deletions

View file

@ -173,11 +173,11 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
UBool RuleBasedBreakIterator::fTrace = FALSE;
void RuleBasedBreakIterator::init() {
fText = NULL;
fData = NULL;
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
fDictionaryCharCount = 0;
fText = NULL;
fData = NULL;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
fDictionaryCharCount = 0;
#ifdef RBBI_DEBUG
static UBool debugInitDone = FALSE;
@ -326,8 +326,8 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
*/
int32_t RuleBasedBreakIterator::first(void) {
reset();
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
if (fText == NULL)
return BreakIterator::DONE;
@ -343,8 +343,8 @@ int32_t RuleBasedBreakIterator::first(void) {
int32_t RuleBasedBreakIterator::last(void) {
reset();
if (fText == NULL) {
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
return BreakIterator::DONE;
}
@ -355,7 +355,7 @@ int32_t RuleBasedBreakIterator::last(void) {
// will work correctly.)
fLastBreakTagValid = FALSE;
fLastStatusIndexValid = FALSE;
int32_t pos = fText->endIndex();
fText->setIndex(pos);
@ -399,8 +399,8 @@ int32_t RuleBasedBreakIterator::next(void) {
int32_t RuleBasedBreakIterator::previous(void) {
// if we're already sitting at the beginning of the text, return DONE
if (fText == NULL || current() == fText->startIndex()) {
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
return BreakIterator::DONE;
}
@ -433,7 +433,7 @@ int32_t RuleBasedBreakIterator::previous(void) {
break;
}
lastResult = result;
lastTag = fLastBreakTag;
lastTag = fLastRuleStatusIndex;
breakTagValid = TRUE;
}
@ -447,8 +447,8 @@ int32_t RuleBasedBreakIterator::previous(void) {
// set the current iteration position to be the last break position
// before where we started, and then return that value
fText->setIndex(lastResult);
fLastBreakTag = lastTag; // for use by getRuleStatus()
fLastBreakTagValid = breakTagValid;
fLastRuleStatusIndex = lastTag; // for use by getRuleStatus()
fLastStatusIndexValid = breakTagValid;
return lastResult;
}
@ -462,17 +462,13 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
fLastBreakTag = 0;
fLastBreakTagValid = TRUE;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
if (fText == NULL || offset >= fText->endIndex()) {
// fText->setToEnd();
// return BreakIterator::DONE;
last();
return next();
}
else if (offset < fText->startIndex()) {
// fText->setToStart();
// return fText->startIndex();
return first();
}
@ -552,6 +548,7 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
// if the offset passed in is already past the end of the text,
// just return DONE; if it's before the beginning, return the
// text's starting offset
if (fText == NULL || offset > fText->endIndex()) {
// return BreakIterator::DONE;
@ -679,11 +676,11 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
}
// No matter what, handleNext always correctly sets the break tag value.
fLastBreakTagValid = TRUE;
fLastStatusIndexValid = TRUE;
// if we're already at the end of the text, return DONE.
if (fText == NULL || fData == NULL || fText->hasNext() == FALSE) {
fLastBreakTag = 0;
fLastRuleStatusIndex = 0;
return BreakIterator::DONE;
}
@ -697,9 +694,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
UChar32 c = fText->current32();
RBBIStateTableRow *row;
int32_t lookaheadStatus = 0;
int32_t lookaheadTag = 0;
int32_t lookaheadTagIdx = 0;
fLastBreakTag = 0;
fLastRuleStatusIndex = 0;
row = (RBBIStateTableRow *) // Point to starting row of state table.
(statetable->fTableData + (statetable->fRowLen * state));
@ -724,8 +721,8 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
// We ran off the end of the string with a pending look-ahead match.
// Treat this as if the look-ahead condition had been met, and return
// the match at the / position from the look-ahead rule.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
} else if (result == initialPosition) {
// Ran off end, no match found.
@ -778,7 +775,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
// Match found, common case, could have lookahead so we move on to check it
result = fText->getIndex();
/// added
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values.
}
if (row->fLookAhead != 0) {
@ -786,9 +783,9 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0;
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
/// i think we have to back up to read the lookahead character again
/// fText->setIndex(lookaheadResult);
/// TODO: this is a simple hack since reverse rules only have simple
@ -802,7 +799,7 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
int32_t r = fText->getIndex();
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
lookaheadTagIdx = row->fTagIdx;
goto continueOn;
}
@ -856,7 +853,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0;
int32_t lookaheadResult = 0;
int32_t lookaheadTag = 0;
int32_t lookaheadTagIdx = 0;
UChar32 c = fText->current32();
RBBIStateTableRow *row;
@ -927,7 +924,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
if (r > result) {
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
lookaheadTagIdx = row->fTagIdx;
}
goto continueOn;
}
@ -938,8 +935,8 @@ int32_t RuleBasedBreakIterator::handlePrevious(void) {
if (lookaheadResult > result) {
U_ASSERT(row->fAccepting == lookaheadStatus); // TODO: handle this case
// of overlapping lookahead matches.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
}
goto continueOn;
@ -986,7 +983,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
}
// break tag is no longer valid after icu switched to exact backwards
// positioning.
fLastBreakTagValid = FALSE;
fLastStatusIndexValid = FALSE;
if (statetable == NULL) {
return fText->setToStart();
}
@ -1000,7 +997,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0;
int32_t lookaheadResult = 0;
int32_t lookaheadTag = 0;
int32_t lookaheadTagIdx = 0;
UBool lookAheadHardBreak = (statetable->fFlags & RBBI_LOOKAHEAD_HARD_BREAK) != 0;
RBBIStateTableRow *row;
@ -1060,7 +1057,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
// Match found, common case, could have lookahead so we move on to check it
result = fText->getIndex();
/// added
fLastBreakTag = row->fTag; // Remember the break status (tag) value.
fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) value.
}
if (row->fLookAhead != 0) {
@ -1068,9 +1065,9 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
fLastBreakTag = lookaheadTag;
lookaheadStatus = 0;
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;
lookaheadStatus = 0;
/// i think we have to back up to read the lookahead character again
/// fText->setIndex(lookaheadResult);
/// TODO: this is a simple hack since reverse rules only have simple
@ -1092,10 +1089,10 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
goto continueOn;
}
int32_t r = fText->getIndex();
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
lookaheadTag = row->fTag;
int32_t r = fText->getIndex();
lookaheadResult = r;
lookaheadStatus = row->fLookAhead;
fLastRuleStatusIndex = row->fTagIdx;
goto continueOn;
}
@ -1148,19 +1145,18 @@ RuleBasedBreakIterator::reset()
// status while doing the next().
//
//-------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::getRuleStatus() const {
RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
if (fLastBreakTagValid == FALSE) {
void RuleBasedBreakIterator::makeRuleStatusValid() {
if (fLastStatusIndexValid == FALSE) {
// No cached status is available.
if (fText == NULL || current() == fText->startIndex()) {
// At start of text, or there is no text. Status is always zero.
nonConstThis->fLastBreakTag = 0;
nonConstThis->fLastBreakTagValid = TRUE;
fLastRuleStatusIndex = 0;
fLastStatusIndexValid = TRUE;
} else {
// Not at start of text. Find status the tedious way.
int32_t pa = current();
nonConstThis->previous();
int32_t pb = nonConstThis->next();
previous();
int32_t pb = next();
if (pa != pb) {
// note: the if (pa != pb) test is here only to eliminate warnings for
// unused local variables on gcc. Logically, it isn't needed.
@ -1168,10 +1164,51 @@ int32_t RuleBasedBreakIterator::getRuleStatus() const {
}
}
}
return nonConstThis->fLastBreakTag;
U_ASSERT(fLastStatusIndexValid == TRUE);
U_ASSERT(fLastRuleStatusIndex >= 0 && fLastRuleStatusIndex < fData->fStatusMaxIdx);
}
int32_t RuleBasedBreakIterator::getRuleStatus() const {
    // init() leaves fData NULL until rule data is attached, and handleNext()
    // already treats a NULL fData as "nothing to iterate over".  With no
    // status table to index, report the default rule status of zero rather
    // than dereferencing a NULL pointer.
    if (fData == NULL) {
        return 0;
    }

    // This function is const for callers, but bringing the cached status
    // index up to date may require moving the iterator back and forth, so
    // constness is cast away for that one call.
    RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
    nonConstThis->makeRuleStatusValid();

    // fLastRuleStatusIndex indexes to the start of the appropriate status record
    //                                                 (the number of status values.)
    //   The record is the count followed by that many values, so adding the
    //   count to the record's index lands on the last value of the record.
    //   This function returns that last (largest) of the array of status values.
    int32_t  idx    = fLastRuleStatusIndex + fData->fRuleStatusTable[fLastRuleStatusIndex];
    int32_t  tagVal = fData->fRuleStatusTable[idx];

    return tagVal;
}
int32_t RuleBasedBreakIterator::getRuleStatusVec(
             int32_t *fillInVec, int32_t capacity, UErrorCode &status)
{
    // Copy the rule status values for the most recent boundary into fillInVec.
    //
    //   fillInVec  destination array; may be NULL only when capacity is zero.
    //   capacity   number of int32_t slots available in fillInVec.
    //   status     in/out error code.  Nothing is done if it already indicates
    //              a failure.  Set to a warning when there are more values
    //              than capacity allows.
    //   returns    the total number of status values available, which may be
    //              larger than the number actually copied.
    if (U_FAILURE(status)) {
        return 0;
    }
    if (fillInVec == NULL && capacity > 0) {
        // The caller claims room for values but supplied nowhere to put them.
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // A status record is a count followed by that many values.  When no rule
    // data is attached (fData is NULL after init()), fall back to a record
    // holding the single default status of zero, the same shape as the default
    // record the rule builder places at the start of the status table.
    static const int32_t defaultRecord[] = {1, 0};
    const int32_t *record = defaultRecord;
    if (fData != NULL) {
        // This function is not const, so the cached index can be refreshed
        // directly; no cast of 'this' is required.
        makeRuleStatusValid();
        record = fData->fRuleStatusTable + fLastRuleStatusIndex;
    }

    int32_t  numVals       = record[0];
    int32_t  numValsToCopy = numVals;
    if (numVals > capacity) {
        status = U_STRING_NOT_TERMINATED_WARNING;   // TODO:  probably need a different warning
        numValsToCopy = capacity;
    }

    int32_t i;
    for (i=0; i<numValsToCopy; i++) {
        fillInVec[i] = record[i + 1];
    }
    return numVals;
}
//-------------------------------------------------------------------------------
//
// getBinaryRules Access to the compiled form of the rules,

View file

@ -104,6 +104,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
fRuleSource = (UChar *)((char *)data + fHeader->fRuleSource);
fRuleString.setTo(TRUE, fRuleSource, -1);
U_ASSERT(data->fRuleSourceLen > 0);
fRuleStatusTable = (int32_t *)((char *)data + fHeader->fStatusTable);
fStatusMaxIdx = data->fStatusTableLen / sizeof(int32_t);
fRefCount = 1;
@ -116,7 +120,7 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
//-----------------------------------------------------------------------------
//
// Destructor. Don't call this - use removeReferenc() instead.
// Destructor. Don't call this - use removeReference() instead.
//
//-----------------------------------------------------------------------------
RBBIDataWrapper::~RBBIDataWrapper() {
@ -202,7 +206,7 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
RBBIDebugPrintf(" %s\n", heading);
RBBIDebugPrintf("State | Acc LA Tag");
RBBIDebugPrintf("State | Acc LA TagIx");
for (c=0; c<fHeader->fCatCount; c++) {RBBIDebugPrintf("%3d ", c);}
RBBIDebugPrintf("\n------|---------------"); for (c=0;c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("----");
@ -216,7 +220,7 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
for (s=0; s<table->fNumStates; s++) {
RBBIStateTableRow *row = (RBBIStateTableRow *)
(table->fTableData + (table->fRowLen * s));
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTag);
RBBIDebugPrintf("%4d | %3d %3d %3d ", s, row->fAccepting, row->fLookAhead, row->fTagIdx);
for (c=0; c<fHeader->fCatCount; c++) {
RBBIDebugPrintf("%3d ", row->fNextState[c]);
}
@ -247,6 +251,7 @@ void RBBIDataWrapper::printData() {
}
#endif
U_NAMESPACE_END
//-----------------------------------------------------------------------------
@ -403,6 +408,10 @@ ubrk_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outD
ds->swapArray16(ds, inBytes+ds->readUInt32(rbbiDH->fRuleSource), ds->readUInt32(rbbiDH->fRuleSourceLen),
outBytes+ds->readUInt32(rbbiDH->fRuleSource), status);
// Table of rule status values. It's all int32_t values
ds->swapArray32(ds, inBytes+ds->readUInt32(rbbiDH->fStatusTable), ds->readUInt32(rbbiDH->fStatusTableLen),
outBytes+ds->readUInt32(rbbiDH->fStatusTable), status);
// And, last, the header. All 32 bit values.
ds->swapArray32(ds, inBytes, sizeof(RBBIDataHeader), outBytes, status);

View file

@ -68,42 +68,50 @@ struct RBBIDataHeader {
/* All offsets are bytes from the start of the RBBIDataHeader. */
/* All sizes are in bytes. */
/* */
uint32_t fFTable; /* forward state transition table. */
uint32_t fFTable; /* forward state transition table. */
uint32_t fFTableLen;
uint32_t fRTable; /* Offset to the reverse state transition table. */
uint32_t fRTable; /* Offset to the reverse state transition table. */
uint32_t fRTableLen;
uint32_t fSFTable; /* safe point forward transition table */
uint32_t fSFTable; /* safe point forward transition table */
uint32_t fSFTableLen;
uint32_t fSRTable; /* safe point reverse transition table */
uint32_t fSRTable; /* safe point reverse transition table */
uint32_t fSRTableLen;
uint32_t fTrie; /* Offset to Trie data for character categories */
uint32_t fTrie; /* Offset to Trie data for character categories */
uint32_t fTrieLen;
uint32_t fRuleSource; /* Offset to the source for for the break */
uint32_t fRuleSource; /* Offset to the source for for the break */
uint32_t fRuleSourceLen; /* rules. Stored UChar *. */
uint32_t fStatusTable; /* Offset to the table of rule status values */
uint32_t fStatusTableLen;
uint32_t fReserved[8]; /* Reserved for expansion */
uint32_t fReserved[6]; /* Reserved for expansion */
};
struct RBBIStateTableRow {
int16_t fAccepting; /* Non-zero if this row is for an accepting state. */
/* Value is the {nnn} value to return to calling */
/* application. */
int16_t fLookAhead; /* Non-zero if this row is for a state that */
/* corresponds to a '/' in the rule source. */
/* Value is the same as the fAccepting */
/* value for the rule (which will appear */
/* in a different state. */
int16_t fTag; /* Non-zero if this row covers a {tagged} position */
/* from a rule. value is the tag number. */
int16_t fAccepting; /* Non-zero if this row is for an accepting state. */
/* Value 0: not an accepting state. */
/* -1: Unconditional Accepting state. */
/* positive: Look-ahead match has completed. */
/* Actual boundary position happened earlier */
/* Value here == fLookAhead in earlier */
/* state, at actual boundary pos. */
int16_t fLookAhead; /* Non-zero if this row is for a state that */
/* corresponds to a '/' in the rule source. */
/* Value is the same as the fAccepting */
/* value for the rule (which will appear */
/* in a different state. */
int16_t fTagIdx; /* Non-zero if this row covers a {tagged} position */
/* from a rule. Value is the index in the */
/* StatusTable of the set of matching */
/* tags (rule status values) */
int16_t fReserved;
uint16_t fNextState[2]; /* Next State, indexed by char category. */
/* Array Size is fNumCols from the */
/* state table header. */
/* CAUTION: see RBBITableBuilder::getTableSize() */
/* before changing anything here. */
uint16_t fNextState[2]; /* Next State, indexed by char category. */
/* Array Size is fNumCols from the */
/* state table header. */
/* CAUTION: see RBBITableBuilder::getTableSize() */
/* before changing anything here. */
};
@ -122,9 +130,9 @@ typedef enum {
} RBBIStateTableFlags;
/* */
/* */
/* The reference counting wrapper class */
/* */
/* */
class RBBIDataWrapper : public UMemory {
public:
RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);
@ -145,15 +153,19 @@ public:
#define printTable(heading, table)
#endif
/* */
/* */
/* Pointers to items within the data */
/* */
/* */
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
const RBBIStateTable *fReverseTable;
const RBBIStateTable *fSafeFwdTable;
const RBBIStateTable *fSafeRevTable;
const UChar *fRuleSource;
const int32_t *fRuleStatusTable;
/* number of int32_t values in the rule status table. Used to sanity check indexing */
int32_t fStatusMaxIdx;
UTrie fTrie;

View file

@ -64,22 +64,27 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
fReverseTables = NULL;
fSafeFwdTables = NULL;
fSafeRevTables = NULL;
fRuleStatusVals = NULL;
fChainRules = FALSE;
fLBCMNoChain = FALSE;
fLookAheadHardBreak = FALSE;
fUSetNodes = NULL;
fRuleStatusVals = NULL;
fScanner = NULL;
fSetBuilder = NULL;
UErrorCode oldstatus = status;
fUSetNodes = new UVector(status); // bcos status gets overwritten here
fScanner = new RBBIRuleScanner(this);
fSetBuilder = new RBBISetBuilder(this);
if (U_FAILURE(oldstatus)) {
status = oldstatus;
}
if (U_FAILURE(status)) {
return;
}
if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0) {
fUSetNodes = new UVector(status); // bcos status gets overwritten here
fRuleStatusVals = new UVector(status);
fScanner = new RBBIRuleScanner(this);
fSetBuilder = new RBBISetBuilder(this);
if (U_FAILURE(status)) {
return;
}
if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
}
}
@ -114,6 +119,7 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
delete fSafeFwdTree;
delete fSafeRevTree;
delete fScanner;
delete fRuleStatusVals;
}
@ -130,6 +136,8 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
// Round i up to the nearest multiple of 8 by adding 7 and clearing the low
// three bits.  Used to size each section of the flattened data.
static int32_t align8(int32_t i) {
    return (i + 7) & ~7;
}
RBBIDataHeader *RBBIRuleBuilder::flattenData() {
int32_t i;
if (U_FAILURE(*fStatus)) {
return NULL;
}
@ -148,10 +156,13 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar));
int32_t totalSize = headerSize + forwardTableSize + reverseTableSize
+ safeFwdTableSize + safeRevTableSize + trieSize + rulesSize;
+ safeFwdTableSize + safeRevTableSize
+ statusTableSize + trieSize + rulesSize;
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
if (data == NULL) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;
@ -176,7 +187,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fTrie = data->fSRTable + safeRevTableSize;
data->fTrieLen = fSetBuilder->getTrieSize();
data->fRuleSource = data->fTrie + trieSize;
data->fStatusTable = data->fTrie + trieSize;
data->fStatusTableLen= statusTableSize;
data->fRuleSource = data->fStatusTable + statusTableSize;
data->fRuleSourceLen = strippedRules.length() * sizeof(UChar);
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
@ -186,6 +199,12 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable);
for (i=0; i<fRuleStatusVals->size(); i++) {
ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
}
strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus);
return data;
@ -251,6 +270,10 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
return NULL;
}
if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
builder.fForwardTables->printRuleStatusTable();
}
//
// Package up the compiled data into a memory image
// in the run-time format.

View file

@ -153,6 +153,9 @@ public:
RBBITableBuilder *fSafeFwdTables;
RBBITableBuilder *fSafeRevTables;
UVector *fRuleStatusVals; // The values that can be returned
// from getRuleStatus().
RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
// data tables..
private:

View file

@ -133,8 +133,15 @@ void RBBITableBuilder::build() {
flagAcceptingStates();
flagLookAheadStates();
flagTaggedStates();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
//
// Update the global table of rule status {tag} values
// The rule builder has a global vector of status values that are common
// for all tables. Merge the ones from this table into the global set.
//
mergeRuleStatusVals();
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
@ -637,18 +644,142 @@ void RBBITableBuilder::flagTaggedStates() {
}
for (i=0; i<tagNodes.size(); i++) { // For each tag node t (all of 'em)
tagNode = (RBBINode *)tagNodes.elementAt(i);
for (n=0; n<fDStates->size(); n++) { // For each state s (row in the state table)
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
if (sd->fPositions->indexOf(tagNode) >= 0) { // if s include the tag node t
if (sd->fTagVal < tagNode->fVal) {
// If more than one rule tag applies to this state, the larger
// tag takes precedence.
sd->fTagVal = tagNode->fVal;
sortedAdd(&sd->fTagVals, tagNode->fVal);
}
}
}
}
//-----------------------------------------------------------------------------
//
// mergeRuleStatusVals
//
// Update the global table of rule status {tag} values
// The rule builder has a global vector of status values that are common
// for all tables. Merge the ones from this table into the global set.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::mergeRuleStatusVals() {
//
// The basic outline of what happens here is this...
//
// for each state in this state table
// if the status tag list for this state is in the global statuses list
// record where and
// continue with the next state
// else
// add the tag list for this state to the global list.
//
int i;
int n;
// Pre-set a single tag of {0} into the table.
// We will need this as a default, for rule sets with no explicit tagging.
if (fRB->fRuleStatusVals->size() == 0) {
fRB->fRuleStatusVals->addElement(1, *fStatus); // Num of statuses in group
fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus); // and our single status of zero
}
// For each state
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
UVector *thisStatesTagValues = sd->fTagVals;
if (thisStatesTagValues == NULL) {
// No tag values are explicitly associated with this state.
// Set the default tag value.
sd->fTagsIdx = 0;
continue;
}
// There are tag(s) associated with this state.
// fTagsIdx will be the index into the global tag list for this state's tag values.
// Initial value of -1 flags that we haven't got it set yet.
sd->fTagsIdx = -1;
int32_t thisTagGroupStart = 0; // indexes into the global rule status vals list
int32_t nextTagGroupStart = 0;
// Loop runs once per group of tags in the global list
while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
thisTagGroupStart = nextTagGroupStart;
nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
// The number of tags for this state is different from
// the number of tags in this group from the global list.
// Continue with the next group from the global list.
continue;
}
// The lengths match, go ahead and compare the actual tag values
// between this state and the group from the global list.
for (i=0; i<thisStatesTagValues->size(); i++) {
if (thisStatesTagValues->elementAti(i) !=
fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
// Mismatch.
break;
}
}
if (i == thisStatesTagValues->size()) {
// We found a set of tag values in the global list that match
// those for this state. Use them.
sd->fTagsIdx = thisTagGroupStart;
break;
}
}
if (sd->fTagsIdx == -1) {
// No suitable entry in the global tag list already. Add one
sd->fTagsIdx = fRB->fRuleStatusVals->size();
fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
for (i=0; i<thisStatesTagValues->size(); i++) {
fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
}
}
}
}
//-----------------------------------------------------------------------------
//
// sortedAdd Add a value to a vector of sorted values (ints).
// Do not replicate entries; if the value is already there, do not
// add a second one.
// Lazily create the vector if it does not already exist.
//
//-----------------------------------------------------------------------------
void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
    // Insert val into *vector, keeping the contents in ascending order and
    // free of duplicates.
    //
    //   vector   address of the vector pointer.  If the pointer is NULL a new
    //            UVector is allocated and stored through it.
    //   val      the value to add.
    //
    // Failures are reported through *fStatus; nothing is inserted once it
    // indicates a failure.
    int32_t i;

    if (*vector == NULL) {
        *vector = new UVector(*fStatus);
    }
    if (*vector == NULL) {
        // Allocation failed.  Record it instead of returning silently, but do
        // not overwrite a failure that was already being reported.
        if (!U_FAILURE(*fStatus)) {
            *fStatus = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }
    if (U_FAILURE(*fStatus)) {
        return;
    }

    UVector *vec = *vector;
    int32_t  vSize = vec->size();
    for (i=0; i<vSize; i++) {
        int32_t valAtI = vec->elementAti(i);
        if (valAtI == val) {
            // The value is already in the vector.  Don't add it again.
            return;
        }
        if (valAtI > val) {
            // First element larger than val; this is the insertion point.
            break;
        }
    }
    // i is either the index of the first larger element, or vSize (append).
    vec->insertElementAt(val, i, *fStatus);
}
@ -678,6 +809,7 @@ void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
}
//-----------------------------------------------------------------------------
//
// setEqual Set operation on UVector.
@ -818,7 +950,7 @@ void RBBITableBuilder::exportTable(void *where) {
U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
row->fAccepting = (int16_t)sd->fAccepting;
row->fLookAhead = (int16_t)sd->fLookAhead;
row->fTag = (int16_t)sd->fTagVal;
row->fTagIdx = (int16_t)sd->fTagsIdx;
for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
}
@ -856,16 +988,20 @@ void RBBITableBuilder::printStates() {
RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
RBBIDebugPrintf(" | Acc LA Tag");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {RBBIDebugPrintf(" %2d", c);};
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf(" %2d", c);
}
RBBIDebugPrintf("\n");
RBBIDebugPrintf(" |---------------");
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {RBBIDebugPrintf("---");};
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf("---");
}
RBBIDebugPrintf("\n");
for (n=0; n<fDStates->size(); n++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
RBBIDebugPrintf(" %3d | " , n);
RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagVal);
RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
}
@ -877,6 +1013,33 @@ void RBBITableBuilder::printStates() {
//-----------------------------------------------------------------------------
//
// printRuleStatusTable Debug Function. Dump the common rule status table
//
//-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBITableBuilder::printRuleStatusTable() {
int32_t thisRecord = 0;
int32_t nextRecord = 0;
int i;
UVector *tbl = fRB->fRuleStatusVals;
RBBIDebugPrintf("index | tags \n");
RBBIDebugPrintf("-------------------\n");
while (nextRecord < tbl->size()) {
thisRecord = nextRecord;
nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
RBBIDebugPrintf("%4d ", thisRecord);
for (i=thisRecord+1; i<nextRecord; i++) {
RBBIDebugPrintf(" %5d", tbl->elementAti(i));
}
RBBIDebugPrintf("\n");
}
RBBIDebugPrintf("\n\n");
}
#endif
//-----------------------------------------------------------------------------
@ -890,19 +1053,15 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
fMarked = FALSE;
fAccepting = 0;
fLookAhead = 0;
fTagVal = 0;
fTagsIdx = 0;
fTagVals = NULL;
fPositions = NULL;
fDtran = NULL;
UErrorCode status = U_ZERO_ERROR;
fDtran = new UVector(lastInputSymbol+1, status);
fDtran = new UVector(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
}
if (U_FAILURE(status)) {
*fStatus = status;
return;
}
if (fDtran == NULL) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return;
@ -917,8 +1076,10 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
RBBIStateDescriptor::~RBBIStateDescriptor() {
    // Release each vector held by this state descriptor, clearing the
    // pointer immediately after its vector has been deleted.
    delete fPositions;
    fPositions = NULL;

    delete fDtran;
    fDtran = NULL;

    delete fTagVals;
    fTagVals = NULL;
}
U_NAMESPACE_END

View file

@ -55,6 +55,7 @@ private:
void flagAcceptingStates();
void flagLookAheadStates();
void flagTaggedStates();
void mergeRuleStatusVals();
// Set functions for UVector.
// TODO: make a USet subclass of UVector
@ -62,14 +63,19 @@ private:
void setAdd(UVector *dest, UVector *source);
UBool setEquals(UVector *a, UVector *b);
void sortedAdd(UVector **dest, int32_t val);
public:
#ifdef RBBI_DEBUG
void printSet(UVector *s);
void printPosSets(RBBINode *n /* = NULL*/);
void printStates();
void printRuleStatusTable();
#else
#define printSet(s)
#define printPosSets(n)
#define printStates()
#define printRuleStatusTable()
#endif
private:
@ -95,7 +101,8 @@ public:
UBool fMarked;
int32_t fAccepting;
int32_t fLookAhead;
int32_t fTagVal;
UVector *fTagVals;
int32_t fTagsIdx;
UVector *fPositions; // Set of parse tree positions associated
// with this state. Unordered (it's a set).
// UVector contents are RBBINode *

View file

@ -63,10 +63,10 @@ protected:
*/
RBBIDataWrapper *fData;
/** Rule {tag} value for the most recent match.
/** Index of the Rule {tag} values for the most recent match.
* @internal
*/
int32_t fLastBreakTag;
int32_t fLastRuleStatusIndex;
/**
* Rule tag value valid flag.
@ -74,7 +74,7 @@ protected:
* This flag lets us lazily compute the value if we are ever asked for it.
* @internal
*/
UBool fLastBreakTagValid;
UBool fLastStatusIndexValid;
/**
* Counter for the number of characters encountered with the "dictionary"
@ -349,6 +349,24 @@ public:
*/
virtual int32_t getRuleStatus() const;
/**
* Get the statuses from the break rules that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
* <p>
* For word break iterators, the possible values are defined in enum UWordBreak.
* @param fillInVec an array to be filled in with the status values.
* @param capacity the length of the supplied vector. A length of zero causes
* the function to return the number of status values, in the
* normal way, without attemtping to store any values.
* @param status receives error codes.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
* @draft ICU 3.0
*/
virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
/**
* Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
* This method is to implement a simple version of RTTI, since not all
@ -496,6 +514,12 @@ private:
* @internal
*/
int32_t handleNext(const RBBIStateTable *statetable);
/**
* @internal
*/
void makeRuleStatusValid();
};
//------------------------------------------------------------------------------

View file

@ -521,6 +521,26 @@ ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi);
/**
* Get the statuses from the break rules that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. The default status value for rules
* that do not explicitly provide one is zero.
* <p>
* For word break iterators, the possible values are defined in enum UWordBreak.
* @param bi The break iterator to use
* @param fillInVec an array to be filled in with the status values.
* @param capacity the length of the supplied vector. A length of zero causes
* the function to return the number of status values, in the
* normal way, without attempting to store any values.
* @param status receives error codes.
* @return The number of rule status values from rules that determined
* the most recent boundary returned by the break iterator.
* @draft ICU 3.0
*/
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity, UErrorCode *status);
/**
* Return the locale of the break iterator. You can choose between the valid and
* the actual locale.

View file

@ -66,6 +66,9 @@ UVector::UVector(UObjectDeleter *d, UKeyComparator *c, int32_t initialCapacity,
}
void UVector::_init(int32_t initialCapacity, UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
// Fix bogus initialCapacity values; avoid malloc(0)
if (initialCapacity < 1) {
initialCapacity = DEFUALT_CAPACITY;