mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 05:55:35 +00:00
ICU-13194 rbbi safe rule synth, work in progress.
X-SVN-Rev: 41118
This commit is contained in:
parent
5b55224ac5
commit
660d38bc7f
12 changed files with 302 additions and 146 deletions
|
@ -1100,6 +1100,91 @@ int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
|
|||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// handleSafePrevious()
|
||||
//
|
||||
// Iterate backwards using the safe reverse rules.
|
||||
// The logic of this function is similar to handleNext(), but simpler
|
||||
// because the safe table does not require as many options.
|
||||
//
|
||||
//-----------------------------------------------------------------------------------
|
||||
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
|
||||
int32_t state;
|
||||
uint16_t category = 0;
|
||||
RBBIStateTableRow *row;
|
||||
UChar32 c;
|
||||
int32_t result = 0;
|
||||
|
||||
const RBBIStateTable *stateTable = fData->fSafeRevTable;
|
||||
UTEXT_SETNATIVEINDEX(&fText, fromPosition);
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
RBBIDebugPuts("Handle Previous pos char state category");
|
||||
}
|
||||
#endif
|
||||
|
||||
// if we're already at the start of the text, return DONE.
|
||||
if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) {
|
||||
return BreakIterator::DONE;
|
||||
}
|
||||
|
||||
// Set the initial state for the state machine
|
||||
c = UTEXT_PREVIOUS32(&fText);
|
||||
state = START_STATE;
|
||||
row = (RBBIStateTableRow *)
|
||||
(stateTable->fTableData + (stateTable->fRowLen * state));
|
||||
|
||||
// loop until we reach the start of the text or transition to state 0
|
||||
//
|
||||
for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
|
||||
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
|
||||
// not the size of the character going in, which is a UChar32.
|
||||
//
|
||||
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = UTRIE2_GET16(fData->fTrie, c);
|
||||
category &= ~0x4000;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
|
||||
if (0x20<=c && c<0x7f) {
|
||||
RBBIDebugPrintf("\"%c\" ", c);
|
||||
} else {
|
||||
RBBIDebugPrintf("%5x ", c);
|
||||
}
|
||||
RBBIDebugPrintf("%3d %3d\n", state, category);
|
||||
}
|
||||
#endif
|
||||
|
||||
// State Transition - move machine to its next state
|
||||
//
|
||||
// fNextState is a variable-length array.
|
||||
U_ASSERT(category<fData->fHeader->fCatCount);
|
||||
state = row->fNextState[category]; /*Not accessing beyond memory*/
|
||||
row = (RBBIStateTableRow *)
|
||||
(stateTable->fTableData + (stateTable->fRowLen * state));
|
||||
|
||||
if (state == STOP_STATE) {
|
||||
// This is the normal exit from the lookup state machine.
|
||||
// Transistion to state zero means we have found a safe point.
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
|
||||
#ifdef RBBI_DEBUG
|
||||
if (gTrace) {
|
||||
RBBIDebugPrintf("result = %d\n\n", result);
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// getRuleStatus() Return the break rule tag associated with the current
|
||||
|
|
|
@ -80,8 +80,6 @@ UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
|
|||
void RBBIDataWrapper::init0() {
|
||||
fHeader = NULL;
|
||||
fForwardTable = NULL;
|
||||
fReverseTable = NULL;
|
||||
fSafeFwdTable = NULL;
|
||||
fSafeRevTable = NULL;
|
||||
fRuleSource = NULL;
|
||||
fRuleStatusTable = NULL;
|
||||
|
@ -108,25 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
|
|||
if (data->fFTableLen != 0) {
|
||||
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
|
||||
}
|
||||
if (data->fRTableLen != 0) {
|
||||
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
|
||||
}
|
||||
if (data->fSFTableLen != 0) {
|
||||
fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
|
||||
}
|
||||
if (data->fSRTableLen != 0) {
|
||||
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
|
||||
}
|
||||
|
||||
// Rule Compatibility Hacks
|
||||
// If a rule set includes reverse rules but does not explicitly include safe reverse rules,
|
||||
// the reverse rules are to be treated as safe reverse rules.
|
||||
|
||||
if (fSafeRevTable == NULL && fReverseTable != NULL) {
|
||||
fSafeRevTable = fReverseTable;
|
||||
fReverseTable = NULL;
|
||||
}
|
||||
|
||||
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
|
||||
(uint8_t *)data + fHeader->fTrie,
|
||||
fHeader->fTrieLen,
|
||||
|
@ -276,8 +259,6 @@ void RBBIDataWrapper::printData() {
|
|||
RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
|
||||
|
||||
printTable("Forward State Transition Table", fForwardTable);
|
||||
printTable("Reverse State Transition Table", fReverseTable);
|
||||
printTable("Safe Forward State Transition Table", fSafeFwdTable);
|
||||
printTable("Safe Reverse State Transition Table", fSafeRevTable);
|
||||
|
||||
RBBIDebugPrintf("\nOrignal Rules source:\n");
|
||||
|
|
|
@ -173,8 +173,6 @@ public:
|
|||
/* */
|
||||
const RBBIDataHeader *fHeader;
|
||||
const RBBIStateTable *fForwardTable;
|
||||
const RBBIStateTable *fReverseTable;
|
||||
const RBBIStateTable *fSafeFwdTable;
|
||||
const RBBIStateTable *fSafeRevTable;
|
||||
const UChar *fRuleSource;
|
||||
const int32_t *fRuleStatusTable;
|
||||
|
|
|
@ -63,8 +63,6 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
|
|||
fSafeRevTree = NULL;
|
||||
fDefaultTree = &fForwardTree;
|
||||
fForwardTables = NULL;
|
||||
fReverseTables = NULL;
|
||||
fSafeFwdTables = NULL;
|
||||
fSafeRevTables = NULL;
|
||||
fRuleStatusVals = NULL;
|
||||
fChainRules = FALSE;
|
||||
|
@ -115,8 +113,6 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
|
|||
delete fUSetNodes;
|
||||
delete fSetBuilder;
|
||||
delete fForwardTables;
|
||||
delete fReverseTables;
|
||||
delete fSafeFwdTables;
|
||||
delete fSafeRevTables;
|
||||
|
||||
delete fForwardTree;
|
||||
|
@ -158,20 +154,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
//
|
||||
int32_t headerSize = align8(sizeof(RBBIDataHeader));
|
||||
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
|
||||
int32_t reverseTableSize = align8(fReverseTables->getTableSize());
|
||||
int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
|
||||
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
|
||||
int32_t trieSize = align8(fSetBuilder->getTrieSize());
|
||||
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
|
||||
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
|
||||
|
||||
(void)safeFwdTableSize;
|
||||
|
||||
int32_t totalSize = headerSize
|
||||
+ forwardTableSize
|
||||
+ /* reverseTableSize */ 0
|
||||
+ /* safeFwdTableSize */ 0
|
||||
+ (safeRevTableSize ? safeRevTableSize : reverseTableSize)
|
||||
+ safeRevTableSize
|
||||
+ statusTableSize + trieSize + rulesSize;
|
||||
|
||||
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
|
||||
|
@ -211,16 +201,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
data->fSFTableLen = 0;
|
||||
|
||||
data->fSRTable = data->fSFTable + 0;
|
||||
if (safeRevTableSize > 0) {
|
||||
data->fSRTableLen = safeRevTableSize;
|
||||
} else if (reverseTableSize > 0) {
|
||||
data->fSRTableLen = reverseTableSize;
|
||||
} else {
|
||||
U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table
|
||||
// before reaching this point.
|
||||
}
|
||||
|
||||
|
||||
data->fSRTableLen = safeRevTableSize;
|
||||
U_ASSERT(safeRevTableSize > 0);
|
||||
|
||||
data->fTrie = data->fSRTable + data->fSRTableLen;
|
||||
data->fTrieLen = fSetBuilder->getTrieSize();
|
||||
data->fStatusTable = data->fTrie + trieSize;
|
||||
|
@ -231,13 +214,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
|
||||
|
||||
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
|
||||
// fReverseTables->exportTable((uint8_t *)data + data->fRTable);
|
||||
// fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
|
||||
if (safeRevTableSize > 0) {
|
||||
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
} else {
|
||||
fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
}
|
||||
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
|
||||
|
||||
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
|
||||
|
||||
|
@ -252,10 +229,6 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
//
|
||||
// createRuleBasedBreakIterator construct from source rules that are passed in
|
||||
|
@ -267,8 +240,6 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
UParseError *parseError,
|
||||
UErrorCode &status)
|
||||
{
|
||||
// status checked below
|
||||
|
||||
//
|
||||
// Read the input rules, generate a parse tree, symbol table,
|
||||
// and list of all Unicode Sets referenced by the rules.
|
||||
|
@ -277,66 +248,13 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
if (U_FAILURE(status)) { // status checked here bcos build below doesn't
|
||||
return NULL;
|
||||
}
|
||||
builder.fScanner->parse();
|
||||
|
||||
//
|
||||
// UnicodeSet processing.
|
||||
// Munge the Unicode Sets to create a set of character categories.
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
//
|
||||
builder.fSetBuilder->buildRanges();
|
||||
RBBIDataHeader *data = builder.build(status);
|
||||
|
||||
|
||||
//
|
||||
// Generate the DFA state transition table.
|
||||
//
|
||||
builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
|
||||
builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
|
||||
builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
|
||||
builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
|
||||
if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
|
||||
builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
|
||||
{
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
delete builder.fForwardTables; builder.fForwardTables = NULL;
|
||||
delete builder.fReverseTables; builder.fReverseTables = NULL;
|
||||
delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
|
||||
delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
|
||||
return NULL;
|
||||
if (U_FAILURE(status)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
builder.fForwardTables->build();
|
||||
builder.fReverseTables->build();
|
||||
builder.fSafeFwdTables->build();
|
||||
builder.fSafeRevTables->build();
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
|
||||
builder.fForwardTables->printRuleStatusTable();
|
||||
}
|
||||
#endif
|
||||
|
||||
builder.optimizeTables();
|
||||
builder.fSetBuilder->buildTrie();
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Package up the compiled data into a memory image
|
||||
// in the run-time format.
|
||||
//
|
||||
RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
|
||||
if (U_FAILURE(*builder.fStatus)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Clean up the compiler related stuff
|
||||
//
|
||||
|
||||
|
||||
//
|
||||
// Create a break iterator from the compiled rules.
|
||||
// (Identical to creation from stored pre-compiled rules)
|
||||
|
@ -353,6 +271,61 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
return This;
|
||||
}
|
||||
|
||||
/**
 * Build the state tables and the character class trie from the source rules,
 * then package them into a flattened, run-time format memory image.
 *
 * @param status  ICU error code. Nothing is done if it indicates a failure on
 *                entry; it is set on allocation failure or on any build error.
 * @return        The flattened data produced by flattenData(),
 *                or nullptr if any step failed.
 */
RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    // Parse the rules. Errors from the scanner are picked up through status.
    fScanner->parse();
    if (U_FAILURE(status)) {
        return nullptr;
    }

    //
    // UnicodeSet processing.
    //    Munge the Unicode Sets to create a set of character categories.
    //    Generate the mapping tables (TRIE) from input code points to
    //    the character categories.
    //
    fSetBuilder->buildRanges();

    //
    //   Generate the DFA state transition table.
    //
    fForwardTables = new RBBITableBuilder(this, &fForwardTree, status);
    fSafeRevTables = new RBBITableBuilder(this, &fSafeRevTree, status);
    if (fForwardTables == nullptr || fSafeRevTables == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete fForwardTables; fForwardTables = nullptr;
        delete fSafeRevTables; fSafeRevTables = nullptr;
        return nullptr;
    }
    if (U_FAILURE(status)) {
        // The table builder constructors report their own failures through
        // status. Do not build from a partially constructed table builder.
        // The builders stay owned by this object and are deleted by the
        // RBBIRuleBuilder destructor.
        return nullptr;
    }

    fForwardTables->build();
    fForwardTables->buildSafe(status);
    fSafeRevTables->build();
    if (U_FAILURE(status)) {
        // Stop before optimizing and flattening tables that failed to build.
        return nullptr;
    }

#ifdef RBBI_DEBUG
    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
        fForwardTables->printRuleStatusTable();
    }
#endif

    optimizeTables();
    fSetBuilder->buildTrie();

    //
    //   Package up the compiled data into a memory image
    //      in the run-time format.
    //
    RBBIDataHeader *data = flattenData(); // returns NULL if error
    if (U_FAILURE(status)) {
        return nullptr;
    }
    return data;
}
|
||||
|
||||
void RBBIRuleBuilder::optimizeTables() {
|
||||
int32_t leftClass;
|
||||
int32_t rightClass;
|
||||
|
@ -362,18 +335,11 @@ void RBBIRuleBuilder::optimizeTables() {
|
|||
while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
|
||||
fSetBuilder->mergeCategories(leftClass, rightClass);
|
||||
fForwardTables->removeColumn(rightClass);
|
||||
fReverseTables->removeColumn(rightClass);
|
||||
fSafeFwdTables->removeColumn(rightClass);
|
||||
fSafeRevTables->removeColumn(rightClass);
|
||||
}
|
||||
|
||||
fForwardTables->removeDuplicateStates();
|
||||
fReverseTables->removeDuplicateStates();
|
||||
fSafeFwdTables->removeDuplicateStates();
|
||||
fSafeRevTables->removeDuplicateStates();
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -123,10 +123,16 @@ public:
|
|||
RBBIRuleBuilder(const UnicodeString &rules,
|
||||
UParseError *parseErr,
|
||||
UErrorCode &status
|
||||
);
|
||||
);
|
||||
|
||||
virtual ~RBBIRuleBuilder();
|
||||
|
||||
/**
|
||||
* Build the state tables and char class Trie from the source rules.
|
||||
*/
|
||||
RBBIDataHeader *build(UErrorCode &status);
|
||||
|
||||
|
||||
/**
|
||||
* Fold together redundant character classes (table columns) and
|
||||
* redundant states (table rows). Done after initial table generation,
|
||||
|
@ -163,8 +169,6 @@ public:
|
|||
UVector *fUSetNodes; // Vector of all uset nodes.
|
||||
|
||||
RBBITableBuilder *fForwardTables; // State transition tables
|
||||
RBBITableBuilder *fReverseTables;
|
||||
RBBITableBuilder *fSafeFwdTables;
|
||||
RBBITableBuilder *fSafeRevTables;
|
||||
|
||||
UVector *fRuleStatusVals; // The values that can be returned
|
||||
|
|
|
@ -372,7 +372,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
|
|||
// (forward, reverse, safe_forward, safe_reverse)
|
||||
// OR this rule into the appropriate group of them.
|
||||
//
|
||||
RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
|
||||
RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
|
||||
|
||||
if (*destRules != NULL) {
|
||||
// This is not the first rule encountered.
|
||||
|
@ -1123,17 +1123,17 @@ void RBBIRuleScanner::parse() {
|
|||
}
|
||||
|
||||
//
|
||||
// If there were NO user specified reverse rules, set up the equivalent of ".*;"
|
||||
// If there were NO user specified safe reverse rules, set up the equivalent of ".*;"
|
||||
//
|
||||
if (fRB->fReverseTree == NULL) {
|
||||
fRB->fReverseTree = pushNewNode(RBBINode::opStar);
|
||||
if (fRB->fSafeRevTree == NULL) {
|
||||
fRB->fSafeRevTree = pushNewNode(RBBINode::opStar);
|
||||
RBBINode *operand = pushNewNode(RBBINode::setRef);
|
||||
if (U_FAILURE(*fRB->fStatus)) {
|
||||
return;
|
||||
}
|
||||
findSetFor(UnicodeString(TRUE, kAny, 3), operand);
|
||||
fRB->fReverseTree->fLeftChild = operand;
|
||||
operand->fParent = fRB->fReverseTree;
|
||||
fRB->fSafeRevTree->fLeftChild = operand;
|
||||
operand->fParent = fRB->fSafeRevTree;
|
||||
fNodeStackPtr -= 2;
|
||||
}
|
||||
|
||||
|
|
|
@ -27,21 +27,21 @@
|
|||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
|
||||
fTree(*rootNode) {
|
||||
fRB = rb;
|
||||
fStatus = fRB->fStatus;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fDStates = new UVector(status);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
|
||||
fRB(rb),
|
||||
fTree(*rootNode),
|
||||
fStatus(&status),
|
||||
fDStates(nullptr),
|
||||
fSafeTable(nullptr) {
|
||||
if (U_FAILURE(status)) {
|
||||
*fStatus = status;
|
||||
return;
|
||||
}
|
||||
if (fDStates == NULL) {
|
||||
*fStatus = U_MEMORY_ALLOCATION_ERROR;;
|
||||
// fDStates is UVector<RBBIStateDescriptor *>
|
||||
fDStates = new UVector(status);
|
||||
// SafeTable is UVector<UnicodeString *>. Contents owned by the UVector.
|
||||
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
|
||||
if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,8 @@ RBBITableBuilder::~RBBITableBuilder() {
|
|||
for (i=0; i<fDStates->size(); i++) {
|
||||
delete (RBBIStateDescriptor *)fDStates->elementAt(i);
|
||||
}
|
||||
delete fDStates;
|
||||
delete fDStates;
|
||||
delete fSafeTable;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1277,6 +1278,89 @@ void RBBITableBuilder::exportTable(void *where) {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* Synthesize a safe state table from the main state table.
|
||||
*/
|
||||
void RBBITableBuilder::buildSafe(UErrorCode &status) {
|
||||
// Find safe char class pairs.
|
||||
|
||||
// make a state table row for each trailing class, and map from class to row.
|
||||
|
||||
// For each pair
|
||||
// startRow[p1] = p2
|
||||
// p2row[p2] = stopRow
|
||||
// For each unfilled in cell
|
||||
// set to row corresponding to its column.
|
||||
UVector32 safePairs(status);
|
||||
|
||||
int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
|
||||
int32_t numStates = fDStates->size();
|
||||
|
||||
for (int32_t c1=0; c1<numCharClasses; ++c1) {
|
||||
for (int32_t c2=0; c2 < numCharClasses; ++c2) {
|
||||
int32_t wantedEndState = -1;
|
||||
int32_t endState = 0;
|
||||
for (int32_t startState = 1; startState < numStates; ++startState) {
|
||||
RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
|
||||
int32_t s2 = startStateD->fDtran->elementAti(c1);
|
||||
RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
|
||||
endState = s2StateD->fDtran->elementAti(c2);
|
||||
if (wantedEndState < 0) {
|
||||
wantedEndState = endState;
|
||||
} else {
|
||||
if (wantedEndState != endState) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (wantedEndState == endState) {
|
||||
int32_t pair = c1 << 16 | c2;
|
||||
safePairs.addElement(pair, status);
|
||||
// printf("(%d, %d) ", c1, c2);
|
||||
}
|
||||
}
|
||||
//printf("\n");
|
||||
}
|
||||
|
||||
// Populate the initial safe table.
|
||||
// The table as a whole is UVector<UnicodeString>
|
||||
// Each row is represented by a UnicodeString, being used as a Vector<int16>.
|
||||
// Row 0 is the stop state.
|
||||
// Row 1 is the start sate.
|
||||
// Row 2 and beyond are other states, initially one per char class, but
|
||||
// after initial construction, many of the states will be combined, compacting the table.)
|
||||
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
|
||||
for (int32_t row=0; row<numCharClasses + 2; ++row) {
|
||||
fSafeTable->addElement(new UnicodeString(numCharClasses+4, 0, numCharClasses+4), status);
|
||||
}
|
||||
|
||||
// From the start state, each input char class transitions to the state for that input.
|
||||
UnicodeString &startState = *(UnicodeString *)fSafeTable->elementAt(1);
|
||||
for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
|
||||
// Note: +2 for the start & stop state; +4 for header columns in state table.
|
||||
startState.setCharAt(charClass+4, charClass+2);
|
||||
}
|
||||
|
||||
// Initially make every other state table row look like the start state row,
|
||||
for (int32_t row=2; row<numCharClasses+2; ++row) {
|
||||
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(1);
|
||||
rowState = startState; // UnicodeString assignment, copies contents.
|
||||
}
|
||||
|
||||
// Run through the safe pairs, make next state to zero when pair has been seen.
|
||||
// Zero being the stop state, meaning we found a safe point.
|
||||
for (int32_t pairIdx=0; pairIdx<safePairs.size(); pairIdx++) {
|
||||
int32_t pair = safePairs.elementAti(pairIdx);
|
||||
int32_t c1 = (pair >> 16) & 0x0000ffff;
|
||||
int32_t c2 = pair & 0x0000ffff;
|
||||
|
||||
UnicodeString &rowState = *(UnicodeString *)fSafeTable->elementAt(c2 + 2);
|
||||
rowState.setCharAt(c1 + 4, 0);
|
||||
}
|
||||
|
||||
// Merge similar states.
|
||||
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
@ -37,12 +37,13 @@ class UVector32;
|
|||
|
||||
class RBBITableBuilder : public UMemory {
|
||||
public:
|
||||
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
|
||||
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
|
||||
~RBBITableBuilder();
|
||||
|
||||
void build();
|
||||
int32_t getTableSize() const; // Return the runtime size in bytes of
|
||||
// the built state table
|
||||
|
||||
/** Return the runtime size in bytes of the built state table. */
|
||||
int32_t getTableSize() const;
|
||||
|
||||
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
|
||||
*/
|
||||
|
@ -62,6 +63,15 @@ public:
|
|||
/** Check for, and remove duplicate states (table rows). */
|
||||
void removeDuplicateStates();
|
||||
|
||||
void buildSafe(UErrorCode &status);
|
||||
|
||||
/** Return the runtime size in bytes of the built safe reverse state table. */
|
||||
int32_t getSafeTableSize() const;
|
||||
|
||||
/** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
|
||||
*/
|
||||
void exportSafeTable(void *where);
|
||||
|
||||
|
||||
private:
|
||||
void calcNullable(RBBINode *n);
|
||||
|
@ -126,10 +136,14 @@ private:
|
|||
// table for.
|
||||
UErrorCode *fStatus;
|
||||
|
||||
/** State Descriptors, UVector<RBBIStateDescriptor> */
|
||||
UVector *fDStates; // D states (Aho's terminology)
|
||||
// Index is state number
|
||||
// Contents are RBBIStateDescriptor pointers.
|
||||
|
||||
/** Synthesized safe table, UVector of UnicodeString, one string per table row. */
|
||||
UVector *fSafeTable;
|
||||
|
||||
|
||||
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
|
||||
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
|
||||
|
|
|
@ -648,6 +648,17 @@ private:
|
|||
*/
|
||||
int32_t handlePrevious(int32_t fromPosition);
|
||||
|
||||
/**
|
||||
* Iterate backwards from an arbitrary position in the input text using the
|
||||
* synthesized Safe Reverse rules.
|
||||
* This locates a "Safe Position" from which the forward break rules
|
||||
* will operate correctly. A Safe Position is not necessarily a boundary itself.
|
||||
*
|
||||
* @param fromPosition the position in the input text to begin the iteration.
|
||||
* @internal
|
||||
*/
|
||||
int32_t handleSafePrevious(int32_t fromPosition);
|
||||
|
||||
/**
|
||||
* Find a rule-based boundary by running the state machine.
|
||||
* Input
|
||||
|
|
|
@ -518,7 +518,7 @@ sortiComparator(const void * /*context */, const void *left, const void *right)
|
|||
}
|
||||
|
||||
/**
|
||||
* Sort the vector, assuming it constains ints.
|
||||
* Sort the vector, assuming it contains ints.
|
||||
* (A more general sort would take a comparison function, but it's
|
||||
* not clear whether UVector's UElementComparator or
|
||||
* UComparator from uprv_sortAray would be more appropriate.)
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "uvectr32.h"
|
||||
#include "cmemory.h"
|
||||
#include "putilimp.h"
|
||||
#include "uarrsort.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -328,8 +329,15 @@ void UVector32::sortedInsert(int32_t tok, UErrorCode& ec) {
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Sort the vector, assuming it contains ints.
|
||||
*/
|
||||
void UVector32::sorti(UErrorCode &ec) {
|
||||
if (U_SUCCESS(ec)) {
|
||||
uprv_sortArray(elements, count, sizeof(int32_t),
|
||||
uprv_int32Comparator, nullptr, false, &ec);
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
|
|
@ -160,6 +160,11 @@ public:
|
|||
*/
|
||||
void sortedInsert(int32_t elem, UErrorCode& ec);
|
||||
|
||||
/**
|
||||
* Sort the contents of the int32_t vector.
|
||||
*/
|
||||
void sorti(UErrorCode &ec);
|
||||
|
||||
/**
|
||||
* Returns a pointer to the internal array holding the vector.
|
||||
*/
|
||||
|
|
Loading…
Add table
Reference in a new issue