ICU-13194 rbbi safe rule synth, work in progress.

X-SVN-Rev: 41118
This commit is contained in:
Andy Heninger 2018-03-17 00:34:48 +00:00
parent 5b55224ac5
commit 660d38bc7f
12 changed files with 302 additions and 146 deletions

View file

@ -1100,6 +1100,91 @@ int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
}
//-----------------------------------------------------------------------------------
//
//  handleSafePrevious()
//
//      Iterate backwards from fromPosition using the safe reverse state table
//      (fData->fSafeRevTable).
//      The logic of this function is similar to handleNext(), but simpler
//      because the safe table does not require as many options.
//
//      Returns BreakIterator::DONE if there is no break data or if fromPosition
//      is the start of the text. Otherwise returns the native text index at which
//      the state machine stopped, either because it transitioned to STOP_STATE
//      or because the start of the text was reached.
//
//-----------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
    // fData must be validated before anything is read through it; the state
    // table pointer below comes from fData.
    if (fData == NULL) {
        return BreakIterator::DONE;
    }
    const RBBIStateTable *stateTable = fData->fSafeRevTable;

    UTEXT_SETNATIVEINDEX(&fText, fromPosition);
#ifdef RBBI_DEBUG
    if (gTrace) {
        RBBIDebugPuts("Handle Previous pos char state category");
    }
#endif

    // if we're already at the start of the text, return DONE.
    if (UTEXT_GETNATIVEINDEX(&fText) == 0) {
        return BreakIterator::DONE;
    }

    // Set the initial state for the state machine
    UChar32  c        = UTEXT_PREVIOUS32(&fText);
    int32_t  state    = START_STATE;
    uint16_t category = 0;
    RBBIStateTableRow *row = (RBBIStateTableRow *)
            (stateTable->fTableData + (stateTable->fRowLen * state));

    // loop until we reach the start of the text or transition to state 0
    //
    for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {

        // look up the current character's character category, which tells us
        // which column in the state table to look at.
        // Note:  the 16 in UTRIE2_GET16 refers to the size of the data being returned,
        //        not the size of the character going in, which is a UChar32.
        //
        // Mask off the dictionary flag bit (0x4000). For reverse iteration it is not used.
        category = UTRIE2_GET16(fData->fTrie, c);
        category &= ~0x4000;

#ifdef RBBI_DEBUG
        if (gTrace) {
            RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText));
            if (0x20<=c && c<0x7f) {
                RBBIDebugPrintf("\"%c\" ", c);
            } else {
                RBBIDebugPrintf("%5x ", c);
            }
            RBBIDebugPrintf("%3d %3d\n", state, category);
        }
#endif

        // State Transition - move machine to its next state
        //
        // fNextState is a variable-length array.
        U_ASSERT(category<fData->fHeader->fCatCount);
        state = row->fNextState[category];  /*Not accessing beyond memory*/
        row = (RBBIStateTableRow *)
                (stateTable->fTableData + (stateTable->fRowLen * state));

        if (state == STOP_STATE) {
            // This is the normal exit from the lookup state machine.
            // Transition to state zero means we have found a safe point.
            break;
        }
    }

    // The state machine is done. The current text position is the result.
    int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
#ifdef RBBI_DEBUG
    if (gTrace) {
        RBBIDebugPrintf("result = %d\n\n", result);
    }
#endif
    return result;
}
//-------------------------------------------------------------------------------
//
// getRuleStatus() Return the break rule tag associated with the current

View file

@ -80,8 +80,6 @@ UBool RBBIDataWrapper::isDataVersionAcceptable(const UVersionInfo version) {
void RBBIDataWrapper::init0() {
fHeader = NULL;
fForwardTable = NULL;
fReverseTable = NULL;
fSafeFwdTable = NULL;
fSafeRevTable = NULL;
fRuleSource = NULL;
fRuleStatusTable = NULL;
@ -108,25 +106,10 @@ void RBBIDataWrapper::init(const RBBIDataHeader *data, UErrorCode &status) {
if (data->fFTableLen != 0) {
fForwardTable = (RBBIStateTable *)((char *)data + fHeader->fFTable);
}
if (data->fRTableLen != 0) {
fReverseTable = (RBBIStateTable *)((char *)data + fHeader->fRTable);
}
if (data->fSFTableLen != 0) {
fSafeFwdTable = (RBBIStateTable *)((char *)data + fHeader->fSFTable);
}
if (data->fSRTableLen != 0) {
fSafeRevTable = (RBBIStateTable *)((char *)data + fHeader->fSRTable);
}
// Rule Compatibility Hacks
// If a rule set includes reverse rules but does not explicitly include safe reverse rules,
// the reverse rules are to be treated as safe reverse rules.
if (fSafeRevTable == NULL && fReverseTable != NULL) {
fSafeRevTable = fReverseTable;
fReverseTable = NULL;
}
fTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
(uint8_t *)data + fHeader->fTrie,
fHeader->fTrieLen,
@ -276,8 +259,6 @@ void RBBIDataWrapper::printData() {
RBBIDebugPrintf(" number of character categories = %d\n\n", fHeader->fCatCount);
printTable("Forward State Transition Table", fForwardTable);
printTable("Reverse State Transition Table", fReverseTable);
printTable("Safe Forward State Transition Table", fSafeFwdTable);
printTable("Safe Reverse State Transition Table", fSafeRevTable);
RBBIDebugPrintf("\nOrignal Rules source:\n");

View file

@ -173,8 +173,6 @@ public:
/* */
const RBBIDataHeader *fHeader;
const RBBIStateTable *fForwardTable;
const RBBIStateTable *fReverseTable;
const RBBIStateTable *fSafeFwdTable;
const RBBIStateTable *fSafeRevTable;
const UChar *fRuleSource;
const int32_t *fRuleStatusTable;

View file

@ -63,8 +63,6 @@ RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules,
fSafeRevTree = NULL;
fDefaultTree = &fForwardTree;
fForwardTables = NULL;
fReverseTables = NULL;
fSafeFwdTables = NULL;
fSafeRevTables = NULL;
fRuleStatusVals = NULL;
fChainRules = FALSE;
@ -115,8 +113,6 @@ RBBIRuleBuilder::~RBBIRuleBuilder() {
delete fUSetNodes;
delete fSetBuilder;
delete fForwardTables;
delete fReverseTables;
delete fSafeFwdTables;
delete fSafeRevTables;
delete fForwardTree;
@ -158,20 +154,14 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
//
int32_t headerSize = align8(sizeof(RBBIDataHeader));
int32_t forwardTableSize = align8(fForwardTables->getTableSize());
int32_t reverseTableSize = align8(fReverseTables->getTableSize());
int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize());
int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize());
int32_t trieSize = align8(fSetBuilder->getTrieSize());
int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t));
int32_t rulesSize = align8((fStrippedRules.length()+1) * sizeof(UChar));
(void)safeFwdTableSize;
int32_t totalSize = headerSize
+ forwardTableSize
+ /* reverseTableSize */ 0
+ /* safeFwdTableSize */ 0
+ (safeRevTableSize ? safeRevTableSize : reverseTableSize)
+ safeRevTableSize
+ statusTableSize + trieSize + rulesSize;
RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize);
@ -211,16 +201,9 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
data->fSFTableLen = 0;
data->fSRTable = data->fSFTable + 0;
if (safeRevTableSize > 0) {
data->fSRTableLen = safeRevTableSize;
} else if (reverseTableSize > 0) {
data->fSRTableLen = reverseTableSize;
} else {
U_ASSERT(FALSE); // Rule build should have failed for lack of a reverse table
// before reaching this point.
}
data->fSRTableLen = safeRevTableSize;
U_ASSERT(safeRevTableSize > 0);
data->fTrie = data->fSRTable + data->fSRTableLen;
data->fTrieLen = fSetBuilder->getTrieSize();
data->fStatusTable = data->fTrie + trieSize;
@ -231,13 +214,7 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
fForwardTables->exportTable((uint8_t *)data + data->fFTable);
// fReverseTables->exportTable((uint8_t *)data + data->fRTable);
// fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable);
if (safeRevTableSize > 0) {
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
} else {
fReverseTables->exportTable((uint8_t *)data + data->fSRTable);
}
fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable);
fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie);
@ -252,10 +229,6 @@ RBBIDataHeader *RBBIRuleBuilder::flattenData() {
}
//----------------------------------------------------------------------------------------
//
// createRuleBasedBreakIterator construct from source rules that are passed in
@ -267,8 +240,6 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
UParseError *parseError,
UErrorCode &status)
{
// status checked below
//
// Read the input rules, generate a parse tree, symbol table,
// and list of all Unicode Sets referenced by the rules.
@ -277,66 +248,13 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
if (U_FAILURE(status)) { // status checked here bcos build below doesn't
return NULL;
}
builder.fScanner->parse();
//
// UnicodeSet processing.
// Munge the Unicode Sets to create a set of character categories.
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
builder.fSetBuilder->buildRanges();
RBBIDataHeader *data = builder.build(status);
//
// Generate the DFA state transition table.
//
builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree);
builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree);
builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree);
builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree);
if (builder.fForwardTables == NULL || builder.fReverseTables == NULL ||
builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL)
{
status = U_MEMORY_ALLOCATION_ERROR;
delete builder.fForwardTables; builder.fForwardTables = NULL;
delete builder.fReverseTables; builder.fReverseTables = NULL;
delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL;
delete builder.fSafeRevTables; builder.fSafeRevTables = NULL;
return NULL;
if (U_FAILURE(status)) {
return nullptr;
}
builder.fForwardTables->build();
builder.fReverseTables->build();
builder.fSafeFwdTables->build();
builder.fSafeRevTables->build();
#ifdef RBBI_DEBUG
if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) {
builder.fForwardTables->printRuleStatusTable();
}
#endif
builder.optimizeTables();
builder.fSetBuilder->buildTrie();
//
// Package up the compiled data into a memory image
// in the run-time format.
//
RBBIDataHeader *data = builder.flattenData(); // returns NULL if error
if (U_FAILURE(*builder.fStatus)) {
return NULL;
}
//
// Clean up the compiler related stuff
//
//
// Create a break iterator from the compiled rules.
// (Identical to creation from stored pre-compiled rules)
@ -353,6 +271,61 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
return This;
}
/**
 * Build the state tables and char class Trie from the source rules, and
 * package the result as a run-time data image.
 *
 * Steps, in order: parse the rules, build the character category ranges,
 * construct and build the forward and safe-reverse table builders,
 * optimize the tables, build the Trie, then flatten everything.
 *
 * @param status  Error code. Nothing is done if it already indicates failure.
 *                Set to U_MEMORY_ALLOCATION_ERROR if a table builder cannot
 *                be allocated.
 * @return        The flattened data, or nullptr on any failure.
 */
RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return nullptr;
    }

    fScanner->parse();
    if (U_FAILURE(status)) {
        return nullptr;
    }

    //
    // UnicodeSet processing.
    //    Munge the Unicode Sets to create a set of character categories.
    //    Generate the mapping tables (TRIE) from input code points to
    //    the character categories.
    //
    fSetBuilder->buildRanges();

    //
    //   Generate the DFA state transition table.
    //
    fForwardTables = new RBBITableBuilder(this, &fForwardTree, status);
    fSafeRevTables = new RBBITableBuilder(this, &fSafeRevTree, status);
    if (fForwardTables == nullptr || fSafeRevTables == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        delete fForwardTables; fForwardTables = nullptr;
        delete fSafeRevTables; fSafeRevTables = nullptr;
        return nullptr;
    }
    // The table builder constructors take status and can fail even when the
    // allocation itself succeeded. Do not run the build steps on a builder
    // whose construction reported an error.
    if (U_FAILURE(status)) {
        return nullptr;
    }

    fForwardTables->build();
    fForwardTables->buildSafe(status);
    fSafeRevTables->build();

#ifdef RBBI_DEBUG
    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
        fForwardTables->printRuleStatusTable();
    }
#endif

    optimizeTables();
    fSetBuilder->buildTrie();

    //
    //   Package up the compiled data into a memory image
    //      in the run-time format.
    //
    RBBIDataHeader *data = flattenData(); // returns NULL if error
    if (U_FAILURE(status)) {
        return nullptr;
    }
    return data;
}
void RBBIRuleBuilder::optimizeTables() {
int32_t leftClass;
int32_t rightClass;
@ -362,18 +335,11 @@ void RBBIRuleBuilder::optimizeTables() {
while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
fSetBuilder->mergeCategories(leftClass, rightClass);
fForwardTables->removeColumn(rightClass);
fReverseTables->removeColumn(rightClass);
fSafeFwdTables->removeColumn(rightClass);
fSafeRevTables->removeColumn(rightClass);
}
fForwardTables->removeDuplicateStates();
fReverseTables->removeDuplicateStates();
fSafeFwdTables->removeDuplicateStates();
fSafeRevTables->removeDuplicateStates();
}
U_NAMESPACE_END

View file

@ -123,10 +123,16 @@ public:
RBBIRuleBuilder(const UnicodeString &rules,
UParseError *parseErr,
UErrorCode &status
);
);
virtual ~RBBIRuleBuilder();
/**
* Build the state tables and char class Trie from the source rules.
*/
RBBIDataHeader *build(UErrorCode &status);
/**
* Fold together redundant character classes (table columns) and
* redundant states (table rows). Done after initial table generation,
@ -163,8 +169,6 @@ public:
UVector *fUSetNodes; // Vector of all uset nodes.
RBBITableBuilder *fForwardTables; // State transition tables
RBBITableBuilder *fReverseTables;
RBBITableBuilder *fSafeFwdTables;
RBBITableBuilder *fSafeRevTables;
UVector *fRuleStatusVals; // The values that can be returned

View file

@ -372,7 +372,7 @@ UBool RBBIRuleScanner::doParseActions(int32_t action)
// (forward, reverse, safe_forward, safe_reverse)
// OR this rule into the appropriate group of them.
//
RBBINode **destRules = (fReverseRule? &fRB->fReverseTree : fRB->fDefaultTree);
RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);
if (*destRules != NULL) {
// This is not the first rule encountered.
@ -1123,17 +1123,17 @@ void RBBIRuleScanner::parse() {
}
//
// If there were NO user specified reverse rules, set up the equivalent of ".*;"
// If there were NO user specified safe reverse rules, set up the equivalent of ".*;"
//
if (fRB->fReverseTree == NULL) {
fRB->fReverseTree = pushNewNode(RBBINode::opStar);
if (fRB->fSafeRevTree == NULL) {
fRB->fSafeRevTree = pushNewNode(RBBINode::opStar);
RBBINode *operand = pushNewNode(RBBINode::setRef);
if (U_FAILURE(*fRB->fStatus)) {
return;
}
findSetFor(UnicodeString(TRUE, kAny, 3), operand);
fRB->fReverseTree->fLeftChild = operand;
operand->fParent = fRB->fReverseTree;
fRB->fSafeRevTree->fLeftChild = operand;
operand->fParent = fRB->fSafeRevTree;
fNodeStackPtr -= 2;
}

View file

@ -27,21 +27,21 @@
U_NAMESPACE_BEGIN
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
fTree(*rootNode) {
fRB = rb;
fStatus = fRB->fStatus;
UErrorCode status = U_ZERO_ERROR;
fDStates = new UVector(status);
if (U_FAILURE(*fStatus)) {
return;
}
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
fRB(rb),
fTree(*rootNode),
fStatus(&status),
fDStates(nullptr),
fSafeTable(nullptr) {
if (U_FAILURE(status)) {
*fStatus = status;
return;
}
if (fDStates == NULL) {
*fStatus = U_MEMORY_ALLOCATION_ERROR;;
// fDStates is UVector<RBBIStateDescriptor *>
fDStates = new UVector(status);
// SafeTable is UVector<UnicodeString *>. Contents owned by the UVector.
fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, status);
if (U_SUCCESS(status) && (fDStates == nullptr || fSafeTable == nullptr)) {
status = U_MEMORY_ALLOCATION_ERROR;;
}
}
@ -52,7 +52,8 @@ RBBITableBuilder::~RBBITableBuilder() {
for (i=0; i<fDStates->size(); i++) {
delete (RBBIStateDescriptor *)fDStates->elementAt(i);
}
delete fDStates;
delete fDStates;
delete fSafeTable;
}
@ -1277,6 +1278,89 @@ void RBBITableBuilder::exportTable(void *where) {
}
/**
 * Synthesize a safe state table from the main state table.
 *
 * Outline:
 *   1. Find "safe" char class pairs (c1, c2): pairs for which every start
 *      state (1 .. numStates-1) of the main table ends in the same state
 *      after consuming c1 and then c2.
 *   2. Build fSafeTable with one row per char class plus the stop row (0)
 *      and the start row (1). Each row is a UnicodeString used as a vector
 *      of 16 bit values, with 4 leading header columns.
 *   3. For each safe pair (c1, c2), make the row for c2 transition to the
 *      stop state (0) on input c1.
 *
 * Any table previously held in fSafeTable is deleted and replaced.
 *
 * @param status  Error code. Nothing is done if it already indicates failure.
 *                Set to U_MEMORY_ALLOCATION_ERROR if an allocation fails.
 */
void RBBITableBuilder::buildSafe(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    // Safe pairs are stored packed, c1 in the high 16 bits, c2 in the low 16 bits.
    UVector32 safePairs(status);

    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
    int32_t numStates = fDStates->size();

    for (int32_t c1=0; c1<numCharClasses; ++c1) {
        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
            int32_t wantedEndState = -1;
            int32_t endState = 0;
            for (int32_t startState = 1; startState < numStates; ++startState) {
                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
                int32_t s2 = startStateD->fDtran->elementAti(c1);
                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
                endState = s2StateD->fDtran->elementAti(c2);
                if (wantedEndState < 0) {
                    wantedEndState = endState;
                } else if (wantedEndState != endState) {
                    // Two start states disagree; (c1, c2) is not a safe pair.
                    break;
                }
            }
            if (wantedEndState == endState) {
                int32_t pair = (c1 << 16) | c2;
                safePairs.addElement(pair, status);
            }
        }
    }
    if (U_FAILURE(status)) {
        return;
    }

    // Populate the initial safe table.
    // The table as a whole is UVector<UnicodeString>
    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
    // Row 0 is the stop state.
    // Row 1 is the start state.
    // Row 2 and beyond are other states, initially one per char class, but
    //   after initial construction, many of the states will be combined, compacting the table.

    // Release any table left over from construction or an earlier call,
    // so that replacing the pointer does not leak it.
    delete fSafeTable;
    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
    if (fSafeTable == nullptr) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }
    for (int32_t row=0; row<numCharClasses + 2; ++row) {
        UnicodeString *rowString = new UnicodeString(numCharClasses+4, 0, numCharClasses+4);
        if (rowString == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        fSafeTable->addElement(rowString, status);
    }
    if (U_FAILURE(status)) {
        // The rows were not all added; indexing into the table would be unsafe.
        return;
    }

    // From the start state, each input char class transitions to the state for that input.
    UnicodeString &startRow = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
        // Note: +2 for the start & stop state; +4 for header columns in state table.
        startRow.setCharAt(charClass+4, static_cast<UChar>(charClass+2));
    }

    // Initially make every other state table row look like the start state row.
    // Each row must be fetched by its own index; fetching row 1 here would only
    // assign the start row to itself.
    for (int32_t row=2; row<numCharClasses+2; ++row) {
        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
        rowState = startRow;   // UnicodeString assignment, copies contents.
    }

    // Run through the safe pairs, set the next state to zero when a pair has been seen.
    // Zero being the stop state, meaning we found a safe point.
    // The row is selected by c2 and the column by c1.
    for (int32_t pairIdx=0; pairIdx<safePairs.size(); pairIdx++) {
        int32_t pair = safePairs.elementAti(pairIdx);
        int32_t c1 = (pair >> 16) & 0x0000ffff;
        int32_t c2 = pair & 0x0000ffff;

        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
        rowState.setCharAt(c1 + 4, 0);
    }

    // TODO: Merge similar states. (Not yet implemented.)
}
//-----------------------------------------------------------------------------
//

View file

@ -37,12 +37,13 @@ class UVector32;
class RBBITableBuilder : public UMemory {
public:
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
~RBBITableBuilder();
void build();
int32_t getTableSize() const; // Return the runtime size in bytes of
// the built state table
/** Return the runtime size in bytes of the built state table. */
int32_t getTableSize() const;
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
*/
@ -62,6 +63,15 @@ public:
/** Check for, and remove duplicate states (table rows). */
void removeDuplicateStates();
void buildSafe(UErrorCode &status);
/** Return the runtime size in bytes of the built safe reverse state table. */
int32_t getSafeTableSize() const;
/** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
*/
void exportSafeTable(void *where);
private:
void calcNullable(RBBINode *n);
@ -126,10 +136,14 @@ private:
// table for.
UErrorCode *fStatus;
/** State Descriptors, UVector<RBBIStateDescriptor> */
UVector *fDStates; // D states (Aho's terminology)
// Index is state number
// Contents are RBBIStateDescriptor pointers.
/** Synthesized safe table, UVector of UnicodeString, one string per table row. */
UVector *fSafeTable;
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class

View file

@ -648,6 +648,17 @@ private:
*/
int32_t handlePrevious(int32_t fromPosition);
/**
* Iterate backwards from an arbitrary position in the input text using the
* synthesized Safe Reverse rules.
* This locates a "Safe Position" from which the forward break rules
* will operate correctly. A Safe Position is not necessarily a boundary itself.
*
* @param fromPosition the position in the input text to begin the iteration.
* @internal
*/
int32_t handleSafePrevious(int32_t fromPosition);
/**
* Find a rule-based boundary by running the state machine.
* Input

View file

@ -518,7 +518,7 @@ sortiComparator(const void * /*context */, const void *left, const void *right)
}
/**
* Sort the vector, assuming it constains ints.
* Sort the vector, assuming it contains ints.
* (A more general sort would take a comparison function, but it's
* not clear whether UVector's UElementComparator or
* UComparator from uprv_sortArray would be more appropriate.)

View file

@ -13,6 +13,7 @@
#include "uvectr32.h"
#include "cmemory.h"
#include "putilimp.h"
#include "uarrsort.h"
U_NAMESPACE_BEGIN
@ -328,8 +329,15 @@ void UVector32::sortedInsert(int32_t tok, UErrorCode& ec) {
}
/**
 * Sort the elements of this vector in place, treating them as int32_t values
 * and ordering them with uprv_int32Comparator.
 *
 * @param ec  Error code. No sorting is attempted if it already indicates
 *            failure; otherwise it is passed on to uprv_sortArray().
 */
void UVector32::sorti(UErrorCode &ec) {
    if (U_FAILURE(ec)) {
        return;
    }
    uprv_sortArray(elements, count, sizeof(int32_t),
                   uprv_int32Comparator, nullptr, false, &ec);
}
U_NAMESPACE_END

View file

@ -160,6 +160,11 @@ public:
*/
void sortedInsert(int32_t elem, UErrorCode& ec);
/**
* Sort the contents of the int32_t vector.
*/
void sorti(UErrorCode &ec);
/**
* Returns a pointer to the internal array holding the vector.
*/