ICU-13569 rbbi state table opt, work in progress.

X-SVN-Rev: 40855
This commit is contained in:
Andy Heninger 2018-02-08 01:42:04 +00:00
parent be54542b86
commit 3d4a3fbaa8
12 changed files with 213 additions and 23 deletions

View file

@ -1338,6 +1338,10 @@ void RuleBasedBreakIterator::dumpCache() {
fBreakCache->dumpCache();
}
void RuleBasedBreakIterator::dumpTables() {
fData->printData();
}
/**
* Returns the description used to create this iterator
*/

View file

@ -267,8 +267,8 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
#endif
#ifdef RBBI_DEBUG
void RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
@ -285,8 +285,8 @@ void RBBIDataWrapper::printData() {
RBBIDebugPrintf("%c", fRuleSource[c]);
}
RBBIDebugPrintf("\n\n");
}
#endif
}
U_NAMESPACE_END

View file

@ -165,13 +165,8 @@ public:
UBool operator ==(const RBBIDataWrapper &other) const;
int32_t hashCode();
const UnicodeString &getRuleSourceString() const;
#ifdef RBBI_DEBUG
void printData();
void printTable(const char *heading, const RBBIStateTable *table);
#else
#define printData()
#define printTable(heading, table)
#endif
/* */
/* Pointers to items within the data */

View file

@ -282,10 +282,10 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
//
// UnicodeSet processing.
// Munge the Unicode Sets to create a set of character categories.
// Generate the mapping tables (TRIE) from input 32-bit characters to
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
builder.fSetBuilder->build();
builder.fSetBuilder->buildRanges();
//
@ -317,6 +317,11 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
}
#endif
builder.optimizeTables();
builder.fSetBuilder->buildTrie();
//
// Package up the compiled data into a memory image
// in the run-time format.
@ -348,6 +353,20 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
return This;
}
//
//  optimizeTables()   Repeatedly ask the forward state table for a pair of
//                     character classes (columns) with identical transitions.
//                     For each pair found, fold the right-hand category into the
//                     left-hand one in the set builder, then drop the now
//                     redundant column from the forward table.
//
void RBBIRuleBuilder::optimizeTables() {
    // The search position is carried across calls; findDuplCharClassFrom()
    // updates both values through its reference parameters.
    int32_t baseClass = 1;
    int32_t duplClass = 2;

    for (;;) {
        if (!fForwardTables->findDuplCharClassFrom(baseClass, duplClass)) {
            break;
        }
        fSetBuilder->mergeCategories(baseClass, duplClass);
        fForwardTables->removeColumn(duplClass);
    }
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -126,6 +126,14 @@ public:
);
virtual ~RBBIRuleBuilder();
/**
* Fold together redundant character classes (table columns) and
* redundant states (table rows). Done after initial table generation,
* before serializing the result.
*/
void optimizeTables();
char *fDebugEnv; // controls debug trace output
UErrorCode *fStatus; // Error reporting. Keeping status
UParseError *fParseError; // here avoids passing it everywhere.

View file

@ -91,7 +91,7 @@ RBBISetBuilder::~RBBISetBuilder()
// from the Unicode Sets.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
void RBBISetBuilder::buildRanges() {
RBBINode *usetNode;
RangeDescriptor *rlRange;
@ -245,11 +245,16 @@ void RBBISetBuilder::build() {
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
}
//
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number.
//
void RBBISetBuilder::buildTrie() {
RangeDescriptor *rlRange;
//
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number
//
fTrie = utrie2_open(0, // Initial value for all code points.
0, // Error value for out-of-range input.
fStatus);
@ -265,6 +270,20 @@ void RBBISetBuilder::build() {
}
void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) {
U_ASSERT(left >= 1);
U_ASSERT(right > left);
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
if (rd->fNum == right) {
rd->fNum = left;
} else if (rd->fNum > right) {
rd->fNum--;
}
}
--fGroupCount;
}
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.

View file

@ -82,7 +82,8 @@ public:
RBBISetBuilder(RBBIRuleBuilder *rb);
~RBBISetBuilder();
void build();
void buildRanges();
void buildTrie();
void addValToSets(UVector *sets, uint32_t val);
void addValToSet (RBBINode *usetNode, uint32_t val);
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
@ -93,6 +94,11 @@ public:
UChar32 getFirstChar(int32_t val) const;
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
// character were encountered.
/** merge two character categories that have been identified as having equivalent behavior.
* The ranges belonging to the right category (table column) will be added to the left.
*/
void mergeCategories(int32_t left, int32_t right);
#ifdef RBBI_DEBUG
void printSets();
void printRanges();

View file

@ -22,6 +22,7 @@
#include "rbbidata.h"
#include "cstring.h"
#include "uassert.h"
#include "uvectr32.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
@ -1077,6 +1078,49 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
}
#endif
//
//  findDuplCharClassFrom()
//
//      Search the state table for a pair of character categories (columns)
//      whose transitions are identical in every state.
//
//      baseCategory, duplCategory  In/out. On entry, the pair at which to resume
//                                  the search; baseCategory must be less than
//                                  duplCategory. On a true return, the pair of
//                                  duplicate columns that was found.
//
//      Returns true if a duplicate pair was found, false once all remaining
//      pairs have been examined.
//
bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) {
    int32_t numStates = fDStates->size();
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();

    U_ASSERT(baseCategory < duplCategory);
    for (; baseCategory < numCols-1; ++baseCategory) {
        for (; duplCategory < numCols; ++duplCategory) {
            // Start out unequal, so that a table with no states never
            // reports a duplicate.
            uint16_t table_base = 0;
            uint16_t table_dupl = 1;
            for (int32_t state=0; state<numStates; state++) {
                RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
                table_base = (uint16_t)sd->fDtran->elementAti(baseCategory);
                table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory);
                if (table_base != table_dupl) {
                    break;
                }
            }
            if (table_base == table_dupl) {
                return true;
            }
        }
        // The next base column must be compared against every column to its
        // right. baseCategory is about to be incremented, so the first
        // candidate for it is (baseCategory + 1) + 1.
        duplCategory = baseCategory + 2;
    }
    return false;
}
//
// removeColumn()
//
void RBBITableBuilder::removeColumn(int32_t column) {
int32_t numStates = fDStates->size();
for (int32_t state=0; state<numStates; state++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
U_ASSERT(column < sd->fDtran->size());
sd->fDtran->removeElementAt(column);
}
}
//-----------------------------------------------------------------------------
@ -1106,7 +1150,6 @@ int32_t RBBITableBuilder::getTableSize() const {
}
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the format required
@ -1256,7 +1299,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
fPositions = NULL;
fDtran = NULL;
fDtran = new UVector(lastInputSymbol+1, *fStatus);
fDtran = new UVector32(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
}
@ -1264,7 +1307,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
// symbol.

View file

@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
class RBBIRuleScanner;
class RBBIRuleBuilder;
class UVector32;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
@ -42,9 +43,23 @@ public:
void build();
int32_t getTableSize() const; // Return the runtime size in bytes of
// the built state table
void exportTable(void *where); // fill in the runtime state table.
// Sufficient memory must exist at
// the specified location.
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
*/
void exportTable(void *where);
/** Find duplicate (redundant) character classes, beginning after the specified
* pair, within this state table. This is an iterator-like function, used to
* identify char classes (state table columns) that can be eliminated.
*/
bool findDuplCharClassFrom(int &baseClass, int &duplClass);
/** Remove a column from the state table. Used when two character categories
* have been found equivalent, and merged together, to eliminate the unneeded table column.
*/
void removeColumn(int32_t column);
private:
@ -60,6 +75,12 @@ private:
void flagTaggedStates();
void mergeRuleStatusVals();
/**
* Merge redundant state table columns, eliminating character classes with identical behavior.
* Done after the state tables are generated, just before converting to their run-time format.
*/
int32_t mergeColumns();
void addRuleRootNodes(UVector *dest, RBBINode *node);
// Set functions for UVector.
@ -112,7 +133,7 @@ public:
// with this state. Unordered (it's a set).
// UVector contents are RBBINode *
UVector *fDtran; // Transitions out of this state.
UVector32 *fDtran; // Transitions out of this state.
// indexed by input character
// contents is int index of dest state
// in RBBITableBuilder.fDStates

View file

@ -60,10 +60,13 @@ private:
UText fText;
/**
* The rule data for this BreakIterator instance
* The rule data for this BreakIterator instance.
* Not for general use; Public only for testing purposes.
* @internal
*/
public:
RBBIDataWrapper *fData;
private:
/**
* The iteration state - current position, rule status for the current position,
@ -683,6 +686,13 @@ private:
* @internal
*/
void dumpCache();
/**
* Debugging function only.
* @internal
*/
void dumpTables();
#endif /* U_HIDE_INTERNAL_API */
};

View file

@ -17,6 +17,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
@ -39,10 +40,12 @@
#include "cstr.h"
#include "intltest.h"
#include "rbbitst.h"
#include "rbbidata.h"
#include "utypeinfo.h" // for 'typeid' to work
#include "uvector.h"
#include "uvectr32.h"
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
@ -106,6 +109,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestEmoji);
TESTCASE_AUTO(TestBug12519);
TESTCASE_AUTO(TestBug12677);
TESTCASE_AUTO(TestTableRedundancies);
TESTCASE_AUTO_END;
}
@ -4454,6 +4458,66 @@ void RBBITest::TestBug12677() {
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
}
void RBBITest::TestTableRedundancies() {
UErrorCode status = U_ZERO_ERROR;
RuleBasedBreakIterator *bi =
(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
// bi->dumpTables();
RBBIDataWrapper *dw = bi->fData;
const RBBIStateTable *fwtbl = dw->fForwardTable;
int32_t numCharClasses = dw->fHeader->fCatCount;
printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
// Check for duplicate columns
std::vector<UnicodeString> columns;
for (int32_t column = 0; column < numCharClasses; column++) {
UnicodeString s;
for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
s.append(row->fNextState[column]);
}
columns.push_back(s);
}
for (int c1=0; c1<numCharClasses; c1++) {
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
if (columns.at(c1) == columns.at(c2)) {
printf("Duplicate columns (%d, %d)\n", c1, c2);
break;
}
}
}
// Check for duplicate states
std::vector<UnicodeString> rows;
for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
UnicodeString s;
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
if (row->fAccepting < -1) {
printf("row %d accepting = %d\n", r, row->fAccepting);
}
s.append(row->fAccepting + 1); // values of -1 are expected.
s.append(row->fLookAhead);
s.append(row->fTagIdx);
for (int32_t column = 0; column < numCharClasses; column++) {
s.append(row->fNextState[column]);
}
rows.push_back(s);
}
for (int r1=0; r1<(int32_t)fwtbl->fNumStates; r1++) {
for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
if (rows.at(r1) == rows.at(r2)) {
printf("Duplicate rows (%d, %d)\n", r1, r2);
break;
}
}
}
delete bi;
}
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked

View file

@ -75,6 +75,7 @@ public:
void TestEmoji();
void TestBug12519();
void TestBug12677();
void TestTableRedundancies();
void TestDebug();
void TestProperties();