mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-13569 rbbi state table opt, work in progress.
X-SVN-Rev: 40855
This commit is contained in:
parent
be54542b86
commit
3d4a3fbaa8
12 changed files with 213 additions and 23 deletions
|
@ -1338,6 +1338,10 @@ void RuleBasedBreakIterator::dumpCache() {
|
|||
fBreakCache->dumpCache();
|
||||
}
|
||||
|
||||
void RuleBasedBreakIterator::dumpTables() {
|
||||
fData->printData();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the description used to create this iterator
|
||||
*/
|
||||
|
|
|
@ -267,8 +267,8 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
|
|||
#endif
|
||||
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
void RBBIDataWrapper::printData() {
|
||||
#ifdef RBBI_DEBUG
|
||||
RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
|
||||
RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
|
||||
fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
|
||||
|
@ -285,8 +285,8 @@ void RBBIDataWrapper::printData() {
|
|||
RBBIDebugPrintf("%c", fRuleSource[c]);
|
||||
}
|
||||
RBBIDebugPrintf("\n\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -165,13 +165,8 @@ public:
|
|||
UBool operator ==(const RBBIDataWrapper &other) const;
|
||||
int32_t hashCode();
|
||||
const UnicodeString &getRuleSourceString() const;
|
||||
#ifdef RBBI_DEBUG
|
||||
void printData();
|
||||
void printTable(const char *heading, const RBBIStateTable *table);
|
||||
#else
|
||||
#define printData()
|
||||
#define printTable(heading, table)
|
||||
#endif
|
||||
|
||||
/* */
|
||||
/* Pointers to items within the data */
|
||||
|
|
|
@ -282,10 +282,10 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
//
|
||||
// UnicodeSet processing.
|
||||
// Munge the Unicode Sets to create a set of character categories.
|
||||
// Generate the mapping tables (TRIE) from input 32-bit characters to
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
//
|
||||
builder.fSetBuilder->build();
|
||||
builder.fSetBuilder->buildRanges();
|
||||
|
||||
|
||||
//
|
||||
|
@ -317,6 +317,11 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
}
|
||||
#endif
|
||||
|
||||
builder.optimizeTables();
|
||||
builder.fSetBuilder->buildTrie();
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Package up the compiled data into a memory image
|
||||
// in the run-time format.
|
||||
|
@ -348,6 +353,20 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
return This;
|
||||
}
|
||||
|
||||
void RBBIRuleBuilder::optimizeTables() {
|
||||
int32_t leftClass;
|
||||
int32_t rightClass;
|
||||
|
||||
leftClass = 1;
|
||||
rightClass = 2;
|
||||
while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
|
||||
fSetBuilder->mergeCategories(leftClass, rightClass);
|
||||
fForwardTables->removeColumn(rightClass);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -126,6 +126,14 @@ public:
|
|||
);
|
||||
|
||||
virtual ~RBBIRuleBuilder();
|
||||
|
||||
/**
|
||||
* Fold together redundant character classes (table columns) and
|
||||
* redundant states (table rows). Done after initial table generation,
|
||||
* before serializing the result.
|
||||
*/
|
||||
void optimizeTables();
|
||||
|
||||
char *fDebugEnv; // controls debug trace output
|
||||
UErrorCode *fStatus; // Error reporting. Keeping status
|
||||
UParseError *fParseError; // here avoids passing it everywhere.
|
||||
|
|
|
@ -91,7 +91,7 @@ RBBISetBuilder::~RBBISetBuilder()
|
|||
// from the Unicode Sets.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::build() {
|
||||
void RBBISetBuilder::buildRanges() {
|
||||
RBBINode *usetNode;
|
||||
RangeDescriptor *rlRange;
|
||||
|
||||
|
@ -245,11 +245,16 @@ void RBBISetBuilder::build() {
|
|||
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number.
|
||||
//
|
||||
void RBBISetBuilder::buildTrie() {
|
||||
RangeDescriptor *rlRange;
|
||||
|
||||
//
|
||||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number
|
||||
//
|
||||
fTrie = utrie2_open(0, // Initial value for all code points.
|
||||
0, // Error value for out-of-range input.
|
||||
fStatus);
|
||||
|
@ -265,6 +270,20 @@ void RBBISetBuilder::build() {
|
|||
}
|
||||
|
||||
|
||||
void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) {
|
||||
U_ASSERT(left >= 1);
|
||||
U_ASSERT(right > left);
|
||||
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
|
||||
if (rd->fNum == right) {
|
||||
rd->fNum = left;
|
||||
} else if (rd->fNum > right) {
|
||||
rd->fNum--;
|
||||
}
|
||||
}
|
||||
--fGroupCount;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
|
|
|
@ -82,7 +82,8 @@ public:
|
|||
RBBISetBuilder(RBBIRuleBuilder *rb);
|
||||
~RBBISetBuilder();
|
||||
|
||||
void build();
|
||||
void buildRanges();
|
||||
void buildTrie();
|
||||
void addValToSets(UVector *sets, uint32_t val);
|
||||
void addValToSet (RBBINode *usetNode, uint32_t val);
|
||||
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
|
||||
|
@ -93,6 +94,11 @@ public:
|
|||
UChar32 getFirstChar(int32_t val) const;
|
||||
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
|
||||
// character were encountered.
|
||||
/** merge two character categories that have been identified as having equivalent behavior.
|
||||
* The ranges belonging to the right category (table column) will be added to the left.
|
||||
*/
|
||||
void mergeCategories(int32_t left, int32_t right);
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
void printSets();
|
||||
void printRanges();
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "rbbidata.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
#include "uvectr32.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -1077,6 +1078,49 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
|
|||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
// findDuplCharClassFrom()
|
||||
//
|
||||
bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) {
|
||||
int32_t numStates = fDStates->size();
|
||||
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
U_ASSERT(baseCategory < duplCategory);
|
||||
|
||||
uint16_t table_base;
|
||||
uint16_t table_dupl;
|
||||
for (; baseCategory < numCols-1; ++baseCategory) {
|
||||
for (; duplCategory < numCols; ++duplCategory) {
|
||||
for (int32_t state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
|
||||
table_base = (uint16_t)sd->fDtran->elementAti(baseCategory);
|
||||
table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory);
|
||||
if (table_base != table_dupl) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (table_base == table_dupl) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// removeColumn()
|
||||
//
|
||||
void RBBITableBuilder::removeColumn(int32_t column) {
|
||||
int32_t numStates = fDStates->size();
|
||||
for (int32_t state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
|
||||
U_ASSERT(column < sd->fDtran->size());
|
||||
sd->fDtran->removeElementAt(column);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -1106,7 +1150,6 @@ int32_t RBBITableBuilder::getTableSize() const {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// exportTable() export the state transition table in the format required
|
||||
|
@ -1256,7 +1299,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
|
|||
fPositions = NULL;
|
||||
fDtran = NULL;
|
||||
|
||||
fDtran = new UVector(lastInputSymbol+1, *fStatus);
|
||||
fDtran = new UVector32(lastInputSymbol+1, *fStatus);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
|
@ -1264,7 +1307,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
|
|||
*fStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
|
||||
fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
|
||||
// It is indexed by input symbols, and will
|
||||
// hold the next state number for each
|
||||
// symbol.
|
||||
|
|
|
@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
class RBBIRuleScanner;
|
||||
class RBBIRuleBuilder;
|
||||
class UVector32;
|
||||
|
||||
//
|
||||
// class RBBITableBuilder is part of the RBBI rule compiler.
|
||||
|
@ -42,9 +43,23 @@ public:
|
|||
void build();
|
||||
int32_t getTableSize() const; // Return the runtime size in bytes of
|
||||
// the built state table
|
||||
void exportTable(void *where); // fill in the runtime state table.
|
||||
// Sufficient memory must exist at
|
||||
// the specified location.
|
||||
|
||||
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
|
||||
*/
|
||||
void exportTable(void *where);
|
||||
|
||||
/** Find duplicate (redundant) character classes, beginning after the specifed
|
||||
* pair, within this state table. This is an iterator-like function, used to
|
||||
* identify char classes (state table columns) that can be eliminated.
|
||||
*/
|
||||
bool findDuplCharClassFrom(int &baseClass, int &duplClass);
|
||||
|
||||
/** Remove a column from the state table. Used when two character categories
|
||||
* have been found equivalent, and merged together, to eliminate the uneeded table column.
|
||||
*/
|
||||
void removeColumn(int32_t column);
|
||||
|
||||
|
||||
|
||||
|
||||
private:
|
||||
|
@ -60,6 +75,12 @@ private:
|
|||
void flagTaggedStates();
|
||||
void mergeRuleStatusVals();
|
||||
|
||||
/**
|
||||
* Merge redundant state table columns, eliminating character classes with identical behavior.
|
||||
* Done after the state tables are generated, just before converting to their run-time format.
|
||||
*/
|
||||
int32_t mergeColumns();
|
||||
|
||||
void addRuleRootNodes(UVector *dest, RBBINode *node);
|
||||
|
||||
// Set functions for UVector.
|
||||
|
@ -112,7 +133,7 @@ public:
|
|||
// with this state. Unordered (it's a set).
|
||||
// UVector contents are RBBINode *
|
||||
|
||||
UVector *fDtran; // Transitions out of this state.
|
||||
UVector32 *fDtran; // Transitions out of this state.
|
||||
// indexed by input character
|
||||
// contents is int index of dest state
|
||||
// in RBBITableBuilder.fDStates
|
||||
|
|
|
@ -60,10 +60,13 @@ private:
|
|||
UText fText;
|
||||
|
||||
/**
|
||||
* The rule data for this BreakIterator instance
|
||||
* The rule data for this BreakIterator instance.
|
||||
* Not for general use; Public only for testing purposes.
|
||||
* @internal
|
||||
*/
|
||||
public:
|
||||
RBBIDataWrapper *fData;
|
||||
private:
|
||||
|
||||
/**
|
||||
* The iteration state - current position, rule status for the current position,
|
||||
|
@ -683,6 +686,13 @@ private:
|
|||
* @internal
|
||||
*/
|
||||
void dumpCache();
|
||||
|
||||
/**
|
||||
* Debugging function only.
|
||||
* @internal
|
||||
*/
|
||||
void dumpTables();
|
||||
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
};
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/localpointer.h"
|
||||
|
@ -39,10 +40,12 @@
|
|||
#include "cstr.h"
|
||||
#include "intltest.h"
|
||||
#include "rbbitst.h"
|
||||
#include "rbbidata.h"
|
||||
#include "utypeinfo.h" // for 'typeid' to work
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
|
||||
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
#include "unicode/filteredbrk.h"
|
||||
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
|
@ -106,6 +109,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
TESTCASE_AUTO(TestEmoji);
|
||||
TESTCASE_AUTO(TestBug12519);
|
||||
TESTCASE_AUTO(TestBug12677);
|
||||
TESTCASE_AUTO(TestTableRedundancies);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -4454,6 +4458,66 @@ void RBBITest::TestBug12677() {
|
|||
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestTableRedundancies() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
RuleBasedBreakIterator *bi =
|
||||
(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
|
||||
// bi->dumpTables();
|
||||
|
||||
RBBIDataWrapper *dw = bi->fData;
|
||||
const RBBIStateTable *fwtbl = dw->fForwardTable;
|
||||
int32_t numCharClasses = dw->fHeader->fCatCount;
|
||||
printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
|
||||
|
||||
// Check for duplicate columns
|
||||
|
||||
std::vector<UnicodeString> columns;
|
||||
for (int32_t column = 0; column < numCharClasses; column++) {
|
||||
UnicodeString s;
|
||||
for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
|
||||
s.append(row->fNextState[column]);
|
||||
}
|
||||
columns.push_back(s);
|
||||
}
|
||||
for (int c1=0; c1<numCharClasses; c1++) {
|
||||
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
|
||||
if (columns.at(c1) == columns.at(c2)) {
|
||||
printf("Duplicate columns (%d, %d)\n", c1, c2);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check for duplicate states
|
||||
std::vector<UnicodeString> rows;
|
||||
for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
|
||||
UnicodeString s;
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
|
||||
if (row->fAccepting < -1) {
|
||||
printf("row %d accepting = %d\n", r, row->fAccepting);
|
||||
}
|
||||
s.append(row->fAccepting + 1); // values of -1 are expected.
|
||||
s.append(row->fLookAhead);
|
||||
s.append(row->fTagIdx);
|
||||
for (int32_t column = 0; column < numCharClasses; column++) {
|
||||
s.append(row->fNextState[column]);
|
||||
}
|
||||
rows.push_back(s);
|
||||
}
|
||||
for (int r1=0; r1<(int32_t)fwtbl->fNumStates; r1++) {
|
||||
for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
|
||||
if (rows.at(r1) == rows.at(r2)) {
|
||||
printf("Duplicate rows (%d, %d)\n", r1, r2);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
delete bi;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// TestDebug - A place-holder test for debugging purposes.
|
||||
// For putting in fragments of other tests that can be invoked
|
||||
|
|
|
@ -75,6 +75,7 @@ public:
|
|||
void TestEmoji();
|
||||
void TestBug12519();
|
||||
void TestBug12677();
|
||||
void TestTableRedundancies();
|
||||
|
||||
void TestDebug();
|
||||
void TestProperties();
|
||||
|
|
Loading…
Add table
Reference in a new issue