ICU-13569 Break Iterator state table optimizations.

X-SVN-Rev: 40920
This commit is contained in:
Andy Heninger 2018-02-15 01:31:01 +00:00
commit a3d84405e5
21 changed files with 646 additions and 77 deletions

View file

@ -1338,6 +1338,10 @@ void RuleBasedBreakIterator::dumpCache() {
fBreakCache->dumpCache();
}
// Debugging aid: forwards to printData() on this iterator's rule data (fData).
// NOTE(review): fData is dereferenced without a null check -- confirm that callers
// only invoke this on a successfully constructed iterator.
void RuleBasedBreakIterator::dumpTables() {
fData->printData();
}
/**
* Returns the description used to create this iterator
*/

View file

@ -267,8 +267,8 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
#endif
#ifdef RBBI_DEBUG
void RBBIDataWrapper::printData() {
#ifdef RBBI_DEBUG
RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
@ -285,8 +285,8 @@ void RBBIDataWrapper::printData() {
RBBIDebugPrintf("%c", fRuleSource[c]);
}
RBBIDebugPrintf("\n\n");
}
#endif
}
U_NAMESPACE_END

View file

@ -165,13 +165,8 @@ public:
UBool operator ==(const RBBIDataWrapper &other) const;
int32_t hashCode();
const UnicodeString &getRuleSourceString() const;
#ifdef RBBI_DEBUG
void printData();
void printTable(const char *heading, const RBBIStateTable *table);
#else
#define printData()
#define printTable(heading, table)
#endif
/* */
/* Pointers to items within the data */

View file

@ -282,10 +282,10 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
//
// UnicodeSet processing.
// Munge the Unicode Sets to create a set of character categories.
// Generate the mapping tables (TRIE) from input 32-bit characters to
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
builder.fSetBuilder->build();
builder.fSetBuilder->buildRanges();
//
@ -317,6 +317,11 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
}
#endif
builder.optimizeTables();
builder.fSetBuilder->buildTrie();
//
// Package up the compiled data into a memory image
// in the run-time format.
@ -348,6 +353,29 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
return This;
}
//
//  optimizeTables()   Shrink the generated state tables.
//
//     Pass 1: repeatedly ask the forward table for a pair of character classes
//             (columns) with identical transitions, fold the second class into
//             the first in the set builder, and drop that column from all four tables.
//     Pass 2: remove duplicate states (rows) from each of the four tables.
//
//  The column search starts at class 3.
//  NOTE(review): presumably the lower-numbered classes are reserved and must not be
//  merged -- confirm against the set builder.
//  NOTE(review): duplicate columns are detected in the forward table only, yet the
//  column is removed from every table -- confirm this is safe for the other three.
//
void RBBIRuleBuilder::optimizeTables() {
    for (int32_t keptClass = 3, redundantClass = 0;
         fForwardTables->findDuplCharClassFrom(keptClass, redundantClass); ) {
        // Re-map the character ranges first, then delete the now unused column everywhere.
        fSetBuilder->mergeCategories(keptClass, redundantClass);

        fForwardTables->removeColumn(redundantClass);
        fReverseTables->removeColumn(redundantClass);
        fSafeFwdTables->removeColumn(redundantClass);
        fSafeRevTables->removeColumn(redundantClass);
    }

    // Row (state) de-duplication, one table at a time.
    fForwardTables->removeDuplicateStates();
    fReverseTables->removeDuplicateStates();
    fSafeFwdTables->removeDuplicateStates();
    fSafeRevTables->removeDuplicateStates();
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */

View file

@ -126,6 +126,14 @@ public:
);
virtual ~RBBIRuleBuilder();
/**
* Fold together redundant character classes (table columns) and
* redundant states (table rows). Done after initial table generation,
* before serializing the result.
*/
void optimizeTables();
char *fDebugEnv; // controls debug trace output
UErrorCode *fStatus; // Error reporting. Keeping status
UParseError *fParseError; // here avoids passing it everywhere.

View file

@ -91,7 +91,7 @@ RBBISetBuilder::~RBBISetBuilder()
// from the Unicode Sets.
//
//------------------------------------------------------------------------
void RBBISetBuilder::build() {
void RBBISetBuilder::buildRanges() {
RBBINode *usetNode;
RangeDescriptor *rlRange;
@ -245,11 +245,16 @@ void RBBISetBuilder::build() {
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
}
//
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number.
//
void RBBISetBuilder::buildTrie() {
RangeDescriptor *rlRange;
//
// Build the Trie table for mapping UChar32 values to the corresponding
// range group number
//
fTrie = utrie2_open(0, // Initial value for all code points.
0, // Error value for out-of-range input.
fStatus);
@ -265,6 +270,22 @@ void RBBISetBuilder::build() {
}
void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) {
U_ASSERT(left >= 1);
U_ASSERT(right > left);
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
int32_t rangeNum = rd->fNum & ~DICT_BIT;
int32_t rangeDict = rd->fNum & DICT_BIT;
if (rangeNum == right) {
rd->fNum = left | rangeDict;
} else if (rangeNum > right) {
rd->fNum--;
}
}
--fGroupCount;
}
//-----------------------------------------------------------------------------------
//
// getTrieSize() Return the size that will be required to serialize the Trie.
@ -446,7 +467,7 @@ void RBBISetBuilder::printRangeGroups() {
lastPrintedGroupNum = groupNum;
RBBIDebugPrintf("%2i ", groupNum);
if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");}
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
@ -639,20 +660,20 @@ void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
void RangeDescriptor::setDictionaryFlag() {
int i;
for (i=0; i<this->fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
UnicodeString setName;
RBBINode *setRef = usetNode->fParent;
if (setRef != NULL) {
static const char16_t *dictionary = u"dictionary";
for (i=0; i<fIncludesSets->size(); i++) {
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
RBBINode *setRef = usetNode->fParent;
if (setRef != nullptr) {
RBBINode *varRef = setRef->fParent;
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
setName = varRef->fText;
if (varRef && varRef->fType == RBBINode::varRef) {
const UnicodeString *setName = &varRef->fText;
if (setName->compare(dictionary, -1) == 0) {
fNum |= RBBISetBuilder::DICT_BIT;
break;
}
}
}
if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
this->fNum |= 0x4000;
break;
}
}
}

View file

@ -82,7 +82,8 @@ public:
RBBISetBuilder(RBBIRuleBuilder *rb);
~RBBISetBuilder();
void build();
void buildRanges();
void buildTrie();
void addValToSets(UVector *sets, uint32_t val);
void addValToSet (RBBINode *usetNode, uint32_t val);
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
@ -93,6 +94,13 @@ public:
UChar32 getFirstChar(int32_t val) const;
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
// character were encountered.
/** merge two character categories that have been identified as having equivalent behavior.
* The ranges belonging to the right category (table column) will be added to the left.
*/
void mergeCategories(int32_t left, int32_t right);
static constexpr int32_t DICT_BIT = 0x4000;
#ifdef RBBI_DEBUG
void printSets();
void printRanges();

View file

@ -22,6 +22,7 @@
#include "rbbidata.h"
#include "cstring.h"
#include "uassert.h"
#include "uvectr32.h"
#include "cmemory.h"
U_NAMESPACE_BEGIN
@ -761,7 +762,7 @@ void RBBITableBuilder::flagAcceptingStates() {
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field or this state also.
// the fLookAhead field for this state also.
if (endMarker->fLookAheadEnd) {
// TODO: don't change value if already set?
// TODO: allow for more than one active look-ahead rule in engine.
@ -1077,7 +1078,128 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
}
#endif
//
//  findDuplCharClassFrom()   Search for two character categories (state table columns)
//                            whose transitions are identical in every state.
//
//     baseCategory  (in/out)  Column at which to start looking. On a true return it
//                             identifies the first column of the duplicate pair.
//     duplCategory  (out)     On a true return, the column that duplicates baseCategory.
//                             The incoming value is ignored; for each base column the
//                             scan restarts at baseCategory+1.
//     return                  true if a duplicate pair was found, false when all
//                             remaining pairs have been examined.
//
//  Transition values are compared as uint16_t, as in the original implementation.
//  If the table has no states, every pair of columns compares equal and the first
//  pair examined is reported; this matches the Java implementation.
//
bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) {
    int32_t numStates = fDStates->size();
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();

    for (; baseCategory < numCols-1; ++baseCategory) {
        for (duplCategory=baseCategory+1; duplCategory < numCols; ++duplCategory) {
            // Tracked with an explicit flag so that nothing is read uninitialized
            // when the state list is empty.
            bool columnsMatch = true;
            for (int32_t state=0; state<numStates; state++) {
                RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
                uint16_t table_base = (uint16_t)sd->fDtran->elementAti(baseCategory);
                uint16_t table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory);
                if (table_base != table_dupl) {
                    columnsMatch = false;
                    break;
                }
            }
            if (columnsMatch) {
                return true;
            }
        }
    }
    return false;
}
//
// removeColumn()
//
void RBBITableBuilder::removeColumn(int32_t column) {
int32_t numStates = fDStates->size();
for (int32_t state=0; state<numStates; state++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
U_ASSERT(column < sd->fDtran->size());
sd->fDtran->removeElementAt(column);
}
}
/*
* findDuplicateState
*/
bool RBBITableBuilder::findDuplicateState(int32_t &firstState, int32_t &duplState) {
int32_t numStates = fDStates->size();
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
for (; firstState<numStates-1; ++firstState) {
RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(firstState);
for (duplState=firstState+1; duplState<numStates; ++duplState) {
RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
if (firstSD->fAccepting != duplSD->fAccepting ||
firstSD->fLookAhead != duplSD->fLookAhead ||
firstSD->fTagsIdx != duplSD->fTagsIdx) {
continue;
}
bool rowsMatch = true;
for (int32_t col=0; col < numCols; ++col) {
int32_t firstVal = firstSD->fDtran->elementAti(col);
int32_t duplVal = duplSD->fDtran->elementAti(col);
if (!((firstVal == duplVal) ||
((firstVal == firstState || firstVal == duplState) &&
(duplVal == firstState || duplVal == duplState)))) {
rowsMatch = false;
break;
}
}
if (rowsMatch) {
return true;
}
}
}
return false;
}
void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
U_ASSERT(keepState < duplState);
U_ASSERT(duplState < fDStates->size());
RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
fDStates->removeElementAt(duplState);
delete duplSD;
int32_t numStates = fDStates->size();
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
for (int32_t state=0; state<numStates; ++state) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
for (int32_t col=0; col<numCols; col++) {
int32_t existingVal = sd->fDtran->elementAti(col);
int32_t newVal = existingVal;
if (existingVal == duplState) {
newVal = keepState;
} else if (existingVal > duplState) {
newVal = existingVal - 1;
}
sd->fDtran->setElementAt(newVal, col);
}
if (sd->fAccepting == duplState) {
sd->fAccepting = keepState;
} else if (sd->fAccepting > duplState) {
sd->fAccepting--;
}
if (sd->fLookAhead == duplState) {
sd->fLookAhead = keepState;
} else if (sd->fLookAhead > duplState) {
sd->fLookAhead--;
}
}
}
/*
* RemoveDuplicateStates
*/
void RBBITableBuilder::removeDuplicateStates() {
int32_t firstState = 3;
int32_t duplicateState = 0;
while (findDuplicateState(firstState, duplicateState)) {
// printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState);
removeState(firstState, duplicateState);
}
}
//-----------------------------------------------------------------------------
//
@ -1106,7 +1228,6 @@ int32_t RBBITableBuilder::getTableSize() const {
}
//-----------------------------------------------------------------------------
//
// exportTable() export the state transition table in the format required
@ -1256,7 +1377,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
fPositions = NULL;
fDtran = NULL;
fDtran = new UVector(lastInputSymbol+1, *fStatus);
fDtran = new UVector32(lastInputSymbol+1, *fStatus);
if (U_FAILURE(*fStatus)) {
return;
}
@ -1264,7 +1385,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
*fStatus = U_MEMORY_ALLOCATION_ERROR;
return;
}
fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
// It is indexed by input symbols, and will
// hold the next state number for each
// symbol.

View file

@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
class RBBIRuleScanner;
class RBBIRuleBuilder;
class UVector32;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
@ -42,9 +43,24 @@ public:
void build();
int32_t getTableSize() const; // Return the runtime size in bytes of
// the built state table
void exportTable(void *where); // fill in the runtime state table.
// Sufficient memory must exist at
// the specified location.
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
*/
void exportTable(void *where);
/** Find duplicate (redundant) character classes within this state table, starting
 *  the search at the specified base class. This is an iterator-like function, used to
 *  identify char classes (state table columns) that can be eliminated.
 *  @param baseClass  (in/out) class at which to begin looking; on a true return,
 *                    the first class of the duplicate pair.
 *  @param duplClass  (out) on a true return, the class that duplicates baseClass.
 *  @return true if a pair of duplicate classes was found.
 */
bool findDuplCharClassFrom(int32_t &baseClass, int32_t &duplClass);
/** Remove a column from the state table. Used when two character categories
* have been found equivalent, and merged together, to eliminate the unneeded table column.
*/
void removeColumn(int32_t column);
/** Check for, and remove duplicate states (table rows). */
void removeDuplicateStates();
private:
@ -60,8 +76,29 @@ private:
void flagTaggedStates();
void mergeRuleStatusVals();
/**
* Merge redundant state table columns, eliminating character classes with identical behavior.
* Done after the state tables are generated, just before converting to their run-time format.
*/
int32_t mergeColumns();
void addRuleRootNodes(UVector *dest, RBBINode *node);
/** Find the next duplicate state. An iterator function.
* @param firstState (in/out) begin looking at this state, return the first of the
* pair of duplicates.
* @param duplicateState returns the duplicate state of firstState
* @return true if a duplicate pair of states was found.
*/
bool findDuplicateState(int32_t &firstState, int32_t &duplicateState);
/** Remove a duplicate state.
* @param keepState First of the duplicate pair. Keep it.
* @param duplState Duplicate state. Remove it. Redirect all references to the duplicate state
* to refer to keepState instead.
*/
void removeState(int32_t keepState, int32_t duplState);
// Set functions for UVector.
// TODO: make a USet subclass of UVector
@ -112,7 +149,7 @@ public:
// with this state. Unordered (it's a set).
// UVector contents are RBBINode *
UVector *fDtran; // Transitions out of this state.
UVector32 *fDtran; // Transitions out of this state.
// indexed by input character
// contents is int index of dest state
// in RBBITableBuilder.fDStates

View file

@ -60,10 +60,13 @@ private:
UText fText;
/**
* The rule data for this BreakIterator instance
* The rule data for this BreakIterator instance.
* Not for general use; Public only for testing purposes.
* @internal
*/
public:
RBBIDataWrapper *fData;
private:
/**
* The iteration state - current position, rule status for the current position,
@ -683,6 +686,13 @@ private:
* @internal
*/
void dumpCache();
/**
* Debugging function only.
* @internal
*/
void dumpTables();
#endif /* U_HIDE_INTERNAL_API */
};

View file

@ -17,6 +17,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include "unicode/brkiter.h"
#include "unicode/localpointer.h"
@ -39,10 +40,12 @@
#include "cstr.h"
#include "intltest.h"
#include "rbbitst.h"
#include "rbbidata.h"
#include "utypeinfo.h" // for 'typeid' to work
#include "uvector.h"
#include "uvectr32.h"
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
@ -106,6 +109,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
TESTCASE_AUTO(TestEmoji);
TESTCASE_AUTO(TestBug12519);
TESTCASE_AUTO(TestBug12677);
TESTCASE_AUTO(TestTableRedundancies);
TESTCASE_AUTO_END;
}
@ -4454,6 +4458,67 @@ void RBBITest::TestBug12677() {
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
}
void RBBITest::TestTableRedundancies() {
UErrorCode status = U_ZERO_ERROR;
LocalPointer<RuleBasedBreakIterator> bi (
(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
assertSuccess(WHERE, status);
if (U_FAILURE(status)) return;
RBBIDataWrapper *dw = bi->fData;
const RBBIStateTable *fwtbl = dw->fForwardTable;
int32_t numCharClasses = dw->fHeader->fCatCount;
// printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
// Check for duplicate columns (character categories)
std::vector<UnicodeString> columns;
for (int32_t column = 0; column < numCharClasses; column++) {
UnicodeString s;
for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
s.append(row->fNextState[column]);
}
columns.push_back(s);
}
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
for (int c1=1; c1<numCharClasses; c1++) {
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
if (columns.at(c1) == columns.at(c2)) {
errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
goto out;
}
}
}
out:
// Check for duplicate states
std::vector<UnicodeString> rows;
for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
UnicodeString s;
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
assertTrue(WHERE, row->fAccepting >= -1);
s.append(row->fAccepting + 1); // values of -1 are expected.
s.append(row->fLookAhead);
s.append(row->fTagIdx);
for (int32_t column = 0; column < numCharClasses; column++) {
s.append(row->fNextState[column]);
}
rows.push_back(s);
}
for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
if (rows.at(r1) == rows.at(r2)) {
errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
return;
}
}
}
}
//
// TestDebug - A place-holder test for debugging purposes.
// For putting in fragments of other tests that can be invoked

View file

@ -75,6 +75,7 @@ public:
void TestEmoji();
void TestBug12519();
void TestBug12677();
void TestTableRedundancies();
void TestDebug();
void TestProperties();

View file

@ -39,18 +39,16 @@
# Temp debugging tests
<locale en>
<word>
<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
。<0></data>
#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
<rules>
$s0=[;,*];
$s1=[a-z];
$s2=[i-n];
$s3=[x-z];
!!forward;
($s0 | '?')*
($s1 | $s2 | $s3)*;
</rules>
<data>•hello• •</data>
## FILTERED BREAK TESTS

View file

@ -18,17 +18,19 @@ import com.ibm.icu.impl.ICUBinary.Authenticate;
import com.ibm.icu.impl.Trie2;
/**
* <p>Internal class used for Rule Based Break Iterators</p>
* <p>Internal class used for Rule Based Break Iterators.</p>
* <p>This class provides access to the compiled break rule data, as
* it is stored in a .brk file.
* Not intended for public use; declared public for testing purposes only.
* @internal
*/
final class RBBIDataWrapper {
public final class RBBIDataWrapper {
//
// These fields are the ready-to-use compiled rule data, as
// read from the file.
//
RBBIDataHeader fHeader;
short fFTable[];
public RBBIDataHeader fHeader;
public short fFTable[];
short fRTable[];
short fSFTable[];
short fSRTable[];
@ -78,11 +80,16 @@ final class RBBIDataWrapper {
// Index offsets to the fields in a state table row.
// Corresponds to struct RBBIStateTableRow in the C version.
//
final static int ACCEPTING = 0;
final static int LOOKAHEAD = 1;
final static int TAGIDX = 2;
final static int RESERVED = 3;
final static int NEXTSTATES = 4;
/** @internal */
public final static int ACCEPTING = 0;
/** @internal */
public final static int LOOKAHEAD = 1;
/** @internal */
public final static int TAGIDX = 2;
/** @internal */
public final static int RESERVED = 3;
/** @internal */
public final static int NEXTSTATES = 4;
// Index offsets to header fields of a state table
// struct RBBIStateTable {... in the C version.
@ -101,13 +108,15 @@ final class RBBIDataWrapper {
/**
* Data Header. A struct-like class with the fields from the RBBI data file header.
* Not intended for public use, declared public for testing purposes only.
* @internal
*/
final static class RBBIDataHeader {
public final static class RBBIDataHeader {
int fMagic; // == 0xbla0
byte[] fFormatVersion; // For ICU 3.4 and later.
int fLength; // Total length in bytes of this RBBI Data,
// including all sections, not just the header.
int fCatCount; // Number of character categories.
public int fCatCount; // Number of character categories.
//
// Offsets and sizes of each of the subsections within the RBBI data.
@ -139,9 +148,9 @@ final class RBBIDataWrapper {
/**
* RBBI State Table Indexing Function. Given a state number, return the
* array index of the start of the state table row for that state.
*
* @internal
*/
int getRowIndex(int state){
public int getRowIndex(int state){
return ROW_DATA + state * (fHeader.fCatCount + 4);
}
@ -311,17 +320,17 @@ final class RBBIDataWrapper {
return This;
}
///CLOVER:OFF
// Getters for fields from the state table header
//
private int getStateTableNumStates(short table[]) {
/**
* Getters for fields from the state table header
* @internal
*/
public int getStateTableNumStates(short table[]) {
if (isBigEndian) {
return (table[NUMSTATES] << 16) | (table[NUMSTATES+1] & 0xffff);
} else {
return (table[NUMSTATES+1] << 16) | (table[NUMSTATES] & 0xffff);
}
}
///CLOVER:ON
int getStateTableFlags(short table[]) {
// This works for up to 15 flags bits.

View file

@ -342,10 +342,10 @@ class RBBIRuleBuilder {
//
// UnicodeSet processing.
// Munge the Unicode Sets to create a set of character categories.
// Generate the mapping tables (TRIE) from input 32-bit characters to
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
builder.fSetBuilder.build();
builder.fSetBuilder.buildRanges();
//
// Generate the DFA state transition table.
@ -363,10 +363,38 @@ class RBBIRuleBuilder {
builder.fForwardTables.printRuleStatusTable();
}
builder.optimizeTables();
builder.fSetBuilder.buildTrie();
//
// Package up the compiled data, writing it to an output stream
// in the serialization format. This is the same as the ICU4C runtime format.
//
builder.flattenData(os);
}
/**
 * A mutable pair of ints. Used as an in/out parameter by the table
 * optimization functions, which have no other way to return two values.
 */
static class IntPair {
    int first = 0;
    int second = 0;

    /** Creates the pair (0, 0). */
    IntPair() {
    }

    /** Creates the pair (f, s). */
    IntPair(int f, int s) {
        this.first = f;
        this.second = s;
    }
}
/**
 * Shrink the generated state tables.
 * First, character classes (columns) that the forward table reports as duplicates
 * are merged in the set builder and the redundant column is removed from all four
 * tables. Then duplicate states (rows) are removed from each table.
 * The column search starts at class 3.
 */
void optimizeTables() {
    IntPair classes = new IntPair(3, 0);
    while (fForwardTables.findDuplCharClassFrom(classes)) {
        final int keptClass = classes.first;
        final int redundantClass = classes.second;

        fSetBuilder.mergeCategories(keptClass, redundantClass);

        fForwardTables.removeColumn(redundantClass);
        fReverseTables.removeColumn(redundantClass);
        fSafeFwdTables.removeColumn(redundantClass);
        fSafeRevTables.removeColumn(redundantClass);
    }

    fForwardTables.removeDuplicateStates();
    fReverseTables.removeDuplicateStates();
    fSafeFwdTables.removeDuplicateStates();
    fSafeRevTables.removeDuplicateStates();
}
}

View file

@ -112,7 +112,7 @@ class RBBISetBuilder {
}
}
if (setName.equals("dictionary")) {
this.fNum |= 0x4000;
this.fNum |= DICT_BIT;
break;
}
}
@ -138,6 +138,8 @@ class RBBISetBuilder {
boolean fSawBOF;
static final int DICT_BIT = 0x4000;
//------------------------------------------------------------------------
//
@ -156,7 +158,7 @@ class RBBISetBuilder {
// from the Unicode Sets.
//
//------------------------------------------------------------------------
void build() {
void buildRanges() {
RangeDescriptor rlRange;
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();}
@ -280,6 +282,15 @@ class RBBISetBuilder {
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
}
/**
* Build the Trie table for mapping UChar32 values to the corresponding
* range group number.
*/
void buildTrie() {
RangeDescriptor rlRange;
fTrie = new Trie2Writable(0, // Initial value for all code points.
0); // Error value for out-of-range input.
@ -294,6 +305,24 @@ class RBBISetBuilder {
}
}
/**
 * Fold character category {@code right} into category {@code left}.
 * Ranges numbered {@code right} are renumbered to {@code left}, keeping their
 * DICT_BIT flag. Ranges numbered above {@code right} move down by one so the
 * category numbers stay contiguous. The group count is reduced by one.
 * Requires {@code 1 <= left < right}.
 */
void mergeCategories(int left, int right) {
    assert(left >= 1);
    assert(right > left);

    RangeDescriptor range = fRangeList;
    while (range != null) {
        final int dictFlag = range.fNum & DICT_BIT;
        final int category = range.fNum & ~DICT_BIT;
        if (category > right) {
            range.fNum--;
        } else if (category == right) {
            range.fNum = left | dictFlag;
        }
        range = range.fNext;
    }
    --fGroupCount;
}
//-----------------------------------------------------------------------------------
//
@ -457,7 +486,7 @@ class RBBISetBuilder {
if (groupNum<10) {System.out.print(" ");}
System.out.print(groupNum + " ");
if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");}
if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" <DICT> ");}
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
RBBINode usetNode = rlRange.fIncludesSets.get(i);

View file

@ -10,6 +10,7 @@
package com.ibm.icu.text;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
@ -20,6 +21,7 @@ import java.util.TreeSet;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
@ -655,7 +657,7 @@ class RBBITableBuilder {
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
// If the end marker node is from a look-ahead rule, set
// the fLookAhead field or this state also.
// the fLookAhead field for this state also.
if (endMarker.fLookAheadEnd) {
// TODO: don't change value if already set?
// TODO: allow for more than one active look-ahead rule in engine.
@ -832,6 +834,149 @@ class RBBITableBuilder {
/**
 * Search this state table for two character classes (columns) whose transitions
 * are identical in every state. An iterator-like function: the search starts at
 * {@code categories.first}, and for each base column the second column restarts
 * at the one immediately following it.
 * @param categories in/out parameter. On entry, {@code first} is where to start looking;
 *     on a true return, holds the pair of duplicate columns found.
 * @return true if duplicate char classes were found, false otherwise.
 * @internal
 */
boolean findDuplCharClassFrom(RBBIRuleBuilder.IntPair categories) {
    final int stateCount = fDStates.size();
    final int columnCount = fRB.fSetBuilder.getNumCharCategories();

    while (categories.first < columnCount-1) {
        for (categories.second = categories.first+1; categories.second < columnCount; ++categories.second) {
            // With no states at all, the columns trivially match.
            boolean columnsMatch = true;
            for (int state = 0; state < stateCount; state++) {
                int[] transitions = fDStates.get(state).fDtran;
                if (transitions[categories.first] != transitions[categories.second]) {
                    columnsMatch = false;
                    break;
                }
            }
            if (columnsMatch) {
                return true;
            }
        }
        ++categories.first;
    }
    return false;
}
/**
 * Delete one character category (column) from the transition array of every state.
 * Used after two categories have been found equivalent and merged, to drop the
 * column that is no longer needed.
 * @param column index of the column to remove
 */
void removeColumn(int column) {
    for (RBBIStateDescriptor sd : fDStates) {
        assert(column < sd.fDtran.length);
        int[] previous = sd.fDtran;
        int[] shrunk = new int[previous.length - 1];
        // Copy the entries before the removed column, then those after it.
        System.arraycopy(previous, 0, shrunk, 0, column);
        System.arraycopy(previous, column+1, shrunk, column, shrunk.length - column);
        sd.fDtran = shrunk;
    }
}
/**
 * Search this state table for a pair of equivalent states (rows). An iterator-like
 * function, used to identify states that can be eliminated.
 * Two states are treated as duplicates when their fAccepting, fLookAhead and fTagsIdx
 * fields are equal and, for every column, the two transitions are either equal or
 * both lead to one of the two states being compared.
 * @param states in/out parameter. On entry, {@code first} is where to start looking;
 *     on a true return, holds the pair of duplicate states found.
 * @return true if duplicate states were found, false otherwise.
 * @internal
 */
boolean findDuplicateState(RBBIRuleBuilder.IntPair states) {
    final int stateCount = fDStates.size();
    final int columnCount = fRB.fSetBuilder.getNumCharCategories();

    while (states.first < stateCount-1) {
        RBBIStateDescriptor keeper = fDStates.get(states.first);
        for (states.second = states.first+1; states.second < stateCount; ++states.second) {
            RBBIStateDescriptor candidate = fDStates.get(states.second);

            boolean sameAttributes = keeper.fAccepting == candidate.fAccepting &&
                                     keeper.fLookAhead == candidate.fLookAhead &&
                                     keeper.fTagsIdx   == candidate.fTagsIdx;
            if (!sameAttributes) {
                continue;
            }

            boolean rowsMatch = true;
            for (int col = 0; col < columnCount; ++col) {
                int keeperNext = keeper.fDtran[col];
                int candidateNext = candidate.fDtran[col];
                if (keeperNext == candidateNext) {
                    continue;
                }
                // Differing targets are still acceptable when both stay inside the pair.
                boolean keeperInPair = (keeperNext == states.first || keeperNext == states.second);
                boolean candidateInPair = (candidateNext == states.first || candidateNext == states.second);
                if (!(keeperInPair && candidateInPair)) {
                    rowsMatch = false;
                    break;
                }
            }
            if (rowsMatch) {
                return true;
            }
        }
        ++states.first;
    }
    return false;
}
/**
 * Delete a duplicate state (row) from the state table and redirect references to it.
 * Every remaining value equal to {@code duplState} becomes {@code keepState}; every
 * value greater than {@code duplState} is reduced by one to account for the deleted
 * row. This is applied to all transitions and to each state's fAccepting and
 * fLookAhead fields.
 * @param keepState The first of the duplicate pair of states, the one to be kept.
 * @param duplState The second of the duplicate pair, the one to be removed.
 * @internal
 */
void removeState(int keepState, int duplState) {
    assert(keepState < duplState);
    assert(duplState < fDStates.size());

    fDStates.remove(duplState);

    final int stateCount = fDStates.size();
    final int columnCount = fRB.fSetBuilder.getNumCharCategories();
    for (int row = 0; row < stateCount; ++row) {
        RBBIStateDescriptor sd = fDStates.get(row);

        for (int col = 0; col < columnCount; col++) {
            int target = sd.fDtran[col];
            if (target > duplState) {
                sd.fDtran[col] = target - 1;
            } else if (target == duplState) {
                sd.fDtran[col] = keepState;
            }
        }

        if (sd.fAccepting > duplState) {
            sd.fAccepting--;
        } else if (sd.fAccepting == duplState) {
            sd.fAccepting = keepState;
        }

        if (sd.fLookAhead > duplState) {
            sd.fLookAhead--;
        } else if (sd.fLookAhead == duplState) {
            sd.fLookAhead = keepState;
        }
    }
}
/**
 * Repeatedly locate a pair of duplicate states (table rows) and remove the second
 * of the pair, until no more duplicates are reported. The search begins at state 3.
 * @internal
 */
void removeDuplicateStates() {
    for (IntPair pair = new IntPair(3, 0); findDuplicateState(pair); ) {
        // System.out.printf("Removing duplicate states (%d, %d)\n", pair.first, pair.second);
        removeState(pair.first, pair.second);
    }
}
//-----------------------------------------------------------------------------
//

View file

@ -222,9 +222,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
private CharacterIterator fText = new java.text.StringCharacterIterator("");
/**
* The rule data for this BreakIterator instance. Package private.
* The rule data for this BreakIterator instance.
* Not intended for public use. Declared public for testing purposes only.
* @internal
*/
RBBIDataWrapper fRData;
public RBBIDataWrapper fRData;
/**
* The iteration state - current position, rule status for the current position,

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5dc7e055663128ae9a1dae471f9a7cdaf5c28e8177b2231e8b0341c3dc8bb842
size 12475727
oid sha256:f78382b447bb13c03234b53e18b013cea1d2ff6a0f71679885ee00d787003822
size 12475101

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:00c43dc113177aa5a0ffda563ac2822f3dc92fdb7ae748a5938529e32a0b67d9
oid sha256:413dd55333970ffe2a54ac323cb9b5e2a7709a06d132e6e60eb41282489e0970
size 92739

View file

@ -19,6 +19,7 @@ package com.ibm.icu.dev.test.rbbi;
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.junit.Test;
import org.junit.runner.RunWith;
@ -26,6 +27,7 @@ import org.junit.runners.JUnit4;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RBBIDataWrapper;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
@ -562,4 +564,62 @@ public class RBBITest extends TestFmwk {
String rtRules = bi.toString(); // getRules() in C++
assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules);
}
/**
 * Verify that the forward state table produced by the Java rule builder contains
 * no two identical columns (character classes) and no two identical rows (states).
 */
@Test
public void TestTableRedundancies() {
    // Build a break iterator from source rules.
    // Want to check the rule builder in Java, not the pre-built rules that are imported from ICU4C.
    RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance(Locale.ENGLISH);
    String rules = bi.toString();
    bi = new RuleBasedBreakIterator(rules);

    RBBIDataWrapper dw = bi.fRData;
    short[] fwtbl = dw.fFTable;
    int numCharClasses = dw.fHeader.fCatCount;
    int numStates = dw.getStateTableNumStates(fwtbl);

    // Check for duplicate columns (character categories).
    // Each column is flattened into a string with one char per state, skipping state 0.
    List<String> columns = new ArrayList<String>();
    for (int column=0; column<numCharClasses; column++) {
        StringBuilder s = new StringBuilder();
        for (int r = 1; r < numStates; r++) {
            int row = dw.getRowIndex(r);
            short tableVal = fwtbl[row + RBBIDataWrapper.NEXTSTATES + column];
            s.append((char)tableVal);
        }
        columns.add(s.toString());
    }
    // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
    for (int c1=1; c1<numCharClasses; c1++) {
        for (int c2 = c1+1; c2 < numCharClasses; c2++) {
            assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
        }
    }

    // Check for duplicate states.
    List<String> rows = new ArrayList<String>();
    for (int r=0; r<numStates; r++) {
        StringBuilder s = new StringBuilder();
        int row = dw.getRowIndex(r);
        assertTrue("Accepting < -1", fwtbl[row + RBBIDataWrapper.ACCEPTING] >= -1);
        // The header fields are appended as decimal numbers. Separate them so that
        // different field values cannot run together into the same key
        // (e.g. 1,12 versus 11,2).
        s.append(fwtbl[row + RBBIDataWrapper.ACCEPTING]).append(',');
        s.append(fwtbl[row + RBBIDataWrapper.LOOKAHEAD]).append(',');
        s.append(fwtbl[row + RBBIDataWrapper.TAGIDX]).append(',');
        for (int column=0; column<numCharClasses; column++) {
            short tableVal = fwtbl[row + RBBIDataWrapper.NEXTSTATES + column];
            s.append((char)tableVal);
        }
        rows.add(s.toString());
    }

    for (int r1=0; r1 < numStates; r1++) {
        for (int r2= r1+1; r2 < numStates; r2++) {
            assertFalse(String.format("Duplicate states (%d, %d)", r1, r2), rows.get(r1).equals(rows.get(r2)));
        }
    }
}
}