mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-13569 Break Iterator state table optimizations.
X-SVN-Rev: 40920
This commit is contained in:
commit
a3d84405e5
21 changed files with 646 additions and 77 deletions
|
@ -1338,6 +1338,10 @@ void RuleBasedBreakIterator::dumpCache() {
|
|||
fBreakCache->dumpCache();
|
||||
}
|
||||
|
||||
void RuleBasedBreakIterator::dumpTables() {
|
||||
fData->printData();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the description used to create this iterator
|
||||
*/
|
||||
|
|
|
@ -267,8 +267,8 @@ void RBBIDataWrapper::printTable(const char *heading, const RBBIStateTable *tab
|
|||
#endif
|
||||
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
void RBBIDataWrapper::printData() {
|
||||
#ifdef RBBI_DEBUG
|
||||
RBBIDebugPrintf("RBBI Data at %p\n", (void *)fHeader);
|
||||
RBBIDebugPrintf(" Version = {%d %d %d %d}\n", fHeader->fFormatVersion[0], fHeader->fFormatVersion[1],
|
||||
fHeader->fFormatVersion[2], fHeader->fFormatVersion[3]);
|
||||
|
@ -285,8 +285,8 @@ void RBBIDataWrapper::printData() {
|
|||
RBBIDebugPrintf("%c", fRuleSource[c]);
|
||||
}
|
||||
RBBIDebugPrintf("\n\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -165,13 +165,8 @@ public:
|
|||
UBool operator ==(const RBBIDataWrapper &other) const;
|
||||
int32_t hashCode();
|
||||
const UnicodeString &getRuleSourceString() const;
|
||||
#ifdef RBBI_DEBUG
|
||||
void printData();
|
||||
void printTable(const char *heading, const RBBIStateTable *table);
|
||||
#else
|
||||
#define printData()
|
||||
#define printTable(heading, table)
|
||||
#endif
|
||||
|
||||
/* */
|
||||
/* Pointers to items within the data */
|
||||
|
|
|
@ -282,10 +282,10 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
//
|
||||
// UnicodeSet processing.
|
||||
// Munge the Unicode Sets to create a set of character categories.
|
||||
// Generate the mapping tables (TRIE) from input 32-bit characters to
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
//
|
||||
builder.fSetBuilder->build();
|
||||
builder.fSetBuilder->buildRanges();
|
||||
|
||||
|
||||
//
|
||||
|
@ -317,6 +317,11 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
}
|
||||
#endif
|
||||
|
||||
builder.optimizeTables();
|
||||
builder.fSetBuilder->buildTrie();
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Package up the compiled data into a memory image
|
||||
// in the run-time format.
|
||||
|
@ -348,6 +353,29 @@ RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules,
|
|||
return This;
|
||||
}
|
||||
|
||||
void RBBIRuleBuilder::optimizeTables() {
|
||||
int32_t leftClass;
|
||||
int32_t rightClass;
|
||||
|
||||
leftClass = 3;
|
||||
rightClass = 0;
|
||||
while (fForwardTables->findDuplCharClassFrom(leftClass, rightClass)) {
|
||||
fSetBuilder->mergeCategories(leftClass, rightClass);
|
||||
fForwardTables->removeColumn(rightClass);
|
||||
fReverseTables->removeColumn(rightClass);
|
||||
fSafeFwdTables->removeColumn(rightClass);
|
||||
fSafeRevTables->removeColumn(rightClass);
|
||||
}
|
||||
|
||||
fForwardTables->removeDuplicateStates();
|
||||
fReverseTables->removeDuplicateStates();
|
||||
fSafeFwdTables->removeDuplicateStates();
|
||||
fSafeRevTables->removeDuplicateStates();
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
|
|
@ -126,6 +126,14 @@ public:
|
|||
);
|
||||
|
||||
virtual ~RBBIRuleBuilder();
|
||||
|
||||
/**
|
||||
* Fold together redundant character classes (table columns) and
|
||||
* redundant states (table rows). Done after initial table generation,
|
||||
* before serializing the result.
|
||||
*/
|
||||
void optimizeTables();
|
||||
|
||||
char *fDebugEnv; // controls debug trace output
|
||||
UErrorCode *fStatus; // Error reporting. Keeping status
|
||||
UParseError *fParseError; // here avoids passing it everywhere.
|
||||
|
|
|
@ -91,7 +91,7 @@ RBBISetBuilder::~RBBISetBuilder()
|
|||
// from the Unicode Sets.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void RBBISetBuilder::build() {
|
||||
void RBBISetBuilder::buildRanges() {
|
||||
RBBINode *usetNode;
|
||||
RangeDescriptor *rlRange;
|
||||
|
||||
|
@ -245,11 +245,16 @@ void RBBISetBuilder::build() {
|
|||
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
|
||||
if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number.
|
||||
//
|
||||
void RBBISetBuilder::buildTrie() {
|
||||
RangeDescriptor *rlRange;
|
||||
|
||||
//
|
||||
// Build the Trie table for mapping UChar32 values to the corresponding
|
||||
// range group number
|
||||
//
|
||||
fTrie = utrie2_open(0, // Initial value for all code points.
|
||||
0, // Error value for out-of-range input.
|
||||
fStatus);
|
||||
|
@ -265,6 +270,22 @@ void RBBISetBuilder::build() {
|
|||
}
|
||||
|
||||
|
||||
void RBBISetBuilder::mergeCategories(int32_t left, int32_t right) {
|
||||
U_ASSERT(left >= 1);
|
||||
U_ASSERT(right > left);
|
||||
for (RangeDescriptor *rd = fRangeList; rd != nullptr; rd = rd->fNext) {
|
||||
int32_t rangeNum = rd->fNum & ~DICT_BIT;
|
||||
int32_t rangeDict = rd->fNum & DICT_BIT;
|
||||
if (rangeNum == right) {
|
||||
rd->fNum = left | rangeDict;
|
||||
} else if (rangeNum > right) {
|
||||
rd->fNum--;
|
||||
}
|
||||
}
|
||||
--fGroupCount;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
|
@ -446,7 +467,7 @@ void RBBISetBuilder::printRangeGroups() {
|
|||
lastPrintedGroupNum = groupNum;
|
||||
RBBIDebugPrintf("%2i ", groupNum);
|
||||
|
||||
if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
|
||||
if (rlRange->fNum & DICT_BIT) { RBBIDebugPrintf(" <DICT> ");}
|
||||
|
||||
for (i=0; i<rlRange->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
|
||||
|
@ -639,20 +660,20 @@ void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
|
|||
void RangeDescriptor::setDictionaryFlag() {
|
||||
int i;
|
||||
|
||||
for (i=0; i<this->fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
|
||||
UnicodeString setName;
|
||||
RBBINode *setRef = usetNode->fParent;
|
||||
if (setRef != NULL) {
|
||||
static const char16_t *dictionary = u"dictionary";
|
||||
for (i=0; i<fIncludesSets->size(); i++) {
|
||||
RBBINode *usetNode = (RBBINode *)fIncludesSets->elementAt(i);
|
||||
RBBINode *setRef = usetNode->fParent;
|
||||
if (setRef != nullptr) {
|
||||
RBBINode *varRef = setRef->fParent;
|
||||
if (varRef != NULL && varRef->fType == RBBINode::varRef) {
|
||||
setName = varRef->fText;
|
||||
if (varRef && varRef->fType == RBBINode::varRef) {
|
||||
const UnicodeString *setName = &varRef->fText;
|
||||
if (setName->compare(dictionary, -1) == 0) {
|
||||
fNum |= RBBISetBuilder::DICT_BIT;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) { // TODO: no string literals.
|
||||
this->fNum |= 0x4000;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -82,7 +82,8 @@ public:
|
|||
RBBISetBuilder(RBBIRuleBuilder *rb);
|
||||
~RBBISetBuilder();
|
||||
|
||||
void build();
|
||||
void buildRanges();
|
||||
void buildTrie();
|
||||
void addValToSets(UVector *sets, uint32_t val);
|
||||
void addValToSet (RBBINode *usetNode, uint32_t val);
|
||||
int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
|
||||
|
@ -93,6 +94,13 @@ public:
|
|||
UChar32 getFirstChar(int32_t val) const;
|
||||
UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
|
||||
// character were encountered.
|
||||
/** merge two character categories that have been identified as having equivalent behavior.
|
||||
* The ranges belonging to the right category (table column) will be added to the left.
|
||||
*/
|
||||
void mergeCategories(int32_t left, int32_t right);
|
||||
|
||||
static constexpr int32_t DICT_BIT = 0x4000;
|
||||
|
||||
#ifdef RBBI_DEBUG
|
||||
void printSets();
|
||||
void printRanges();
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include "rbbidata.h"
|
||||
#include "cstring.h"
|
||||
#include "uassert.h"
|
||||
#include "uvectr32.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -761,7 +762,7 @@ void RBBITableBuilder::flagAcceptingStates() {
|
|||
// if sd->fAccepting already had a value other than 0 or -1, leave it be.
|
||||
|
||||
// If the end marker node is from a look-ahead rule, set
|
||||
// the fLookAhead field or this state also.
|
||||
// the fLookAhead field for this state also.
|
||||
if (endMarker->fLookAheadEnd) {
|
||||
// TODO: don't change value if already set?
|
||||
// TODO: allow for more than one active look-ahead rule in engine.
|
||||
|
@ -1077,7 +1078,128 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
|
|||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
// findDuplCharClassFrom()
|
||||
//
|
||||
bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) {
|
||||
int32_t numStates = fDStates->size();
|
||||
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
uint16_t table_base;
|
||||
uint16_t table_dupl;
|
||||
for (; baseCategory < numCols-1; ++baseCategory) {
|
||||
for (duplCategory=baseCategory+1; duplCategory < numCols; ++duplCategory) {
|
||||
for (int32_t state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
|
||||
table_base = (uint16_t)sd->fDtran->elementAti(baseCategory);
|
||||
table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory);
|
||||
if (table_base != table_dupl) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (table_base == table_dupl) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// removeColumn()
|
||||
//
|
||||
void RBBITableBuilder::removeColumn(int32_t column) {
|
||||
int32_t numStates = fDStates->size();
|
||||
for (int32_t state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
|
||||
U_ASSERT(column < sd->fDtran->size());
|
||||
sd->fDtran->removeElementAt(column);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* findDuplicateState
|
||||
*/
|
||||
bool RBBITableBuilder::findDuplicateState(int32_t &firstState, int32_t &duplState) {
|
||||
int32_t numStates = fDStates->size();
|
||||
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
|
||||
for (; firstState<numStates-1; ++firstState) {
|
||||
RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(firstState);
|
||||
for (duplState=firstState+1; duplState<numStates; ++duplState) {
|
||||
RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
|
||||
if (firstSD->fAccepting != duplSD->fAccepting ||
|
||||
firstSD->fLookAhead != duplSD->fLookAhead ||
|
||||
firstSD->fTagsIdx != duplSD->fTagsIdx) {
|
||||
continue;
|
||||
}
|
||||
bool rowsMatch = true;
|
||||
for (int32_t col=0; col < numCols; ++col) {
|
||||
int32_t firstVal = firstSD->fDtran->elementAti(col);
|
||||
int32_t duplVal = duplSD->fDtran->elementAti(col);
|
||||
if (!((firstVal == duplVal) ||
|
||||
((firstVal == firstState || firstVal == duplState) &&
|
||||
(duplVal == firstState || duplVal == duplState)))) {
|
||||
rowsMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rowsMatch) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
|
||||
U_ASSERT(keepState < duplState);
|
||||
U_ASSERT(duplState < fDStates->size());
|
||||
|
||||
RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
|
||||
fDStates->removeElementAt(duplState);
|
||||
delete duplSD;
|
||||
|
||||
int32_t numStates = fDStates->size();
|
||||
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
|
||||
for (int32_t state=0; state<numStates; ++state) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
|
||||
for (int32_t col=0; col<numCols; col++) {
|
||||
int32_t existingVal = sd->fDtran->elementAti(col);
|
||||
int32_t newVal = existingVal;
|
||||
if (existingVal == duplState) {
|
||||
newVal = keepState;
|
||||
} else if (existingVal > duplState) {
|
||||
newVal = existingVal - 1;
|
||||
}
|
||||
sd->fDtran->setElementAt(newVal, col);
|
||||
}
|
||||
if (sd->fAccepting == duplState) {
|
||||
sd->fAccepting = keepState;
|
||||
} else if (sd->fAccepting > duplState) {
|
||||
sd->fAccepting--;
|
||||
}
|
||||
if (sd->fLookAhead == duplState) {
|
||||
sd->fLookAhead = keepState;
|
||||
} else if (sd->fLookAhead > duplState) {
|
||||
sd->fLookAhead--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* RemoveDuplicateStates
|
||||
*/
|
||||
void RBBITableBuilder::removeDuplicateStates() {
|
||||
int32_t firstState = 3;
|
||||
int32_t duplicateState = 0;
|
||||
while (findDuplicateState(firstState, duplicateState)) {
|
||||
// printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState);
|
||||
removeState(firstState, duplicateState);
|
||||
}
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -1106,7 +1228,6 @@ int32_t RBBITableBuilder::getTableSize() const {
|
|||
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// exportTable() export the state transition table in the format required
|
||||
|
@ -1256,7 +1377,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
|
|||
fPositions = NULL;
|
||||
fDtran = NULL;
|
||||
|
||||
fDtran = new UVector(lastInputSymbol+1, *fStatus);
|
||||
fDtran = new UVector32(lastInputSymbol+1, *fStatus);
|
||||
if (U_FAILURE(*fStatus)) {
|
||||
return;
|
||||
}
|
||||
|
@ -1264,7 +1385,7 @@ RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatu
|
|||
*fStatus = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
fDtran->setSize(lastInputSymbol+1, *fStatus); // fDtran needs to be pre-sized.
|
||||
fDtran->setSize(lastInputSymbol+1); // fDtran needs to be pre-sized.
|
||||
// It is indexed by input symbols, and will
|
||||
// hold the next state number for each
|
||||
// symbol.
|
||||
|
|
|
@ -24,6 +24,7 @@ U_NAMESPACE_BEGIN
|
|||
|
||||
class RBBIRuleScanner;
|
||||
class RBBIRuleBuilder;
|
||||
class UVector32;
|
||||
|
||||
//
|
||||
// class RBBITableBuilder is part of the RBBI rule compiler.
|
||||
|
@ -42,9 +43,24 @@ public:
|
|||
void build();
|
||||
int32_t getTableSize() const; // Return the runtime size in bytes of
|
||||
// the built state table
|
||||
void exportTable(void *where); // fill in the runtime state table.
|
||||
// Sufficient memory must exist at
|
||||
// the specified location.
|
||||
|
||||
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
|
||||
*/
|
||||
void exportTable(void *where);
|
||||
|
||||
/** Find duplicate (redundant) character classes, beginning at the specified
 *  base class, within this state table. This is an iterator-like function, used to
 *  identify char classes (state table columns) that can be eliminated.
 *  @param baseClass (in/out) class at which to start looking; returns the first
 *                   class of the duplicate pair found, if any.
 *  @param duplClass (out) returns the class that duplicates baseClass.
 *  @return true if a duplicate pair of classes was found.
 */
bool findDuplCharClassFrom(int32_t &baseClass, int32_t &duplClass);
|
||||
|
||||
/** Remove a column from the state table. Used when two character categories
|
||||
* have been found equivalent, and merged together, to eliminate the unneeded table column.
|
||||
*/
|
||||
void removeColumn(int32_t column);
|
||||
|
||||
/** Check for, and remove, duplicate states (table rows). */
|
||||
void removeDuplicateStates();
|
||||
|
||||
|
||||
private:
|
||||
|
@ -60,8 +76,29 @@ private:
|
|||
void flagTaggedStates();
|
||||
void mergeRuleStatusVals();
|
||||
|
||||
/**
|
||||
* Merge redundant state table columns, eliminating character classes with identical behavior.
|
||||
* Done after the state tables are generated, just before converting to their run-time format.
|
||||
*/
|
||||
int32_t mergeColumns();
|
||||
|
||||
void addRuleRootNodes(UVector *dest, RBBINode *node);
|
||||
|
||||
/** Find the next duplicate state. An iterator function.
|
||||
* @param firstState (in/out) begin looking at this state, return the first of the
|
||||
* pair of duplicates.
|
||||
* @param duplicateState returns the duplicate state of firstState
|
||||
* @return true if a duplicate pair of states was found.
|
||||
*/
|
||||
bool findDuplicateState(int32_t &firstState, int32_t &duplicateState);
|
||||
|
||||
/** Remove a duplicate state.
|
||||
* @param keepState First of the duplicate pair. Keep it.
|
||||
* @param duplState Duplicate state. Remove it. Redirect all references to the duplicate state
|
||||
* to refer to keepState instead.
|
||||
*/
|
||||
void removeState(int32_t keepState, int32_t duplState);
|
||||
|
||||
// Set functions for UVector.
|
||||
// TODO: make a USet subclass of UVector
|
||||
|
||||
|
@ -112,7 +149,7 @@ public:
|
|||
// with this state. Unordered (it's a set).
|
||||
// UVector contents are RBBINode *
|
||||
|
||||
UVector *fDtran; // Transitions out of this state.
|
||||
UVector32 *fDtran; // Transitions out of this state.
|
||||
// indexed by input character
|
||||
// contents is int index of dest state
|
||||
// in RBBITableBuilder.fDStates
|
||||
|
|
|
@ -60,10 +60,13 @@ private:
|
|||
UText fText;
|
||||
|
||||
/**
|
||||
* The rule data for this BreakIterator instance
|
||||
* The rule data for this BreakIterator instance.
|
||||
* Not for general use; Public only for testing purposes.
|
||||
* @internal
|
||||
*/
|
||||
public:
|
||||
RBBIDataWrapper *fData;
|
||||
private:
|
||||
|
||||
/**
|
||||
* The iteration state - current position, rule status for the current position,
|
||||
|
@ -683,6 +686,13 @@ private:
|
|||
* @internal
|
||||
*/
|
||||
void dumpCache();
|
||||
|
||||
/**
|
||||
* Debugging function only.
|
||||
* @internal
|
||||
*/
|
||||
void dumpTables();
|
||||
|
||||
#endif /* U_HIDE_INTERNAL_API */
|
||||
};
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/localpointer.h"
|
||||
|
@ -39,10 +40,12 @@
|
|||
#include "cstr.h"
|
||||
#include "intltest.h"
|
||||
#include "rbbitst.h"
|
||||
#include "rbbidata.h"
|
||||
#include "utypeinfo.h" // for 'typeid' to work
|
||||
#include "uvector.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
|
||||
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
#include "unicode/filteredbrk.h"
|
||||
#endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
|
||||
|
@ -106,6 +109,7 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
|
|||
TESTCASE_AUTO(TestEmoji);
|
||||
TESTCASE_AUTO(TestBug12519);
|
||||
TESTCASE_AUTO(TestBug12677);
|
||||
TESTCASE_AUTO(TestTableRedundancies);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -4454,6 +4458,67 @@ void RBBITest::TestBug12677() {
|
|||
assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "), rtRules);
|
||||
}
|
||||
|
||||
|
||||
void RBBITest::TestTableRedundancies() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
LocalPointer<RuleBasedBreakIterator> bi (
|
||||
(RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
|
||||
assertSuccess(WHERE, status);
|
||||
if (U_FAILURE(status)) return;
|
||||
|
||||
RBBIDataWrapper *dw = bi->fData;
|
||||
const RBBIStateTable *fwtbl = dw->fForwardTable;
|
||||
int32_t numCharClasses = dw->fHeader->fCatCount;
|
||||
// printf("Char Classes: %d states: %d\n", numCharClasses, fwtbl->fNumStates);
|
||||
|
||||
// Check for duplicate columns (character categories)
|
||||
|
||||
std::vector<UnicodeString> columns;
|
||||
for (int32_t column = 0; column < numCharClasses; column++) {
|
||||
UnicodeString s;
|
||||
for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
|
||||
s.append(row->fNextState[column]);
|
||||
}
|
||||
columns.push_back(s);
|
||||
}
|
||||
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
|
||||
for (int c1=1; c1<numCharClasses; c1++) {
|
||||
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
|
||||
if (columns.at(c1) == columns.at(c2)) {
|
||||
errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
out:
|
||||
|
||||
// Check for duplicate states
|
||||
std::vector<UnicodeString> rows;
|
||||
for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
|
||||
UnicodeString s;
|
||||
RBBIStateTableRow *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
|
||||
assertTrue(WHERE, row->fAccepting >= -1);
|
||||
s.append(row->fAccepting + 1); // values of -1 are expected.
|
||||
s.append(row->fLookAhead);
|
||||
s.append(row->fTagIdx);
|
||||
for (int32_t column = 0; column < numCharClasses; column++) {
|
||||
s.append(row->fNextState[column]);
|
||||
}
|
||||
rows.push_back(s);
|
||||
}
|
||||
for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
|
||||
for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
|
||||
if (rows.at(r1) == rows.at(r2)) {
|
||||
errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// TestDebug - A place-holder test for debugging purposes.
|
||||
// For putting in fragments of other tests that can be invoked
|
||||
|
|
|
@ -75,6 +75,7 @@ public:
|
|||
void TestEmoji();
|
||||
void TestBug12519();
|
||||
void TestBug12677();
|
||||
void TestTableRedundancies();
|
||||
|
||||
void TestDebug();
|
||||
void TestProperties();
|
||||
|
|
22
icu4c/source/test/testdata/rbbitst.txt
vendored
22
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -39,18 +39,16 @@
|
|||
|
||||
# Temp debugging tests
|
||||
<locale en>
|
||||
<word>
|
||||
<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
||||
コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
|
||||
よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
|
||||
何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
|
||||
んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
|
||||
すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
|
||||
が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
|
||||
の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
|
||||
。<0></data>
|
||||
|
||||
#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
||||
<rules>
|
||||
$s0=[;,*];
|
||||
$s1=[a-z];
|
||||
$s2=[i-n];
|
||||
$s3=[x-z];
|
||||
!!forward;
|
||||
($s0 | '?')*
|
||||
($s1 | $s2 | $s3)*;
|
||||
</rules>
|
||||
<data>•hello• •</data>
|
||||
|
||||
## FILTERED BREAK TESTS
|
||||
|
||||
|
|
|
@ -18,17 +18,19 @@ import com.ibm.icu.impl.ICUBinary.Authenticate;
|
|||
import com.ibm.icu.impl.Trie2;
|
||||
|
||||
/**
|
||||
* <p>Internal class used for Rule Based Break Iterators</p>
|
||||
* <p>Internal class used for Rule Based Break Iterators.</p>
|
||||
* <p>This class provides access to the compiled break rule data, as
|
||||
* it is stored in a .brk file.
|
||||
* Not intended for public use; declared public for testing purposes only.
|
||||
* @internal
|
||||
*/
|
||||
final class RBBIDataWrapper {
|
||||
public final class RBBIDataWrapper {
|
||||
//
|
||||
// These fields are the ready-to-use compiled rule data, as
|
||||
// read from the file.
|
||||
//
|
||||
RBBIDataHeader fHeader;
|
||||
short fFTable[];
|
||||
public RBBIDataHeader fHeader;
|
||||
public short fFTable[];
|
||||
short fRTable[];
|
||||
short fSFTable[];
|
||||
short fSRTable[];
|
||||
|
@ -78,11 +80,16 @@ final class RBBIDataWrapper {
|
|||
// Index offsets to the fields in a state table row.
|
||||
// Corresponds to struct RBBIStateTableRow in the C version.
|
||||
//
|
||||
final static int ACCEPTING = 0;
|
||||
final static int LOOKAHEAD = 1;
|
||||
final static int TAGIDX = 2;
|
||||
final static int RESERVED = 3;
|
||||
final static int NEXTSTATES = 4;
|
||||
/** @internal */
|
||||
public final static int ACCEPTING = 0;
|
||||
/** @internal */
|
||||
public final static int LOOKAHEAD = 1;
|
||||
/** @internal */
|
||||
public final static int TAGIDX = 2;
|
||||
/** @internal */
|
||||
public final static int RESERVED = 3;
|
||||
/** @internal */
|
||||
public final static int NEXTSTATES = 4;
|
||||
|
||||
// Index offsets to header fields of a state table
|
||||
// struct RBBIStateTable {... in the C version.
|
||||
|
@ -101,13 +108,15 @@ final class RBBIDataWrapper {
|
|||
|
||||
/**
|
||||
* Data Header. A struct-like class with the fields from the RBBI data file header.
|
||||
* Not intended for public use, declared public for testing purposes only.
|
||||
* @internal
|
||||
*/
|
||||
final static class RBBIDataHeader {
|
||||
public final static class RBBIDataHeader {
|
||||
int fMagic; // == 0xbla0
|
||||
byte[] fFormatVersion; // For ICU 3.4 and later.
|
||||
int fLength; // Total length in bytes of this RBBI Data,
|
||||
// including all sections, not just the header.
|
||||
int fCatCount; // Number of character categories.
|
||||
public int fCatCount; // Number of character categories.
|
||||
|
||||
//
|
||||
// Offsets and sizes of each of the subsections within the RBBI data.
|
||||
|
@ -139,9 +148,9 @@ final class RBBIDataWrapper {
|
|||
/**
|
||||
* RBBI State Table Indexing Function. Given a state number, return the
|
||||
* array index of the start of the state table row for that state.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
int getRowIndex(int state){
|
||||
public int getRowIndex(int state){
|
||||
return ROW_DATA + state * (fHeader.fCatCount + 4);
|
||||
}
|
||||
|
||||
|
@ -311,17 +320,17 @@ final class RBBIDataWrapper {
|
|||
return This;
|
||||
}
|
||||
|
||||
///CLOVER:OFF
|
||||
// Getters for fields from the state table header
|
||||
//
|
||||
private int getStateTableNumStates(short table[]) {
|
||||
/**
|
||||
* Getters for fields from the state table header
|
||||
* @internal
|
||||
*/
|
||||
public int getStateTableNumStates(short table[]) {
|
||||
if (isBigEndian) {
|
||||
return (table[NUMSTATES] << 16) | (table[NUMSTATES+1] & 0xffff);
|
||||
} else {
|
||||
return (table[NUMSTATES+1] << 16) | (table[NUMSTATES] & 0xffff);
|
||||
}
|
||||
}
|
||||
///CLOVER:ON
|
||||
|
||||
int getStateTableFlags(short table[]) {
|
||||
// This works for up to 15 flags bits.
|
||||
|
|
|
@ -342,10 +342,10 @@ class RBBIRuleBuilder {
|
|||
//
|
||||
// UnicodeSet processing.
|
||||
// Munge the Unicode Sets to create a set of character categories.
|
||||
// Generate the mapping tables (TRIE) from input 32-bit characters to
|
||||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
//
|
||||
builder.fSetBuilder.build();
|
||||
builder.fSetBuilder.buildRanges();
|
||||
|
||||
//
|
||||
// Generate the DFA state transition table.
|
||||
|
@ -363,10 +363,38 @@ class RBBIRuleBuilder {
|
|||
builder.fForwardTables.printRuleStatusTable();
|
||||
}
|
||||
|
||||
builder.optimizeTables();
|
||||
builder.fSetBuilder.buildTrie();
|
||||
//
|
||||
// Package up the compiled data, writing it to an output stream
|
||||
// in the serialization format. This is the same as the ICU4C runtime format.
|
||||
//
|
||||
builder.flattenData(os);
|
||||
}
|
||||
|
||||
static class IntPair {
|
||||
int first = 0;
|
||||
int second = 0;
|
||||
IntPair() {};
|
||||
IntPair(int f, int s) {
|
||||
first = f;
|
||||
second = s;
|
||||
}
|
||||
}
|
||||
|
||||
void optimizeTables() {
|
||||
IntPair duplPair = new IntPair(3, 0);
|
||||
while (fForwardTables.findDuplCharClassFrom(duplPair)) {
|
||||
fSetBuilder.mergeCategories(duplPair.first, duplPair.second);
|
||||
fForwardTables.removeColumn(duplPair.second);
|
||||
fReverseTables.removeColumn(duplPair.second);
|
||||
fSafeFwdTables.removeColumn(duplPair.second);
|
||||
fSafeRevTables.removeColumn(duplPair.second);
|
||||
}
|
||||
|
||||
fForwardTables.removeDuplicateStates();
|
||||
fReverseTables.removeDuplicateStates();
|
||||
fSafeFwdTables.removeDuplicateStates();
|
||||
fSafeRevTables.removeDuplicateStates();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -112,7 +112,7 @@ class RBBISetBuilder {
|
|||
}
|
||||
}
|
||||
if (setName.equals("dictionary")) {
|
||||
this.fNum |= 0x4000;
|
||||
this.fNum |= DICT_BIT;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -138,6 +138,8 @@ class RBBISetBuilder {
|
|||
|
||||
boolean fSawBOF;
|
||||
|
||||
static final int DICT_BIT = 0x4000;
|
||||
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -156,7 +158,7 @@ class RBBISetBuilder {
|
|||
// from the Unicode Sets.
|
||||
//
|
||||
//------------------------------------------------------------------------
|
||||
void build() {
|
||||
void buildRanges() {
|
||||
RangeDescriptor rlRange;
|
||||
|
||||
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();}
|
||||
|
@ -280,6 +282,15 @@ class RBBISetBuilder {
|
|||
|
||||
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
|
||||
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Build the Trie table for mapping UChar32 values to the corresponding
|
||||
* range group number.
|
||||
*/
|
||||
void buildTrie() {
|
||||
RangeDescriptor rlRange;
|
||||
|
||||
fTrie = new Trie2Writable(0, // Initial value for all code points.
|
||||
0); // Error value for out-of-range input.
|
||||
|
@ -294,6 +305,24 @@ class RBBISetBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge two character categories that have been identified as having equivalent behavior.
|
||||
* The ranges belonging to the right category (table column) will be added to the left.
|
||||
*/
|
||||
void mergeCategories(int left, int right) {
|
||||
assert(left >= 1);
|
||||
assert(right > left);
|
||||
for (RangeDescriptor rd = fRangeList; rd != null; rd = rd.fNext) {
|
||||
int rangeNum = rd.fNum & ~DICT_BIT;
|
||||
int rangeDict = rd.fNum & DICT_BIT;
|
||||
if (rangeNum == right) {
|
||||
rd.fNum = left | rangeDict;
|
||||
} else if (rangeNum > right) {
|
||||
rd.fNum--;
|
||||
}
|
||||
}
|
||||
--fGroupCount;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -457,7 +486,7 @@ class RBBISetBuilder {
|
|||
if (groupNum<10) {System.out.print(" ");}
|
||||
System.out.print(groupNum + " ");
|
||||
|
||||
if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");}
|
||||
if ((rlRange.fNum & DICT_BIT) != 0) { System.out.print(" <DICT> ");}
|
||||
|
||||
for (i=0; i<rlRange.fIncludesSets.size(); i++) {
|
||||
RBBINode usetNode = rlRange.fIncludesSets.get(i);
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
package com.ibm.icu.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -20,6 +21,7 @@ import java.util.TreeSet;
|
|||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
|
||||
|
||||
//
|
||||
// class RBBITableBuilder is part of the RBBI rule compiler.
|
||||
|
@ -655,7 +657,7 @@ class RBBITableBuilder {
|
|||
// if sd.fAccepting already had a value other than 0 or -1, leave it be.
|
||||
|
||||
// If the end marker node is from a look-ahead rule, set
|
||||
// the fLookAhead field or this state also.
|
||||
// the fLookAhead field for this state also.
|
||||
if (endMarker.fLookAheadEnd) {
|
||||
// TODO: don't change value if already set?
|
||||
// TODO: allow for more than one active look-ahead rule in engine.
|
||||
|
@ -832,6 +834,149 @@ class RBBITableBuilder {
|
|||
|
||||
|
||||
|
||||
/**
|
||||
* Find duplicate (redundant) character classes, beginning at the specified
|
||||
* pair, within this state table. This is an iterator-like function, used to
|
||||
* identify character classes (state table columns) that can be eliminated.
|
||||
* @param categories in/out parameter, specifies where to start looking for duplicates,
|
||||
* and returns the first pair of duplicates found, if any.
|
||||
* @return true if duplicate char classes were found, false otherwise.
|
||||
* @internal
|
||||
*/
|
||||
boolean findDuplCharClassFrom(RBBIRuleBuilder.IntPair categories) {
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
|
||||
int table_base = 0;
|
||||
int table_dupl = 0;
|
||||
for (; categories.first < numCols-1; ++categories.first) {
|
||||
for (categories.second=categories.first+1; categories.second < numCols; ++categories.second) {
|
||||
for (int state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
table_base = sd.fDtran[categories.first];
|
||||
table_dupl = sd.fDtran[categories.second];
|
||||
if (table_base != table_dupl) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (table_base == table_dupl) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a column from the state table. Used when two character categories
|
||||
* have been found equivalent, and merged together, to eliminate the unneeded table column.
|
||||
*/
|
||||
void removeColumn(int column) {
|
||||
int numStates = fDStates.size();
|
||||
for (int state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
assert(column < sd.fDtran.length);
|
||||
int[] newArray = Arrays.copyOf(sd.fDtran, sd.fDtran.length - 1);
|
||||
System.arraycopy(sd.fDtran, column+1, newArray, column, newArray.length - column);
|
||||
sd.fDtran = newArray;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Find duplicate (redundant) states, beginning at the specified pair,
|
||||
* within this state table. This is an iterator-like function, used to
|
||||
* identify states (state table rows) that can be eliminated.
|
||||
* @param states in/out parameter, specifies where to start looking for duplicates,
|
||||
* and returns the first pair of duplicates found, if any.
|
||||
* @return true if duplicate states were found, false otherwise.
|
||||
* @internal
|
||||
*/
|
||||
boolean findDuplicateState(RBBIRuleBuilder.IntPair states) {
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
|
||||
for (; states.first<numStates-1; ++states.first) {
|
||||
RBBIStateDescriptor firstSD = fDStates.get(states.first);
|
||||
for (states.second=states.first+1; states.second<numStates; ++states.second) {
|
||||
RBBIStateDescriptor duplSD = fDStates.get(states.second);
|
||||
if (firstSD.fAccepting != duplSD.fAccepting ||
|
||||
firstSD.fLookAhead != duplSD.fLookAhead ||
|
||||
firstSD.fTagsIdx != duplSD.fTagsIdx) {
|
||||
continue;
|
||||
}
|
||||
boolean rowsMatch = true;
|
||||
for (int col=0; col < numCols; ++col) {
|
||||
int firstVal = firstSD.fDtran[col];
|
||||
int duplVal = duplSD.fDtran[col];
|
||||
if (!((firstVal == duplVal) ||
|
||||
((firstVal == states.first || firstVal == states.second) &&
|
||||
(duplVal == states.first || duplVal == states.second)))) {
|
||||
rowsMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rowsMatch) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a duplicate state (row) from the state table. All references to the deleted state are
|
||||
* redirected to "keepState", the first encountered of the duplicated pair of states.
|
||||
* @param keepState The first of the duplicate pair of states, the one to be kept.
|
||||
* @param duplState The second of the duplicate pair, the one to be removed.
|
||||
* @internal
|
||||
*/
|
||||
void removeState(int keepState, int duplState) {
|
||||
assert(keepState < duplState);
|
||||
assert(duplState < fDStates.size());
|
||||
|
||||
fDStates.remove(duplState);
|
||||
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
for (int state=0; state<numStates; ++state) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
for (int col=0; col<numCols; col++) {
|
||||
int existingVal = sd.fDtran[col];
|
||||
int newVal = existingVal;
|
||||
if (existingVal == duplState) {
|
||||
newVal = keepState;
|
||||
} else if (existingVal > duplState) {
|
||||
newVal = existingVal - 1;
|
||||
}
|
||||
sd.fDtran[col] = newVal;
|
||||
}
|
||||
if (sd.fAccepting == duplState) {
|
||||
sd.fAccepting = keepState;
|
||||
} else if (sd.fAccepting > duplState) {
|
||||
sd.fAccepting--;
|
||||
}
|
||||
if (sd.fLookAhead == duplState) {
|
||||
sd.fLookAhead = keepState;
|
||||
} else if (sd.fLookAhead > duplState) {
|
||||
sd.fLookAhead--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Check for, and remove duplicate states (table rows).
|
||||
* @internal
|
||||
*/
|
||||
void removeDuplicateStates() {
|
||||
IntPair dupls = new IntPair(3, 0);
|
||||
while (findDuplicateState(dupls)) {
|
||||
// System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
|
||||
removeState(dupls.first, dupls.second);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
|
|
@ -222,9 +222,11 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
private CharacterIterator fText = new java.text.StringCharacterIterator("");
|
||||
|
||||
/**
|
||||
* The rule data for this BreakIterator instance. Package private.
|
||||
* The rule data for this BreakIterator instance.
|
||||
* Not intended for public use. Declared public for testing purposes only.
|
||||
* @internal
|
||||
*/
|
||||
RBBIDataWrapper fRData;
|
||||
public RBBIDataWrapper fRData;
|
||||
|
||||
/**
|
||||
* The iteration state - current position, rule status for the current position,
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5dc7e055663128ae9a1dae471f9a7cdaf5c28e8177b2231e8b0341c3dc8bb842
|
||||
size 12475727
|
||||
oid sha256:f78382b447bb13c03234b53e18b013cea1d2ff6a0f71679885ee00d787003822
|
||||
size 12475101
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:00c43dc113177aa5a0ffda563ac2822f3dc92fdb7ae748a5938529e32a0b67d9
|
||||
oid sha256:413dd55333970ffe2a54ac323cb9b5e2a7709a06d132e6e60eb41282489e0970
|
||||
size 92739
|
||||
|
|
|
@ -19,6 +19,7 @@ package com.ibm.icu.dev.test.rbbi;
|
|||
import java.text.CharacterIterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
|
@ -26,6 +27,7 @@ import org.junit.runners.JUnit4;
|
|||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RBBIDataWrapper;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
|
@ -562,4 +564,62 @@ public class RBBITest extends TestFmwk {
|
|||
String rtRules = bi.toString(); // getRules() in C++
|
||||
assertEquals("Break Iterator rule stripping test", "!!forward; $x = [ab#]; '#' '?'; ", rtRules);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestTableRedundancies() {
|
||||
RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance(Locale.ENGLISH);
|
||||
String rules = bi.toString();
|
||||
bi = new RuleBasedBreakIterator(rules);
|
||||
// Build a break iterator from source rules.
|
||||
// Want to check the rule builder in Java, not the pre-built rules that are imported from ICU4C.
|
||||
RBBIDataWrapper dw = bi.fRData;
|
||||
short[] fwtbl = dw.fFTable;
|
||||
int numCharClasses = dw.fHeader.fCatCount;
|
||||
|
||||
// Check for duplicate columns (character categories)
|
||||
List<String> columns = new ArrayList<String>();
|
||||
for (int column=0; column<numCharClasses; column++) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
for (int r = 1; r < dw.getStateTableNumStates(fwtbl); r++) {
|
||||
int row = dw.getRowIndex(r);
|
||||
short tableVal = fwtbl[row + RBBIDataWrapper.NEXTSTATES + column];
|
||||
s.append((char)tableVal);
|
||||
}
|
||||
columns.add(s.toString());
|
||||
}
|
||||
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
|
||||
for (int c1=1; c1<numCharClasses; c1++) {
|
||||
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
|
||||
assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
|
||||
// if (columns.get(c1).equals(columns.get(c2))) {
|
||||
// System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
// Check for duplicate states.
|
||||
List<String> rows = new ArrayList<String>();
|
||||
for (int r=0; r<dw.getStateTableNumStates(fwtbl); r++) {
|
||||
StringBuilder s = new StringBuilder();
|
||||
int row = dw.getRowIndex(r);
|
||||
assertTrue("Accepting < -1", fwtbl[row + RBBIDataWrapper.ACCEPTING] >= -1);
|
||||
s.append(fwtbl[row + RBBIDataWrapper.ACCEPTING]);
|
||||
s.append(fwtbl[row + RBBIDataWrapper.LOOKAHEAD]);
|
||||
s.append(fwtbl[row + RBBIDataWrapper.TAGIDX]);
|
||||
for (int column=0; column<numCharClasses; column++) {
|
||||
short tableVal = fwtbl[row + RBBIDataWrapper.NEXTSTATES + column];
|
||||
s.append((char)tableVal);
|
||||
}
|
||||
rows.add(s.toString());
|
||||
}
|
||||
|
||||
for (int r1=0; r1 < dw.getStateTableNumStates(fwtbl); r1++) {
|
||||
for (int r2= r1+1; r2 < dw.getStateTableNumStates(fwtbl); r2++) {
|
||||
assertFalse(String.format("Duplicate states (%d, %d)", r1, r2), rows.get(r1).equals(rows.get(r2)));
|
||||
// if (rows.get(r1).equals(rows.get(r2))) {
|
||||
// System.out.printf("Duplicate states (%d, %d)\n", r1, r2);
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue