mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-13569 RBBI state table minimization, Java now works.
X-SVN-Rev: 40916
This commit is contained in:
parent
ff3ebb8c32
commit
fd77c49a2b
4 changed files with 163 additions and 132 deletions
|
@ -372,25 +372,29 @@ class RBBIRuleBuilder {
|
|||
builder.flattenData(os);
|
||||
}
|
||||
|
||||
static class ClassPair {
|
||||
int left = 3;
|
||||
int right = 0;
|
||||
static class IntPair {
|
||||
int first = 0;
|
||||
int second = 0;
|
||||
IntPair() {};
|
||||
IntPair(int f, int s) {
|
||||
first = f;
|
||||
second = s;
|
||||
}
|
||||
}
|
||||
|
||||
void optimizeTables() {
|
||||
ClassPair duplPair = new ClassPair();
|
||||
|
||||
IntPair duplPair = new IntPair(3, 0);
|
||||
while (fForwardTables.findDuplCharClassFrom(duplPair)) {
|
||||
fSetBuilder.mergeCategories(duplPair);
|
||||
fForwardTables.removeColumn(duplPair.right);
|
||||
fReverseTables.removeColumn(duplPair.right);
|
||||
fSafeFwdTables.removeColumn(duplPair.right);
|
||||
fSafeRevTables.removeColumn(duplPair.right);
|
||||
fSetBuilder.mergeCategories(duplPair.first, duplPair.second);
|
||||
fForwardTables.removeColumn(duplPair.second);
|
||||
fReverseTables.removeColumn(duplPair.second);
|
||||
fSafeFwdTables.removeColumn(duplPair.second);
|
||||
fSafeRevTables.removeColumn(duplPair.second);
|
||||
}
|
||||
|
||||
fForwardTables.removeDuplicateStates();
|
||||
fReverseTables.removeDuplicateStates();
|
||||
fSafeFwdTables.removeDuplicateStates();
|
||||
fSafeRevTables.removeDuplicateStates();
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -305,6 +305,10 @@ class RBBISetBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge two character categories that have been identified as having equivalent behavior.
|
||||
* The ranges belonging to the right category (table column) will be added to the left.
|
||||
*/
|
||||
void mergeCategories(int left, int right) {
|
||||
assert(left >= 1);
|
||||
assert(right > left);
|
||||
|
@ -319,6 +323,7 @@ class RBBISetBuilder {
|
|||
}
|
||||
--fGroupCount;
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------------------
|
||||
//
|
||||
// getTrieSize() Return the size that will be required to serialize the Trie.
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
package com.ibm.icu.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
|
@ -20,6 +21,7 @@ import java.util.TreeSet;
|
|||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
|
||||
|
||||
//
|
||||
// class RBBITableBuilder is part of the RBBI rule compiler.
|
||||
|
@ -832,128 +834,148 @@ class RBBITableBuilder {
|
|||
|
||||
|
||||
|
||||
//
|
||||
// findDuplCharClassFrom()
|
||||
//
|
||||
boolean findDuplCharClassFrom(RBBIRuleBuilder.ClassPair classPair) {
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
/**
|
||||
* Find duplicate (redundant) character classes, beginning at the specified
|
||||
* pair, within this state table. This is an iterator-like function, used to
|
||||
* identify character classes (state table columns) that can be eliminated.
|
||||
* @param categories in/out parameter, specifies where to start looking for duplicates,
|
||||
* and returns the first pair of duplicates found, if any.
|
||||
* @return true if duplicate char classes were found, false otherwise.
|
||||
* @internal
|
||||
*/
|
||||
boolean findDuplCharClassFrom(RBBIRuleBuilder.IntPair categories) {
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
|
||||
uint16_t table_base;
|
||||
uint16_t table_dupl;
|
||||
for (; baseCategory < numCols-1; ++baseCategory) {
|
||||
for (duplCategory=baseCategory+1; duplCategory < numCols; ++duplCategory) {
|
||||
for (int state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates.elementAt(state);
|
||||
table_base = (uint16_t)sd.fDtran.elementAti(baseCategory);
|
||||
table_dupl = (uint16_t)sd.fDtran.elementAti(duplCategory);
|
||||
if (table_base != table_dupl) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (table_base == table_dupl) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
int table_base = 0;
|
||||
int table_dupl = 0;
|
||||
for (; categories.first < numCols-1; ++categories.first) {
|
||||
for (categories.second=categories.first+1; categories.second < numCols; ++categories.second) {
|
||||
for (int state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
table_base = sd.fDtran[categories.first];
|
||||
table_dupl = sd.fDtran[categories.second];
|
||||
if (table_base != table_dupl) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (table_base == table_dupl) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a column from the state table. Used when two character categories
|
||||
* have been found equivalent, and merged together, to eliminate the unneeded table column.
|
||||
*/
|
||||
void removeColumn(int column) {
|
||||
int numStates = fDStates.size();
|
||||
for (int state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
assert(column < sd.fDtran.length);
|
||||
int[] newArray = Arrays.copyOf(sd.fDtran, sd.fDtran.length - 1);
|
||||
System.arraycopy(sd.fDtran, column+1, newArray, column, newArray.length - column);
|
||||
sd.fDtran = newArray;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// removeColumn()
|
||||
//
|
||||
void removeColumn(int column) {
|
||||
int numStates = fDStates.size();
|
||||
for (int state=0; state<numStates; state++) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates.elementAt(state);
|
||||
U_ASSERT(column < sd.fDtran.size());
|
||||
sd.fDtran.removeElementAt(column);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Find duplicate (redundant) states, beginning at the specified pair,
|
||||
* within this state table. This is an iterator-like function, used to
|
||||
* identify states (state table rows) that can be eliminated.
|
||||
* @param states in/out parameter, specifies where to start looking for duplicates,
|
||||
* and returns the first pair of duplicates found, if any.
|
||||
* @return true if duplicate states were found, false otherwise.
|
||||
* @internal
|
||||
*/
|
||||
boolean findDuplicateState(RBBIRuleBuilder.IntPair states) {
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
|
||||
/*
|
||||
* findDuplicateState
|
||||
*/
|
||||
bool findDuplicateState(int &firstState, int &duplState) {
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
for (; states.first<numStates-1; ++states.first) {
|
||||
RBBIStateDescriptor firstSD = fDStates.get(states.first);
|
||||
for (states.second=states.first+1; states.second<numStates; ++states.second) {
|
||||
RBBIStateDescriptor duplSD = fDStates.get(states.second);
|
||||
if (firstSD.fAccepting != duplSD.fAccepting ||
|
||||
firstSD.fLookAhead != duplSD.fLookAhead ||
|
||||
firstSD.fTagsIdx != duplSD.fTagsIdx) {
|
||||
continue;
|
||||
}
|
||||
boolean rowsMatch = true;
|
||||
for (int col=0; col < numCols; ++col) {
|
||||
int firstVal = firstSD.fDtran[col];
|
||||
int duplVal = duplSD.fDtran[col];
|
||||
if (!((firstVal == duplVal) ||
|
||||
((firstVal == states.first || firstVal == states.second) &&
|
||||
(duplVal == states.first || duplVal == states.second)))) {
|
||||
rowsMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rowsMatch) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
for (; firstState<numStates-1; ++firstState) {
|
||||
RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates.elementAt(firstState);
|
||||
for (duplState=firstState+1; duplState<numStates; ++duplState) {
|
||||
RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates.elementAt(duplState);
|
||||
if (firstSD.fAccepting != duplSD.fAccepting ||
|
||||
firstSD.fLookAhead != duplSD.fLookAhead ||
|
||||
firstSD.fTagsIdx != duplSD.fTagsIdx) {
|
||||
continue;
|
||||
}
|
||||
bool rowsMatch = true;
|
||||
for (int col=0; col < numCols; ++col) {
|
||||
int firstVal = firstSD.fDtran.elementAti(col);
|
||||
int duplVal = duplSD.fDtran.elementAti(col);
|
||||
if (!((firstVal == duplVal) ||
|
||||
((firstVal == firstState || firstVal == duplState) &&
|
||||
(duplVal == firstState || duplVal == duplState)))) {
|
||||
rowsMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rowsMatch) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
/**
|
||||
* Remove a duplicate state (row) from the state table. All references to the deleted state are
|
||||
* redirected to "keepState", the first encountered of the duplicated pair of states.
|
||||
* @param keepState The first of the duplicate pair of states, the one to be kept.
|
||||
* @param duplState The second of the duplicate pair, the one to be removed.
|
||||
* @internal
|
||||
*/
|
||||
void removeState(int keepState, int duplState) {
|
||||
assert(keepState < duplState);
|
||||
assert(duplState < fDStates.size());
|
||||
|
||||
void removeState(int keepState, int duplState) {
|
||||
U_ASSERT(keepState < duplState);
|
||||
U_ASSERT(duplState < fDStates.size());
|
||||
fDStates.remove(duplState);
|
||||
|
||||
RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates.elementAt(duplState);
|
||||
fDStates.removeElementAt(duplState);
|
||||
delete duplSD;
|
||||
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
for (int state=0; state<numStates; ++state) {
|
||||
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates.elementAt(state);
|
||||
for (int col=0; col<numCols; col++) {
|
||||
int existingVal = sd.fDtran.elementAti(col);
|
||||
int newVal = existingVal;
|
||||
if (existingVal == duplState) {
|
||||
newVal = keepState;
|
||||
} else if (existingVal > duplState) {
|
||||
newVal = existingVal - 1;
|
||||
}
|
||||
sd.fDtran.setElementAt(newVal, col);
|
||||
}
|
||||
if (sd.fAccepting == duplState) {
|
||||
sd.fAccepting = keepState;
|
||||
} else if (sd.fAccepting > duplState) {
|
||||
sd.fAccepting--;
|
||||
}
|
||||
if (sd.fLookAhead == duplState) {
|
||||
sd.fLookAhead = keepState;
|
||||
} else if (sd.fLookAhead > duplState) {
|
||||
sd.fLookAhead--;
|
||||
}
|
||||
}
|
||||
}
|
||||
int numStates = fDStates.size();
|
||||
int numCols = fRB.fSetBuilder.getNumCharCategories();
|
||||
for (int state=0; state<numStates; ++state) {
|
||||
RBBIStateDescriptor sd = fDStates.get(state);
|
||||
for (int col=0; col<numCols; col++) {
|
||||
int existingVal = sd.fDtran[col];
|
||||
int newVal = existingVal;
|
||||
if (existingVal == duplState) {
|
||||
newVal = keepState;
|
||||
} else if (existingVal > duplState) {
|
||||
newVal = existingVal - 1;
|
||||
}
|
||||
sd.fDtran[col] = newVal;
|
||||
}
|
||||
if (sd.fAccepting == duplState) {
|
||||
sd.fAccepting = keepState;
|
||||
} else if (sd.fAccepting > duplState) {
|
||||
sd.fAccepting--;
|
||||
}
|
||||
if (sd.fLookAhead == duplState) {
|
||||
sd.fLookAhead = keepState;
|
||||
} else if (sd.fLookAhead > duplState) {
|
||||
sd.fLookAhead--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* RemoveDuplicateStates
|
||||
*/
|
||||
void removeDuplicateStates() {
|
||||
int firstState = 3;
|
||||
int duplicateState = 0;
|
||||
while (findDuplicateState(firstState, duplicateState)) {
|
||||
// printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState);
|
||||
removeState(firstState, duplicateState);
|
||||
}
|
||||
}
|
||||
/**
|
||||
* Check for, and remove duplicate states (table rows).
|
||||
* @internal
|
||||
*/
|
||||
void removeDuplicateStates() {
|
||||
IntPair dupls = new IntPair(3, 0);
|
||||
while (findDuplicateState(dupls)) {
|
||||
// System.out.printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
|
||||
removeState(dupls.first, dupls.second);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
|
|
@ -590,10 +590,10 @@ public class RBBITest extends TestFmwk {
|
|||
// Ignore column (char class) 0 while checking; it's special, and may have duplicates.
|
||||
for (int c1=1; c1<numCharClasses; c1++) {
|
||||
for (int c2 = c1+1; c2 < numCharClasses; c2++) {
|
||||
// assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
|
||||
if (columns.get(c1).equals(columns.get(c2))) {
|
||||
System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
|
||||
}
|
||||
assertFalse(String.format("Duplicate columns (%d, %d)", c1, c2), columns.get(c1).equals(columns.get(c2)));
|
||||
// if (columns.get(c1).equals(columns.get(c2))) {
|
||||
// System.out.printf("Duplicate columns (%d, %d)\n", c1, c2);
|
||||
// }
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -615,10 +615,10 @@ public class RBBITest extends TestFmwk {
|
|||
|
||||
for (int r1=0; r1 < dw.getStateTableNumStates(fwtbl); r1++) {
|
||||
for (int r2= r1+1; r2 < dw.getStateTableNumStates(fwtbl); r2++) {
|
||||
// assertFalse(String.format("Duplicate states (%d, %d)", r1, r2), rows.get(r1).equals(rows.get(r2)));
|
||||
if (rows.get(r1).equals(rows.get(r2))) {
|
||||
System.out.printf("Duplicate states (%d, %d)\n", r1, r2);
|
||||
}
|
||||
assertFalse(String.format("Duplicate states (%d, %d)", r1, r2), rows.get(r1).equals(rows.get(r2)));
|
||||
// if (rows.get(r1).equals(rows.get(r2))) {
|
||||
// System.out.printf("Duplicate states (%d, %d)\n", r1, r2);
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue