mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-13194 RBBI safe tables Java port, work in progress.
X-SVN-Rev: 41172
This commit is contained in:
parent
6cdf52d3da
commit
3180a6400e
3 changed files with 264 additions and 36 deletions
|
@ -35,13 +35,16 @@ class RBBIRuleBuilder {
|
|||
//
|
||||
// There are four separate parse trees generated, one for each of the
|
||||
// forward rules, reverse rules, safe forward rules and safe reverse rules.
|
||||
// This array references the root of each of the trees.
|
||||
// This array references the root of each of the trees.
|
||||
// Only fForwardTree data is actually used to generate a state table.
|
||||
// The other three are retained for back compatibility with old rule files,
|
||||
// which may have safe and reverse rules. These are still parsed.
|
||||
//
|
||||
RBBINode[] fTreeRoots = new RBBINode[4];
|
||||
static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
|
||||
static final int fReverseTree = 1; // for each of the trees.
|
||||
// // (in C, these are pointer variables and
|
||||
// // there is no array.)
|
||||
static final int fSafeFwdTree = 3; // (in C, these are pointer variables and
|
||||
static final int fSafeRevTree = 4; // there is no array.)
|
||||
int fDefaultTree = fForwardTree; // For rules not qualified with a !
|
||||
// the tree to which they belong.
|
||||
|
||||
|
|
|
@ -292,7 +292,7 @@ class RBBIRuleScanner {
|
|||
// OR this rule into the appropriate group of them.
|
||||
//
|
||||
|
||||
int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
|
||||
int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree);
|
||||
|
||||
if (fRB.fTreeRoots[destRules] != null) {
|
||||
// This is not the first rule encountered.
|
||||
|
@ -972,18 +972,6 @@ class RBBIRuleScanner {
|
|||
error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
|
||||
}
|
||||
|
||||
//
|
||||
// If there were NO user specified reverse rules, set up the equivalent of ".*;"
|
||||
//
|
||||
if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
|
||||
fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
|
||||
RBBINode operand = pushNewNode(RBBINode.setRef);
|
||||
findSetFor(kAny, operand, null);
|
||||
fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
|
||||
operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
|
||||
fNodeStackPtr -= 2;
|
||||
}
|
||||
|
||||
//
|
||||
// Parsing of the input RBBI rules is complete.
|
||||
// We now have a parse tree for the rule expressions
|
||||
|
|
|
@ -23,18 +23,16 @@ import com.ibm.icu.lang.UCharacter;
|
|||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
|
||||
|
||||
//
|
||||
// class RBBITableBuilder is part of the RBBI rule compiler.
|
||||
// It builds the state transition table used by the RBBI runtime
|
||||
// from the expression syntax tree generated by the rule scanner.
|
||||
//
|
||||
// This class is part of the RBBI implementation only.
|
||||
// There is no user-visible public API here.
|
||||
//
|
||||
/**
|
||||
* This class is part of the RBBI rule compiler.
|
||||
* It builds the state transition table used by the RBBI runtime
|
||||
* from the expression syntax tree generated by the rule scanner.
|
||||
*
|
||||
* This class is part of the RBBI implementation only.
|
||||
* There is no user-visible public API here.
|
||||
*/
|
||||
class RBBITableBuilder {
|
||||
|
||||
|
||||
|
||||
//
|
||||
// RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
|
||||
// one for each state.
|
||||
|
@ -65,13 +63,15 @@ class RBBITableBuilder {
|
|||
|
||||
|
||||
private RBBIRuleBuilder fRB;
|
||||
private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
|
||||
// for the parse tree to operate on.
|
||||
// Too bad Java can't do indirection more easily!
|
||||
|
||||
private List<RBBIStateDescriptor> fDStates; // D states (Aho's terminology)
|
||||
// Index is state number
|
||||
// Contents are RBBIStateDescriptor pointers.
|
||||
/** The array index into RBBIRuleBuilder.fTreeRoots for the parse tree to operate on. */
|
||||
private int fRootIx;
|
||||
|
||||
/** D states (Aho's terminology). Index is state number. */
|
||||
private List<RBBIStateDescriptor> fDStates;
|
||||
|
||||
/** Synthesized safe table, a List of row arrays. */
|
||||
private List<short[]> fSafeTable;
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
|
@ -91,8 +91,8 @@ class RBBITableBuilder {
|
|||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// RBBITableBuilder::build - This is the main function for building the DFA state transtion
|
||||
// table from the RBBI rules parse tree.
|
||||
// RBBITableBuilder::buildForwardTable - This is the main function for building
|
||||
// the DFA state transition table from the RBBI rules parse tree.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void buildForwardTable() {
|
||||
|
@ -195,8 +195,6 @@ class RBBITableBuilder {
|
|||
// for all tables. Merge the ones from this table into the global set.
|
||||
//
|
||||
mergeRuleStatusVals();
|
||||
|
||||
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
|
||||
}
|
||||
|
||||
|
||||
|
@ -924,6 +922,40 @@ class RBBITableBuilder {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the next duplicate state in the safe reverse table. An iterator function.
|
||||
* @param states in/out parameter, specifies where to start looking for duplicates,
|
||||
* and returns the first pair of duplicates found, if any.
|
||||
* @return true if duplicate states were found, false otherwise.
|
||||
* @internal
|
||||
*/
|
||||
boolean findDuplicateSafeState(RBBIRuleBuilder.IntPair states) {
|
||||
int numStates = fSafeTable.size();
|
||||
|
||||
for (; states.first<numStates-1; ++states.first) {
|
||||
short[] firstRow = fSafeTable.get(states.first);
|
||||
for (states.second=states.first+1; states.second<numStates; ++states.second) {
|
||||
short[] duplRow = fSafeTable.get(states.second);
|
||||
boolean rowsMatch = true;
|
||||
int numCols = firstRow.length;
|
||||
for (int col=0; col < numCols; ++col) {
|
||||
int firstVal = firstRow[col];
|
||||
int duplVal = duplRow[col];
|
||||
if (!((firstVal == duplVal) ||
|
||||
((firstVal == states.first || firstVal == states.second) &&
|
||||
(duplVal == states.first || duplVal == states.second)))) {
|
||||
rowsMatch = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (rowsMatch) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a duplicate state (row) from the state table. All references to the deleted state are
|
||||
* redirected to "keepState", the first encountered of the duplicated pair of states.
|
||||
|
@ -964,6 +996,33 @@ class RBBITableBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a duplicate state from the safe table.
|
||||
* @param keepState The first of the duplicate pair of states, the one to be kept.
|
||||
* @param duplState The second of the duplicate pair, the one to be removed.
|
||||
* @internal
|
||||
*/
|
||||
void removeSafeState(int keepState, int duplState) {
|
||||
assert(keepState < duplState);
|
||||
assert(duplState < fDStates.size());
|
||||
|
||||
fSafeTable.remove(duplState);
|
||||
int numStates = fSafeTable.size();
|
||||
for (int state=0; state<numStates; ++state) {
|
||||
short[] row = fSafeTable.get(state);
|
||||
for (int col=0; col<row.length; col++) {
|
||||
int existingVal = row[col];
|
||||
int newVal = existingVal;
|
||||
if (existingVal == duplState) {
|
||||
newVal = keepState;
|
||||
} else if (existingVal > duplState) {
|
||||
newVal = existingVal - 1;
|
||||
}
|
||||
row[col] = (short)newVal;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Check for, and remove duplicate states (table rows).
|
||||
|
@ -1047,6 +1106,146 @@ class RBBITableBuilder {
|
|||
return table;
|
||||
}
|
||||
|
||||
/**
|
||||
* Synthesize a safe state table from the main state table.
|
||||
*/
|
||||
void buildSafeReverseTable() {
|
||||
// Find safe char class pairs.
|
||||
|
||||
// make a state table row for each trailing class, and map from class to row.
|
||||
|
||||
// For each pair
|
||||
// startRow[p1] = p2
|
||||
// p2row[p2] = stopRow
|
||||
// For each unfilled in cell
|
||||
// set to row corresponding to its column.
|
||||
|
||||
// Each safe pair is stored as two chars in the safePair stringBuilder.
|
||||
StringBuilder safePairs = new StringBuilder();
|
||||
|
||||
int numCharClasses = fRB.fSetBuilder.getNumCharCategories();
|
||||
int numStates = fDStates.size();
|
||||
|
||||
for (int c1=0; c1<numCharClasses; ++c1) {
|
||||
for (int c2=0; c2 < numCharClasses; ++c2) {
|
||||
int wantedEndState = -1;
|
||||
int endState = 0;
|
||||
for (int startState = 1; startState < numStates; ++startState) {
|
||||
RBBIStateDescriptor startStateD = fDStates.get(startState);
|
||||
int s2 = startStateD.fDtran[c1];
|
||||
RBBIStateDescriptor s2StateD = fDStates.get(s2);
|
||||
endState = s2StateD.fDtran[c2];
|
||||
if (wantedEndState < 0) {
|
||||
wantedEndState = endState;
|
||||
} else {
|
||||
if (wantedEndState != endState) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (wantedEndState == endState) {
|
||||
safePairs.append((char)c1);
|
||||
safePairs.append((char)c2);
|
||||
// System.out.printf("(%d, %d) ", c1, c2);
|
||||
}
|
||||
}
|
||||
// System.out.printf("\n");
|
||||
}
|
||||
|
||||
// Populate the initial safe table.
|
||||
// The table as a whole is a List<short[]>
|
||||
// Row 0 is the stop state.
|
||||
// Row 1 is the start sate.
|
||||
// Row 2 and beyond are other states, initially one per char class, but
|
||||
// after initial construction, many of the states will be combined, compacting the table.)
|
||||
// The String holds the nextState data only. The four leading fields of a row, fAccepting,
|
||||
// fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
|
||||
|
||||
assert(fSafeTable == null);
|
||||
fSafeTable = new ArrayList<short[]>();
|
||||
for (int row=0; row<numCharClasses + 2; ++row) {
|
||||
fSafeTable.add(new short[numCharClasses]);
|
||||
}
|
||||
|
||||
// From the start state, each input char class transitions to the state for that input.
|
||||
short[] startState = fSafeTable.get(1);
|
||||
for (int charClass=0; charClass < numCharClasses; ++charClass) {
|
||||
// Note: +2 to skip the start & stop state rows.
|
||||
startState[charClass] = (short)(charClass+2);
|
||||
}
|
||||
|
||||
// Initially make every other state table row look like the start state row
|
||||
// (except for the stop state, which remains all 0)
|
||||
for (int row=2; row<numCharClasses+2; ++row) {
|
||||
System.arraycopy(startState, 0, fSafeTable.get(row), 0, startState.length);
|
||||
}
|
||||
|
||||
// Run through the safe pairs, set the next state to zero when pair has been seen.
|
||||
// Zero being the stop state, meaning we found a safe point.
|
||||
for (int pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
|
||||
int c1 = safePairs.charAt(pairIdx);
|
||||
int c2 = safePairs.charAt(pairIdx + 1);
|
||||
|
||||
short[] rowState = fSafeTable.get(c2 + 2);
|
||||
rowState[c1] = 0;
|
||||
}
|
||||
|
||||
// Remove duplicate or redundant rows from the table.
|
||||
RBBIRuleBuilder.IntPair states = new RBBIRuleBuilder.IntPair(1, 0);
|
||||
while (findDuplicateSafeState(states)) {
|
||||
// System.out.printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
|
||||
removeSafeState(states.first, states.second);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate the size of the runtime form of this safe state table.
|
||||
*/
|
||||
int getSafeTableSize() {
|
||||
if (fSafeTable == null) {
|
||||
return 0;
|
||||
}
|
||||
int size = 16; // The header of 4 ints, with no rows to the table.
|
||||
int numRows = fSafeTable.size();
|
||||
int numCols = fSafeTable.get(0).length;
|
||||
int rowSize = 8 + 2*numCols;
|
||||
size += numRows * rowSize;
|
||||
// TODO: there are redundant round-up. Figure out best place, get rid of the rest.
|
||||
size = (size + 7) & ~7; // round up to a multiple of 8 bytes
|
||||
return size;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Create a RBBIDataWrapper.RBBIStateTable for the safe reverse table.
|
||||
* RBBIDataWrapper.RBBIStateTable is similar to struct RBBIStateTable in ICU4C,
|
||||
* in common/rbbidata.h
|
||||
*/
|
||||
RBBIDataWrapper.RBBIStateTable exportSafeTable() {
|
||||
RBBIDataWrapper.RBBIStateTable table = new RBBIDataWrapper.RBBIStateTable();
|
||||
table.fNumStates = fSafeTable.size();
|
||||
int numCharCategories = fSafeTable.get(0).length;
|
||||
|
||||
// Size of table size in shorts.
|
||||
// the "4" is the size of struct RBBIStateTableRow, the row header part only.
|
||||
int rowLen = 4 + numCharCategories;
|
||||
// TODO: tableSize is basically numStates * numCharCategories,
|
||||
// except for alignment padding. Clean up here, and in main exportTable().
|
||||
int tableSize = (getSafeTableSize() - 16) / 2; // fTable length in shorts.
|
||||
table.fTable = new short[tableSize];
|
||||
table.fRowLen = rowLen * 2; // Row length in bytes.
|
||||
|
||||
for (int state=0; state<table.fNumStates; state++) {
|
||||
short[] rowArray = fSafeTable.get(state);
|
||||
int row = state * rowLen;
|
||||
|
||||
for (int col=0; col<numCharCategories; col++) {
|
||||
table.fTable[row + RBBIDataWrapper.NEXTSTATES + col] = rowArray[col];
|
||||
}
|
||||
}
|
||||
return table;
|
||||
}
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
@ -1104,6 +1303,44 @@ class RBBITableBuilder {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* Debug Function. Dump the fully constructed safe reverse table.
|
||||
*/
|
||||
void printReverseTable() {
|
||||
int c; // input "character"
|
||||
|
||||
System.out.printf(" Safe Reverse Table \n");
|
||||
if (fSafeTable == null) {
|
||||
System.out.printf(" --- nullptr ---\n");
|
||||
return;
|
||||
}
|
||||
int numCharCategories = fSafeTable.get(0).length;
|
||||
System.out.printf("state | i n p u t s y m b o l s \n");
|
||||
System.out.printf(" | Acc LA Tag");
|
||||
for (c=0; c< numCharCategories; c++) {
|
||||
System.out.printf(" %2d", c);
|
||||
}
|
||||
System.out.printf("\n");
|
||||
System.out.printf(" |---------------");
|
||||
for (c=0; c<numCharCategories; c++) {
|
||||
System.out.printf("---");
|
||||
}
|
||||
System.out.printf("\n");
|
||||
|
||||
for (int n=0; n<fSafeTable.size(); n++) {
|
||||
short rowArray[] = fSafeTable.get(n);
|
||||
System.out.printf(" %3d | " , n);
|
||||
System.out.printf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
|
||||
for (c=0; c<numCharCategories; c++) {
|
||||
System.out.printf(" %2d", rowArray[c]);
|
||||
}
|
||||
System.out.printf("\n");
|
||||
}
|
||||
System.out.printf("\n\n");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
|
|
Loading…
Add table
Reference in a new issue