ICU-13194 RBBI safe tables Java port, work in progress.

X-SVN-Rev: 41172
This commit is contained in:
Andy Heninger 2018-03-30 01:12:50 +00:00
parent 6cdf52d3da
commit 3180a6400e
3 changed files with 264 additions and 36 deletions

View file

@ -35,13 +35,16 @@ class RBBIRuleBuilder {
//
// There are four separate parse trees generated, one for each of the
// forward rules, reverse rules, safe forward rules and safe reverse rules.
// This array references the root of each of the trees.
// This array references the root of each of the trees.
// Only fForwardTree data is actually used to generate a state table.
// The other three are retained for back compatibility with old rule files,
// which may have safe and reverse rules. These are still parsed.
//
RBBINode[] fTreeRoots = new RBBINode[4];
// Indexes into the fTreeRoots array above, one per parse tree.
// Every index must be a valid slot of the four-element array, i.e. 0..3;
// fSafeRevTree is used directly as a subscript by the rule scanner.
// (In C, these are pointer variables and there is no array.)
static final int fForwardTree = 0;
static final int fReverseTree = 1;
static final int fSafeFwdTree = 2;
static final int fSafeRevTree = 3;
int fDefaultTree = fForwardTree; // For rules not qualified with a !
// the tree to which they belong.

View file

@ -292,7 +292,7 @@ class RBBIRuleScanner {
// OR this rule into the appropriate group of them.
//
int destRules = (fReverseRule ? RBBIRuleBuilder.fReverseTree : fRB.fDefaultTree);
int destRules = (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree);
if (fRB.fTreeRoots[destRules] != null) {
// This is not the first rule encountered.
@ -972,18 +972,6 @@ class RBBIRuleScanner {
error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
}
//
// If there were NO user specified reverse rules, set up the equivalent of ".*;"
//
if (fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] == null) {
fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree] = pushNewNode(RBBINode.opStar);
RBBINode operand = pushNewNode(RBBINode.setRef);
findSetFor(kAny, operand, null);
fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].fLeftChild = operand;
operand.fParent = fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree];
fNodeStackPtr -= 2;
}
//
// Parsing of the input RBBI rules is complete.
// We now have a parse tree for the rule expressions

View file

@ -23,18 +23,16 @@ import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.RBBIRuleBuilder.IntPair;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
// It builds the state transition table used by the RBBI runtime
// from the expression syntax tree generated by the rule scanner.
//
// This class is part of the RBBI implementation only.
// There is no user-visible public API here.
//
/**
* This class is part of the RBBI rule compiler.
* It builds the state transition table used by the RBBI runtime
* from the expression syntax tree generated by the rule scanner.
*
* This class is part of the RBBI implementation only.
* There is no user-visible public API here.
*/
class RBBITableBuilder {
//
// RBBIStateDescriptor - The DFA is initially constructed as a set of these descriptors,
// one for each state.
@ -65,13 +63,15 @@ class RBBITableBuilder {
private RBBIRuleBuilder fRB;
private int fRootIx; // The array index into RBBIRuleBuilder.fTreeRoots
// for the parse tree to operate on.
// Too bad Java can't do indirection more easily!
private List<RBBIStateDescriptor> fDStates; // D states (Aho's terminology)
// Index is state number
// Contents are RBBIStateDescriptor pointers.
/** The array index into RBBIRuleBuilder.fTreeRoots for the parse tree to operate on. */
private int fRootIx;
/** D states (Aho's terminology). Index is state number. */
private List<RBBIStateDescriptor> fDStates;
/** Synthesized safe table, a List of row arrays. */
private List<short[]> fSafeTable;
//-----------------------------------------------------------------------------
//
@ -91,8 +91,8 @@ class RBBITableBuilder {
//-----------------------------------------------------------------------------
//
// RBBITableBuilder::build - This is the main function for building the DFA state transtion
// table from the RBBI rules parse tree.
// RBBITableBuilder::buildForwardTable - This is the main function for building
// the DFA state transition table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
void buildForwardTable() {
@ -195,8 +195,6 @@ class RBBITableBuilder {
// for all tables. Merge the ones from this table into the global set.
//
mergeRuleStatusVals();
if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("states")>=0) {printStates();}
}
@ -924,6 +922,40 @@ class RBBITableBuilder {
return false;
}
/**
 * Locate the next pair of duplicate rows in the safe reverse table. Works as an iterator:
 * the search resumes from {@code states.first}, and {@code states.second} is always
 * restarted at {@code states.first + 1}.
 *
 * Two rows count as duplicates when every column either holds the same value in both
 * rows, or holds, in both rows, a reference to one of the two states being compared.
 *
 * @param states in/out parameter. On entry, {@code first} is where to start looking.
 *               On a {@code true} return, {@code first} and {@code second} are the
 *               indexes of the duplicate pair that was found.
 * @return true if a pair of duplicate states was found, false otherwise.
 * @internal
 */
boolean findDuplicateSafeState(RBBIRuleBuilder.IntPair states) {
    final int stateCount = fSafeTable.size();
    while (states.first < stateCount - 1) {
        final short[] keepRow = fSafeTable.get(states.first);
        states.second = states.first + 1;
        while (states.second < stateCount) {
            final short[] candidateRow = fSafeTable.get(states.second);
            final int columnCount = keepRow.length;
            int col = 0;
            while (col < columnCount) {
                final int keepVal = keepRow[col];
                final int candidateVal = candidateRow[col];
                if (keepVal != candidateVal) {
                    // Differing cells are still acceptable when both of them point
                    // at one of the two states under comparison.
                    final boolean keepRefersToPair =
                            keepVal == states.first || keepVal == states.second;
                    final boolean candidateRefersToPair =
                            candidateVal == states.first || candidateVal == states.second;
                    if (!(keepRefersToPair && candidateRefersToPair)) {
                        break;
                    }
                }
                ++col;
            }
            if (col == columnCount) {
                // Scanned every column without finding a mismatch.
                return true;
            }
            ++states.second;
        }
        ++states.first;
    }
    return false;
}
/**
* Remove a duplicate state (row) from the state table. All references to the deleted state are
* redirected to "keepState", the first encountered of the duplicated pair of states.
@ -964,6 +996,33 @@ class RBBITableBuilder {
}
}
/**
 * Remove a duplicate state (row) from the safe table. Every cell that referred to the
 * removed state is redirected to {@code keepState}, and every cell that referred to a
 * state numbered above the removed one is decremented, because those rows each move
 * up by one position when the row is deleted.
 *
 * @param keepState The first of the duplicate pair of states, the one to be kept.
 * @param duplState The second of the duplicate pair, the one to be removed.
 * @internal
 */
void removeSafeState(int keepState, int duplState) {
    assert(keepState < duplState);
    // The bound must come from the safe table itself, the list being modified here,
    // not from the forward DFA state list, whose size is unrelated.
    assert(duplState < fSafeTable.size());

    fSafeTable.remove(duplState);   // int argument: removes by index.
    for (short[] row : fSafeTable) {
        for (int col = 0; col < row.length; col++) {
            int existingVal = row[col];
            if (existingVal == duplState) {
                row[col] = (short) keepState;
            } else if (existingVal > duplState) {
                row[col] = (short) (existingVal - 1);
            }
        }
    }
}
/**
* Check for, and remove duplicate states (table rows).
@ -1047,6 +1106,146 @@ class RBBITableBuilder {
return table;
}
/**
 * Synthesize a safe state table from the main state table.
 *
 * The work is done in four steps:
 *   1. Find the "safe" pairs of character classes (c1, c2): pairs for which every
 *      state of the main table, from state 1 upward, arrives at the same state
 *      after consuming c1 followed by c2.
 *   2. Create the initial table, a List of short[] rows holding next-state values only.
 *      Row 0 is the stop state, row 1 is the start state, and rows 2 and beyond
 *      are one row per character class.
 *   3. For each safe pair, set the cell in the row of c2, column c1, to zero,
 *      the stop state, meaning a safe point was found.
 *   4. Remove duplicate rows, compacting the table.
 */
void buildSafeReverseTable() {
    // Each safe pair is stored as two consecutive chars in this StringBuilder.
    StringBuilder safePairs = new StringBuilder();
    final int classCount = fRB.fSetBuilder.getNumCharCategories();
    final int dfaStateCount = fDStates.size();

    // Step 1: collect the safe pairs.
    for (int firstClass = 0; firstClass < classCount; ++firstClass) {
        for (int secondClass = 0; secondClass < classCount; ++secondClass) {
            int commonEnd = -1;    // End state reached from the first start state examined.
            int latestEnd = 0;     // End state reached from the most recent start state.
            for (int from = 1; from < dfaStateCount; ++from) {
                int afterFirst = fDStates.get(from).fDtran[firstClass];
                latestEnd = fDStates.get(afterFirst).fDtran[secondClass];
                if (commonEnd < 0) {
                    commonEnd = latestEnd;
                } else if (commonEnd != latestEnd) {
                    // Two start states disagree; this pair is not safe.
                    break;
                }
            }
            if (commonEnd == latestEnd) {
                safePairs.append((char) firstClass).append((char) secondClass);
            }
        }
    }

    // Step 2: populate the initial safe table.
    // The four leading fields of a runtime row (fAccepting, fLookAhead, etc.) are not
    // needed for the safe table, and are omitted at this stage of building.
    assert(fSafeTable == null);

    // From the start state, each input char class transitions to the state for that
    // input. The +2 skips over the stop and start state rows.
    short[] startRow = new short[classCount];
    for (int charClass = 0; charClass < classCount; ++charClass) {
        startRow[charClass] = (short) (charClass + 2);
    }

    fSafeTable = new ArrayList<short[]>();
    fSafeTable.add(new short[classCount]);     // Row 0, the stop state, stays all zero.
    fSafeTable.add(startRow);                  // Row 1, the start state.
    for (int charClass = 0; charClass < classCount; ++charClass) {
        // Every per-class row begins life as an independent copy of the start row.
        fSafeTable.add(startRow.clone());
    }

    // Step 3: mark the safe points.
    for (int pairIdx = 0; pairIdx < safePairs.length(); pairIdx += 2) {
        int firstClass = safePairs.charAt(pairIdx);
        int secondClass = safePairs.charAt(pairIdx + 1);
        fSafeTable.get(secondClass + 2)[firstClass] = 0;
    }

    // Step 4: remove duplicate or redundant rows from the table.
    RBBIRuleBuilder.IntPair duplicates = new RBBIRuleBuilder.IntPair(1, 0);
    while (findDuplicateSafeState(duplicates)) {
        removeSafeState(duplicates.first, duplicates.second);
    }
}
/**
 * Calculate the size, in bytes, of the runtime form of this safe state table.
 *
 * @return 0 if no safe table has been built; otherwise a 16 byte header plus
 *         (8 + 2 * number of columns) bytes per row, rounded up to a multiple of 8.
 */
int getSafeTableSize() {
    if (fSafeTable == null) {
        return 0;
    }
    final int headerBytes = 16;     // The header of 4 ints, with no rows to the table.
    final int rowBytes = 8 + 2 * fSafeTable.get(0).length;
    final int unpaddedBytes = headerBytes + fSafeTable.size() * rowBytes;
    // TODO: there are redundant round-ups. Figure out the best place, get rid of the rest.
    return (unpaddedBytes + 7) & ~7;    // Round up to a multiple of 8 bytes.
}
/**
 * Create a RBBIDataWrapper.RBBIStateTable for the safe reverse table.
 * RBBIDataWrapper.RBBIStateTable is similar to struct RBBIStateTable in ICU4C,
 * in common/rbbidata.h
 *
 * Only the next-state cells of each row are written. The row header cells are left
 * at the zero value they receive when the array is allocated.
 *
 * @return the newly created state table.
 */
RBBIDataWrapper.RBBIStateTable exportSafeTable() {
    RBBIDataWrapper.RBBIStateTable exported = new RBBIDataWrapper.RBBIStateTable();
    exported.fNumStates = fSafeTable.size();
    final int columnCount = fSafeTable.get(0).length;

    // Row length in shorts. The "4" is the size of struct RBBIStateTableRow,
    // the row header part only.
    final int shortsPerRow = 4 + columnCount;

    // TODO: the table length is basically numStates * numCharCategories,
    //       except for alignment padding. Clean up here, and in main exportTable().
    final int tableShorts = (getSafeTableSize() - 16) / 2;
    exported.fTable = new short[tableShorts];
    exported.fRowLen = shortsPerRow * 2;     // Row length in bytes.

    int rowStart = 0;     // Offset, in shorts, of the current row within fTable.
    for (short[] sourceRow : fSafeTable) {
        for (int col = 0; col < columnCount; col++) {
            exported.fTable[rowStart + RBBIDataWrapper.NEXTSTATES + col] = sourceRow[col];
        }
        rowStart += shortsPerRow;
    }
    return exported;
}
//-----------------------------------------------------------------------------
@ -1104,6 +1303,44 @@ class RBBITableBuilder {
}
/**
 * Debug Function. Dump the fully constructed safe reverse table to System.out.
 * Prints a placeholder line and returns if no safe table has been built.
 * The Accepting, LookAhead and Tag columns are always printed as zero; the safe
 * table rows held by the builder contain next-state values only.
 */
void printReverseTable() {
    System.out.printf(" Safe Reverse Table \n");
    if (fSafeTable == null) {
        System.out.printf(" --- nullptr ---\n");
        return;
    }
    final int columnCount = fSafeTable.get(0).length;

    // Column headings: one numbered column per input character category.
    System.out.printf("state | i n p u t s y m b o l s \n");
    System.out.printf(" | Acc LA Tag");
    for (int category = 0; category < columnCount; category++) {
        System.out.printf(" %2d", category);
    }
    System.out.printf("\n");

    // Separator line beneath the headings.
    System.out.printf(" |---------------");
    for (int category = 0; category < columnCount; category++) {
        System.out.printf("---");
    }
    System.out.printf("\n");

    // One output line per state.
    int stateNumber = 0;
    for (short[] stateRow : fSafeTable) {
        System.out.printf(" %3d | " , stateNumber);
        System.out.printf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
        for (int category = 0; category < columnCount; category++) {
            System.out.printf(" %2d", stateRow[category]);
        }
        System.out.printf("\n");
        stateNumber++;
    }
    System.out.printf("\n\n");
}
//-----------------------------------------------------------------------------