ICU-13194 RBBI safe tables, Java port, work in progress.

X-SVN-Rev: 41170
This commit is contained in:
Andy Heninger 2018-03-29 16:09:26 +00:00
parent 198a14956e
commit ed5b77c406
4 changed files with 128 additions and 291 deletions

View file

@ -177,25 +177,13 @@ public final class RBBIDataWrapper {
*/
@Deprecated
public RBBIStateTable fRTable;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public RBBIStateTable fSFTable;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public RBBIStateTable fSRTable;
Trie2 fTrie;
String fRuleSource;
int fStatusTable[];
static final int DATA_FORMAT = 0x42726b20; // "Brk "
static final int FORMAT_VERSION = 0x04000000; // 4.0.0.0
static final int FORMAT_VERSION = 0x05000000; // 5.0.0.0
private static final class IsAcceptable implements Authenticate {
@Override
@ -210,7 +198,7 @@ public final class RBBIDataWrapper {
// Indexes to fields in the ICU4C style binary form of the RBBI Data Header
// Used by the rule compiler when flattening the data.
//
final static int DH_SIZE = 24;
final static int DH_SIZE = 20;
final static int DH_MAGIC = 0;
final static int DH_FORMATVERSION = 1;
final static int DH_LENGTH = 2;
@ -219,16 +207,12 @@ public final class RBBIDataWrapper {
final static int DH_FTABLELEN = 5;
final static int DH_RTABLE = 6;
final static int DH_RTABLELEN = 7;
final static int DH_SFTABLE = 8;
final static int DH_SFTABLELEN = 9;
final static int DH_SRTABLE = 10;
final static int DH_SRTABLELEN = 11;
final static int DH_TRIE = 12;
final static int DH_TRIELEN = 13;
final static int DH_RULESOURCE = 14;
final static int DH_RULESOURCELEN = 15;
final static int DH_STATUSTABLE = 16;
final static int DH_STATUSTABLELEN = 17;
final static int DH_TRIE = 8;
final static int DH_TRIELEN = 9;
final static int DH_RULESOURCE = 10;
final static int DH_RULESOURCELEN = 11;
final static int DH_STATUSTABLE = 12;
final static int DH_STATUSTABLELEN = 13;
// Index offsets to the fields in a state table row.
@ -299,10 +283,6 @@ public final class RBBIDataWrapper {
int fFTableLen;
int fRTable; // Offset to the reverse state transition table.
int fRTableLen;
int fSFTable; // safe point forward transition table
int fSFTableLen;
int fSRTable; // safe point reverse transition table
int fSRTableLen;
int fTrie; // Offset to Trie data for character categories
int fTrieLen;
int fRuleSource; // Offset to the source for the break
@ -358,10 +338,6 @@ public final class RBBIDataWrapper {
This.fHeader.fFTableLen = bytes.getInt();
This.fHeader.fRTable = bytes.getInt();
This.fHeader.fRTableLen = bytes.getInt();
This.fHeader.fSFTable = bytes.getInt();
This.fHeader.fSFTableLen = bytes.getInt();
This.fHeader.fSRTable = bytes.getInt();
This.fHeader.fSRTableLen = bytes.getInt();
This.fHeader.fTrie = bytes.getInt();
This.fHeader.fTrieLen = bytes.getInt();
This.fHeader.fRuleSource = bytes.getInt();
@ -406,41 +382,6 @@ public final class RBBIDataWrapper {
This.fRTable = RBBIStateTable.get(bytes, This.fHeader.fRTableLen);
pos += This.fHeader.fRTableLen;
//
// Read in the Safe Forward state table
//
if (This.fHeader.fSFTableLen > 0) {
// Skip over any padding in the file
ICUBinary.skipBytes(bytes, This.fHeader.fSFTable - pos);
pos = This.fHeader.fSFTable;
// Create & fill the table itself.
This.fSFTable = RBBIStateTable.get(bytes, This.fHeader.fSFTableLen);
pos += This.fHeader.fSFTableLen;
}
//
// Read in the Safe Reverse state table
//
if (This.fHeader.fSRTableLen > 0) {
// Skip over any padding in the file
ICUBinary.skipBytes(bytes, This.fHeader.fSRTable - pos);
pos = This.fHeader.fSRTable;
// Create & fill the table itself.
This.fSRTable = RBBIStateTable.get(bytes, This.fHeader.fSRTableLen);
pos += This.fHeader.fSRTableLen;
}
// Rule Compatibility Hacks
// If a rule set includes reverse rules but does not explicitly include safe reverse rules,
// the reverse rules are to be treated as safe reverse rules.
if (This.fSRTable == null && This.fRTable != null) {
This.fSRTable = This.fRTable;
This.fRTable = null;
}
//
// Unserialize the Character categories TRIE
// Because we can't be absolutely certain where the Trie deserialize will
@ -512,10 +453,6 @@ public final class RBBIDataWrapper {
dumpTable(out, fFTable);
out.println("Reverse State Table");
dumpTable(out, fRTable);
out.println("Forward Safe Points Table");
dumpTable(out, fSFTable);
out.println("Reverse Safe Points Table");
dumpTable(out, fSRTable);
dumpCharCategories(out);
out.println("Source Rules: " + fRuleSource);

View file

@ -40,8 +40,8 @@ class RBBIRuleBuilder {
RBBINode[] fTreeRoots = new RBBINode[4];
static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
static final int fReverseTree = 1; // for each of the trees.
static final int fSafeFwdTree = 2; // (in C, these are pointer variables and
static final int fSafeRevTree = 3; // there is no array.)
// // (in C, these are pointer variables and
// // there is no array.)
int fDefaultTree = fForwardTree; // For rules not qualified with a !
// the tree to which they belong.
@ -57,10 +57,7 @@ class RBBIRuleBuilder {
RBBISetBuilder fSetBuilder; // Set and Character Category builder.
List<RBBINode> fUSetNodes; // Vector of all uset nodes.
RBBITableBuilder fForwardTables; // State transition tables
RBBITableBuilder fReverseTables;
RBBITableBuilder fSafeFwdTables;
RBBITableBuilder fSafeRevTables;
RBBITableBuilder fForwardTable; // State transition tables
//
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
@ -176,20 +173,16 @@ class RBBIRuleBuilder {
// Sections sizes actually stored in the header are for the actual data
// without the padding.
//
int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader));
int forwardTableSize = align8(fForwardTables.getTableSize());
int reverseTableSize = align8(fReverseTables.getTableSize());
// int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
int safeRevTableSize = align8(fSafeRevTables.getTableSize());
int headerSize = RBBIDataWrapper.DH_SIZE * 4; // align8(sizeof(RBBIDataHeader));
int forwardTableSize = align8(fForwardTable.getTableSize());
int reverseTableSize = align8(fForwardTable.getSafeTableSize());
int trieSize = align8(fSetBuilder.getTrieSize());
int statusTableSize = align8(fRuleStatusVals.size() * 4);
int rulesSize = align8((strippedRules.length()) * 2);
int totalSize = headerSize
+ forwardTableSize
+ /* reverseTableSize */ 0
+ /* safeFwdTableSize */ 0
+ (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize)
+ reverseTableSize
+ statusTableSize + trieSize + rulesSize;
int outputPos = 0; // Track stream position, starting from RBBIDataHeader.
@ -207,39 +200,14 @@ class RBBIRuleBuilder {
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
// Only save the forward table and the safe reverse table,
// because these are the only ones used at run-time.
//
// For the moment, we still build the other tables if they are present in the rule source files,
// for backwards compatibility. Old rule files need to work, and this is the simplest approach.
//
// Additional backwards compatibility consideration: if no safe rules are provided, consider the
// reverse rules to actually be the safe reverse rules.
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fTableLen
// Do not save Reverse Table.
header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
header[RBBIDataWrapper.DH_RTABLELEN] = 0; // fRTableLen
header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen
// Do not save the Safe Forward table.
header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE]
+ 0; // fSTable
header[RBBIDataWrapper.DH_SFTABLELEN] = 0; // fSTableLen
// Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse.
header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE]
+ 0; // fSRTable
if (safeRevTableSize > 0) {
header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize;
} else {
assert reverseTableSize > 0;
header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize;
}
header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE]
+ header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie
header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_RTABLE]
+ header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
+ header[RBBIDataWrapper.DH_TRIELEN];
@ -253,49 +221,25 @@ class RBBIRuleBuilder {
}
// Write out the actual state tables.
RBBIDataWrapper.RBBIStateTable table = fForwardTables.exportTable();
assert(outputPos == header[4]);
RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable();
assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]);
outputPos += table.put(dos);
/* do not write the reverse table
tableData = fReverseTables.exportTable();
Assert.assrt(outputPos == header[6]);
for (i = 0; i < tableData.length; i++) {
dos.writeShort(tableData[i]);
outputPos += 2;
}
*/
/* do not write safe forwards table
Assert.assrt(outputPos == header[8]);
tableData = fSafeFwdTables.exportTable();
for (i = 0; i < tableData.length; i++) {
dos.writeShort(tableData[i]);
outputPos += 2;
}
*/
// Write the safe reverse table.
// If not present, write the plain reverse table (old style rule compatibility)
assert(outputPos == header[10]);
if (safeRevTableSize > 0) {
table = fSafeRevTables.exportTable();
} else {
table = fReverseTables.exportTable();
}
table = fForwardTable.exportSafeTable();
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]);
outputPos += table.put(dos);
// write out the Trie table
Assert.assrt(outputPos == header[12]);
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]);
fSetBuilder.serializeTrie(os);
outputPos += header[13];
outputPos += header[RBBIDataWrapper.DH_TRIELEN];
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
dos.write(0);
outputPos += 1;
}
// Write out the status {tag} table.
Assert.assrt(outputPos == header[16]);
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]);
for (Integer val : fRuleStatusVals) {
dos.writeInt(val.intValue());
outputPos += 4;
@ -308,7 +252,7 @@ class RBBIRuleBuilder {
// Write out the stripped rules (rules with extra spaces removed).
// These go last in the data area, even though they are not last in the header.
Assert.assrt(outputPos == header[14]);
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
dos.writeChars(strippedRules);
outputPos += strippedRules.length() * 2;
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
@ -330,7 +274,15 @@ class RBBIRuleBuilder {
// and list of all Unicode Sets referenced by the rules.
//
RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
builder.fScanner.parse();
builder.build(os);
}
/**
* Compile rules to the binary form, write that to an output stream.
*
*/
void build(OutputStream os) throws IOException {
fScanner.parse();
//
// UnicodeSet processing.
@ -338,31 +290,30 @@ class RBBIRuleBuilder {
// Generate the mapping tables (TRIE) from input code points to
// the character categories.
//
builder.fSetBuilder.buildRanges();
fSetBuilder.buildRanges();
//
// Generate the DFA state transition table.
//
builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree);
builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree);
builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree);
builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree);
builder.fForwardTables.build();
builder.fReverseTables.build();
builder.fSafeFwdTables.build();
builder.fSafeRevTables.build();
if (builder.fDebugEnv != null
&& builder.fDebugEnv.indexOf("states") >= 0) {
builder.fForwardTables.printRuleStatusTable();
fForwardTable = new RBBITableBuilder(this, fForwardTree);
fForwardTable.buildForwardTable();
optimizeTables();
fForwardTable.buildSafeReverseTable();
if (fDebugEnv != null
&& fDebugEnv.indexOf("states") >= 0) {
fForwardTable.printStates();
fForwardTable.printRuleStatusTable();
fForwardTable.printReverseTable();
}
builder.optimizeTables();
builder.fSetBuilder.buildTrie();
fSetBuilder.buildTrie();
//
// Package up the compiled data, writing it to an output stream
// in the serialization format. This is the same as the ICU4C runtime format.
//
builder.flattenData(os);
flattenData(os);
}
static class IntPair {
@ -377,17 +328,10 @@ class RBBIRuleBuilder {
void optimizeTables() {
IntPair duplPair = new IntPair(3, 0);
while (fForwardTables.findDuplCharClassFrom(duplPair)) {
while (fForwardTable.findDuplCharClassFrom(duplPair)) {
fSetBuilder.mergeCategories(duplPair.first, duplPair.second);
fForwardTables.removeColumn(duplPair.second);
fReverseTables.removeColumn(duplPair.second);
fSafeFwdTables.removeColumn(duplPair.second);
fSafeRevTables.removeColumn(duplPair.second);
fForwardTable.removeColumn(duplPair.second);
}
fForwardTables.removeDuplicateStates();
fReverseTables.removeDuplicateStates();
fSafeFwdTables.removeDuplicateStates();
fSafeRevTables.removeDuplicateStates();
fForwardTable.removeDuplicateStates();
}
}

View file

@ -95,7 +95,7 @@ class RBBITableBuilder {
// table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
void build() {
void buildForwardTable() {
// If there were no rules, just return. This situation can easily arise
// for the reverse rules.
if (fRB.fTreeRoots[fRootIx]==null) {

View file

@ -12,7 +12,6 @@ package com.ibm.icu.text;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
import static com.ibm.icu.impl.CharacterIteration.previous32;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@ -510,7 +509,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
checkOffset(offset, fText);
// Adjust offset to be on a code point boundary and not beyond the end of the text.
// Note that isBoundary() is always be false for offsets that are not on code point boundaries.
// Note that isBoundary() is always false for offsets that are not on code point boundaries.
// But we still need the side effect of leaving iteration at the following boundary.
int adjustedOffset = CISetIndex32(fText, offset);
@ -966,142 +965,71 @@ public class RuleBasedBreakIterator extends BreakIterator {
* This locates a "Safe Position" from which the forward break rules
* will operate correctly. A Safe Position is not necessarily a boundary itself.
*
* The logic of this function is very similar to handleNext(), above.
* The logic of this function is very similar to handleNext(), above, but simpler
* because the safe table does not require as many options.
*
* @param fromPosition the position in the input text to begin the iteration.
* @internal
*/
private int handlePrevious(int fromPosition) {
if (fText == null) {
return 0;
private int handleSafePrevious(int fromPosition) {
int state;
short category = 0;
int result = 0;
// caches for quicker access
CharacterIterator text = fText;
Trie2 trie = fRData.fTrie;
short[] stateTable = fRData.fRTable.fTable;
CISetIndex32(text, fromPosition);
if (TRACE) {
System.out.print("Handle Previous pos char state category");
}
int state;
int category = 0;
int mode;
int row;
int c;
int result = 0;
int initialPosition = fromPosition;
fLookAheadMatches.reset();
short[] stateTable = fRData.fSRTable.fTable;
CISetIndex32(fText, fromPosition);
if (fromPosition == fText.getBeginIndex()) {
// if we're already at the start of the text, return DONE.
if (text.getIndex() == text.getBeginIndex()) {
return BreakIterator.DONE;
}
// set up the starting char
result = initialPosition;
c = previous32(fText);
// Set up the initial state for the state machine
// Set the initial state for the state machine
int c = CharacterIteration.previous32(text);
state = START_STATE;
row = fRData.getRowIndex(state);
category = 3; // TODO: obsolete? from the old start/run mode scheme?
mode = RBBI_RUN;
if ((fRData.fSRTable.fFlags & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
category = 2;
mode = RBBI_START;
}
int row = fRData.getRowIndex(state);
if (TRACE) {
System.out.println("Handle Prev pos char state category ");
}
// loop until we reach the beginning of the text or transition to state 0
// loop until we reach the start of the text or transition to state 0
//
mainLoop: for (;;) {
if (c == DONE32) {
// Reached end of input string.
if (mode == RBBI_END) {
// We have already done the {eof} iteration. Now is the time
// to unconditionally bail out.
break mainLoop;
}
mode = RBBI_END;
category = 1;
}
if (mode == RBBI_RUN) {
// look up the current character's category, which tells us
// which column in the state table to look at.
//
// And off the dictionary flag bit. For reverse iteration it is not used.
category = (short) fRData.fTrie.get(c);
category &= ~0x4000;
}
for (; c != DONE32; c = CharacterIteration.previous32(text)) {
// look up the current character's character category, which tells us
// which column in the state table to look at.
//
// And off the dictionary flag bit. For reverse iteration it is not used.
category = (short) trie.get(c);
category &= ~0x4000;
if (TRACE) {
System.out.print(" " + fText.getIndex() + " ");
if (0x20 <= c && c < 0x7f) {
System.out.print(" " + c + " ");
} else {
System.out.print(" " + Integer.toHexString(c) + " ");
}
System.out.println(" " + state + " " + category + " ");
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
}
// State Transition - move machine to its next state
//
assert(category < fRData.fHeader.fCatCount);
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
row = fRData.getRowIndex(state);
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
// Match found, common case, could have lookahead so we move
// on to check it
result = fText.getIndex();
}
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
if (completedRule > 0) {
// Lookahead match is completed.
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
if (lookaheadResult >= 0) {
result = lookaheadResult;
break mainLoop;
}
}
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
if (rule != 0) {
// At the position of a '/' in a look-ahead match. Record it.
int pos = fText.getIndex();
fLookAheadMatches.setPosition(rule, pos);
}
row = fRData.getRowIndex(state);
if (state == STOP_STATE) {
// Normal loop exit is here
break mainLoop;
// This is the normal exit from the lookup state machine.
// Transition to state zero means we have found a safe point.
break;
}
// then move iterator position backwards one character
//
if (mode == RBBI_RUN) {
c = previous32(fText);
} else {
if (mode == RBBI_START) {
mode = RBBI_RUN;
}
}
} // End of the main loop.
// The state machine is done. Check whether it found a match...
//
// If the iterator failed to move in the match engine, force it back by one code point.
// (This really indicates a defect in the break rules. They should always match
// at least one character.)
if (result == initialPosition) {
CISetIndex32(fText, initialPosition);
previous32(fText);
result = fText.getIndex();
}
// The state machine is done.
result = text.getIndex();
if (TRACE) {
System.out.println("Result = " + result);
System.out.println("result = " + result);
}
return result;
}
@ -1493,11 +1421,26 @@ class BreakCache {
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
int aBoundary = fText.getBeginIndex();
int ruleStatusIndex = 0;
// TODO: check for position == length of text. Although may still need to back up to get rule status.
if (position > aBoundary + 20) {
int backupPos = handlePrevious(position);
fPosition = backupPos;
aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary.
int backupPos = handleSafePrevious(position);
if (backupPos > aBoundary) {
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fPosition = backupPos;
aBoundary = handleNext();
if (aBoundary == backupPos + 1 ||
(aBoundary == backupPos + 2 &&
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
Character.isLowSurrogate(fText.next()))) {
// The initial handleNext() only advanced by a single code point. Go again.
// Safe rules identify safe pairs.
aBoundary = handleNext();
}
}
ruleStatusIndex = fRuleStatusIndex;
}
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
@ -1628,21 +1571,34 @@ class BreakCache {
if (backupPosition <= textBegin) {
backupPosition = textBegin;
} else {
backupPosition = handlePrevious(backupPosition);
backupPosition = handleSafePrevious(backupPosition);
}
if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
position = textBegin;
positionStatusIdx = 0;
} else {
// Advance to the boundary following the backup position.
// There is a complication: the safe reverse rules identify pairs of code points
// that are safe. If advancing from the safe point moves forwards by less than
// two code points, we need to advance one more time to ensure that the boundary
// is good, including a correct rules status value.
//
fPosition = backupPosition; // TODO: pass starting position in a clearer way.
position = handleNext();
if (position == backupPosition + 1 ||
(position == backupPosition + 2 &&
Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
Character.isLowSurrogate(fText.next()))) {
// The initial handleNext() only advanced by a single code point. Go again.
// Safe rules identify safe pairs.
position = handleNext();
}
positionStatusIdx = fRuleStatusIndex;
}
} while (position >= fromPosition);
// Find boundaries between the one we just located and the first already-cached boundary
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
fSideBuffer.removeAllElements();
fSideBuffer.push(position);