mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-11 08:01:32 +00:00
ICU-13194 RBBI safe tables, Java port, work in progress.
X-SVN-Rev: 41170
This commit is contained in:
parent
198a14956e
commit
ed5b77c406
4 changed files with 128 additions and 291 deletions
|
@ -177,25 +177,13 @@ public final class RBBIDataWrapper {
|
|||
*/
|
||||
@Deprecated
|
||||
public RBBIStateTable fRTable;
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public RBBIStateTable fSFTable;
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public RBBIStateTable fSRTable;
|
||||
|
||||
Trie2 fTrie;
|
||||
String fRuleSource;
|
||||
int fStatusTable[];
|
||||
|
||||
static final int DATA_FORMAT = 0x42726b20; // "Brk "
|
||||
static final int FORMAT_VERSION = 0x04000000; // 4.0.0.0
|
||||
static final int FORMAT_VERSION = 0x05000000; // 5.0.0.0
|
||||
|
||||
private static final class IsAcceptable implements Authenticate {
|
||||
@Override
|
||||
|
@ -210,7 +198,7 @@ public final class RBBIDataWrapper {
|
|||
// Indexes to fields in the ICU4C style binary form of the RBBI Data Header
|
||||
// Used by the rule compiler when flattening the data.
|
||||
//
|
||||
final static int DH_SIZE = 24;
|
||||
final static int DH_SIZE = 20;
|
||||
final static int DH_MAGIC = 0;
|
||||
final static int DH_FORMATVERSION = 1;
|
||||
final static int DH_LENGTH = 2;
|
||||
|
@ -219,16 +207,12 @@ public final class RBBIDataWrapper {
|
|||
final static int DH_FTABLELEN = 5;
|
||||
final static int DH_RTABLE = 6;
|
||||
final static int DH_RTABLELEN = 7;
|
||||
final static int DH_SFTABLE = 8;
|
||||
final static int DH_SFTABLELEN = 9;
|
||||
final static int DH_SRTABLE = 10;
|
||||
final static int DH_SRTABLELEN = 11;
|
||||
final static int DH_TRIE = 12;
|
||||
final static int DH_TRIELEN = 13;
|
||||
final static int DH_RULESOURCE = 14;
|
||||
final static int DH_RULESOURCELEN = 15;
|
||||
final static int DH_STATUSTABLE = 16;
|
||||
final static int DH_STATUSTABLELEN = 17;
|
||||
final static int DH_TRIE = 8;
|
||||
final static int DH_TRIELEN = 9;
|
||||
final static int DH_RULESOURCE = 10;
|
||||
final static int DH_RULESOURCELEN = 11;
|
||||
final static int DH_STATUSTABLE = 12;
|
||||
final static int DH_STATUSTABLELEN = 13;
|
||||
|
||||
|
||||
// Index offsets to the fields in a state table row.
|
||||
|
@ -299,10 +283,6 @@ public final class RBBIDataWrapper {
|
|||
int fFTableLen;
|
||||
int fRTable; // Offset to the reverse state transition table.
|
||||
int fRTableLen;
|
||||
int fSFTable; // safe point forward transition table
|
||||
int fSFTableLen;
|
||||
int fSRTable; // safe point reverse transition table
|
||||
int fSRTableLen;
|
||||
int fTrie; // Offset to Trie data for character categories
|
||||
int fTrieLen;
|
||||
int fRuleSource; // Offset to the source for the break
|
||||
|
@ -358,10 +338,6 @@ public final class RBBIDataWrapper {
|
|||
This.fHeader.fFTableLen = bytes.getInt();
|
||||
This.fHeader.fRTable = bytes.getInt();
|
||||
This.fHeader.fRTableLen = bytes.getInt();
|
||||
This.fHeader.fSFTable = bytes.getInt();
|
||||
This.fHeader.fSFTableLen = bytes.getInt();
|
||||
This.fHeader.fSRTable = bytes.getInt();
|
||||
This.fHeader.fSRTableLen = bytes.getInt();
|
||||
This.fHeader.fTrie = bytes.getInt();
|
||||
This.fHeader.fTrieLen = bytes.getInt();
|
||||
This.fHeader.fRuleSource = bytes.getInt();
|
||||
|
@ -406,41 +382,6 @@ public final class RBBIDataWrapper {
|
|||
This.fRTable = RBBIStateTable.get(bytes, This.fHeader.fRTableLen);
|
||||
pos += This.fHeader.fRTableLen;
|
||||
|
||||
//
|
||||
// Read in the Safe Forward state table
|
||||
//
|
||||
if (This.fHeader.fSFTableLen > 0) {
|
||||
// Skip over any padding in the file
|
||||
ICUBinary.skipBytes(bytes, This.fHeader.fSFTable - pos);
|
||||
pos = This.fHeader.fSFTable;
|
||||
|
||||
// Create & fill the table itself.
|
||||
This.fSFTable = RBBIStateTable.get(bytes, This.fHeader.fSFTableLen);
|
||||
pos += This.fHeader.fSFTableLen;
|
||||
}
|
||||
|
||||
//
|
||||
// Read in the Safe Reverse state table
|
||||
//
|
||||
if (This.fHeader.fSRTableLen > 0) {
|
||||
// Skip over any padding in the file
|
||||
ICUBinary.skipBytes(bytes, This.fHeader.fSRTable - pos);
|
||||
pos = This.fHeader.fSRTable;
|
||||
|
||||
// Create & fill the table itself.
|
||||
This.fSRTable = RBBIStateTable.get(bytes, This.fHeader.fSRTableLen);
|
||||
pos += This.fHeader.fSRTableLen;
|
||||
}
|
||||
|
||||
// Rule Compatibility Hacks
|
||||
// If a rule set includes reverse rules but does not explicitly include safe reverse rules,
|
||||
// the reverse rules are to be treated as safe reverse rules.
|
||||
|
||||
if (This.fSRTable == null && This.fRTable != null) {
|
||||
This.fSRTable = This.fRTable;
|
||||
This.fRTable = null;
|
||||
}
|
||||
|
||||
//
|
||||
// Unserialize the Character categories TRIE
|
||||
// Because we can't be absolutely certain where the Trie deserialize will
|
||||
|
@ -512,10 +453,6 @@ public final class RBBIDataWrapper {
|
|||
dumpTable(out, fFTable);
|
||||
out.println("Reverse State Table");
|
||||
dumpTable(out, fRTable);
|
||||
out.println("Forward Safe Points Table");
|
||||
dumpTable(out, fSFTable);
|
||||
out.println("Reverse Safe Points Table");
|
||||
dumpTable(out, fSRTable);
|
||||
|
||||
dumpCharCategories(out);
|
||||
out.println("Source Rules: " + fRuleSource);
|
||||
|
|
|
@ -40,8 +40,8 @@ class RBBIRuleBuilder {
|
|||
RBBINode[] fTreeRoots = new RBBINode[4];
|
||||
static final int fForwardTree = 0; // Indexes into the above fTreeRoots array
|
||||
static final int fReverseTree = 1; // for each of the trees.
|
||||
static final int fSafeFwdTree = 2; // (in C, these are pointer variables and
|
||||
static final int fSafeRevTree = 3; // there is no array.)
|
||||
// // (in C, these are pointer variables and
|
||||
// // there is no array.)
|
||||
int fDefaultTree = fForwardTree; // For rules not qualified with a !
|
||||
// the tree to which they belong.
|
||||
|
||||
|
@ -57,10 +57,7 @@ class RBBIRuleBuilder {
|
|||
|
||||
RBBISetBuilder fSetBuilder; // Set and Character Category builder.
|
||||
List<RBBINode> fUSetNodes; // Vector of all uset nodes.
|
||||
RBBITableBuilder fForwardTables; // State transition tables
|
||||
RBBITableBuilder fReverseTables;
|
||||
RBBITableBuilder fSafeFwdTables;
|
||||
RBBITableBuilder fSafeRevTables;
|
||||
RBBITableBuilder fForwardTable; // State transition tables
|
||||
|
||||
//
|
||||
// Status {tag} values. These structures are common to all of the rule sets (Forward, Reverse, etc.).
|
||||
|
@ -176,20 +173,16 @@ class RBBIRuleBuilder {
|
|||
// Sections sizes actually stored in the header are for the actual data
|
||||
// without the padding.
|
||||
//
|
||||
int headerSize = 24 * 4; // align8(sizeof(RBBIDataHeader));
|
||||
int forwardTableSize = align8(fForwardTables.getTableSize());
|
||||
int reverseTableSize = align8(fReverseTables.getTableSize());
|
||||
// int safeFwdTableSize = align8(fSafeFwdTables.getTableSize());
|
||||
int safeRevTableSize = align8(fSafeRevTables.getTableSize());
|
||||
int headerSize = RBBIDataWrapper.DH_SIZE * 4; // align8(sizeof(RBBIDataHeader));
|
||||
int forwardTableSize = align8(fForwardTable.getTableSize());
|
||||
int reverseTableSize = align8(fForwardTable.getSafeTableSize());
|
||||
int trieSize = align8(fSetBuilder.getTrieSize());
|
||||
int statusTableSize = align8(fRuleStatusVals.size() * 4);
|
||||
int rulesSize = align8((strippedRules.length()) * 2);
|
||||
|
||||
int totalSize = headerSize
|
||||
+ forwardTableSize
|
||||
+ /* reverseTableSize */ 0
|
||||
+ /* safeFwdTableSize */ 0
|
||||
+ (safeRevTableSize > 0 ? safeRevTableSize : reverseTableSize)
|
||||
+ reverseTableSize
|
||||
+ statusTableSize + trieSize + rulesSize;
|
||||
int outputPos = 0; // Track stream position, starting from RBBIDataHeader.
|
||||
|
||||
|
@ -207,39 +200,14 @@ class RBBIRuleBuilder {
|
|||
header[RBBIDataWrapper.DH_LENGTH] = totalSize; // fLength, the total size of all rule sections.
|
||||
header[RBBIDataWrapper.DH_CATCOUNT] = fSetBuilder.getNumCharCategories(); // fCatCount.
|
||||
|
||||
// Only save the forward table and the safe reverse table,
|
||||
// because these are the only ones used at run-time.
|
||||
//
|
||||
// For the moment, we still build the other tables if they are present in the rule source files,
|
||||
// for backwards compatibility. Old rule files need to work, and this is the simplest approach.
|
||||
//
|
||||
// Additional backwards compatibility consideration: if no safe rules are provided, consider the
|
||||
// reverse rules to actually be the safe reverse rules.
|
||||
|
||||
header[RBBIDataWrapper.DH_FTABLE] = headerSize; // fFTable
|
||||
header[RBBIDataWrapper.DH_FTABLELEN] = forwardTableSize; // fFTableLen
|
||||
|
||||
// Do not save Reverse Table.
|
||||
header[RBBIDataWrapper.DH_RTABLE] = header[RBBIDataWrapper.DH_FTABLE] + forwardTableSize; // fRTable
|
||||
header[RBBIDataWrapper.DH_RTABLELEN] = 0; // fRTableLen
|
||||
header[RBBIDataWrapper.DH_RTABLELEN] = reverseTableSize; // fRTableLen
|
||||
|
||||
// Do not save the Safe Forward table.
|
||||
header[RBBIDataWrapper.DH_SFTABLE] = header[RBBIDataWrapper.DH_RTABLE]
|
||||
+ 0; // fSTable
|
||||
header[RBBIDataWrapper.DH_SFTABLELEN] = 0; // fSTableLen
|
||||
|
||||
// Safe reverse table. Use if present, otherwise save regular reverse table as the safe reverse.
|
||||
header[RBBIDataWrapper.DH_SRTABLE] = header[RBBIDataWrapper.DH_SFTABLE]
|
||||
+ 0; // fSRTable
|
||||
if (safeRevTableSize > 0) {
|
||||
header[RBBIDataWrapper.DH_SRTABLELEN] = safeRevTableSize;
|
||||
} else {
|
||||
assert reverseTableSize > 0;
|
||||
header[RBBIDataWrapper.DH_SRTABLELEN] = reverseTableSize;
|
||||
}
|
||||
|
||||
header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_SRTABLE]
|
||||
+ header[RBBIDataWrapper.DH_SRTABLELEN]; // fTrie
|
||||
header[RBBIDataWrapper.DH_TRIE] = header[RBBIDataWrapper.DH_RTABLE]
|
||||
+ header[RBBIDataWrapper.DH_RTABLELEN]; // fTrie
|
||||
header[RBBIDataWrapper.DH_TRIELEN] = fSetBuilder.getTrieSize(); // fTrieLen
|
||||
header[RBBIDataWrapper.DH_STATUSTABLE] = header[RBBIDataWrapper.DH_TRIE]
|
||||
+ header[RBBIDataWrapper.DH_TRIELEN];
|
||||
|
@ -253,49 +221,25 @@ class RBBIRuleBuilder {
|
|||
}
|
||||
|
||||
// Write out the actual state tables.
|
||||
RBBIDataWrapper.RBBIStateTable table = fForwardTables.exportTable();
|
||||
assert(outputPos == header[4]);
|
||||
RBBIDataWrapper.RBBIStateTable table = fForwardTable.exportTable();
|
||||
assert(outputPos == header[RBBIDataWrapper.DH_FTABLE]);
|
||||
outputPos += table.put(dos);
|
||||
|
||||
/* do not write the reverse table
|
||||
tableData = fReverseTables.exportTable();
|
||||
Assert.assrt(outputPos == header[6]);
|
||||
for (i = 0; i < tableData.length; i++) {
|
||||
dos.writeShort(tableData[i]);
|
||||
outputPos += 2;
|
||||
}
|
||||
*/
|
||||
|
||||
/* do not write safe forwards table
|
||||
Assert.assrt(outputPos == header[8]);
|
||||
tableData = fSafeFwdTables.exportTable();
|
||||
for (i = 0; i < tableData.length; i++) {
|
||||
dos.writeShort(tableData[i]);
|
||||
outputPos += 2;
|
||||
}
|
||||
*/
|
||||
|
||||
// Write the safe reverse table.
|
||||
// If not present, write the plain reverse table (old style rule compatibility)
|
||||
assert(outputPos == header[10]);
|
||||
if (safeRevTableSize > 0) {
|
||||
table = fSafeRevTables.exportTable();
|
||||
} else {
|
||||
table = fReverseTables.exportTable();
|
||||
}
|
||||
table = fForwardTable.exportSafeTable();
|
||||
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RTABLE]);
|
||||
outputPos += table.put(dos);
|
||||
|
||||
// write out the Trie table
|
||||
Assert.assrt(outputPos == header[12]);
|
||||
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_TRIE]);
|
||||
fSetBuilder.serializeTrie(os);
|
||||
outputPos += header[13];
|
||||
outputPos += header[RBBIDataWrapper.DH_TRIELEN];
|
||||
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
|
||||
dos.write(0);
|
||||
outputPos += 1;
|
||||
}
|
||||
|
||||
// Write out the status {tag} table.
|
||||
Assert.assrt(outputPos == header[16]);
|
||||
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_STATUSTABLE]);
|
||||
for (Integer val : fRuleStatusVals) {
|
||||
dos.writeInt(val.intValue());
|
||||
outputPos += 4;
|
||||
|
@ -308,7 +252,7 @@ class RBBIRuleBuilder {
|
|||
|
||||
// Write out the stripped rules (rules with extra spaces removed).
|
||||
// These go last in the data area, even though they are not last in the header.
|
||||
Assert.assrt(outputPos == header[14]);
|
||||
Assert.assrt(outputPos == header[RBBIDataWrapper.DH_RULESOURCE]);
|
||||
dos.writeChars(strippedRules);
|
||||
outputPos += strippedRules.length() * 2;
|
||||
while (outputPos % 8 != 0) { // pad to an 8 byte boundary
|
||||
|
@ -330,7 +274,15 @@ class RBBIRuleBuilder {
|
|||
// and list of all Unicode Sets referenced by the rules.
|
||||
//
|
||||
RBBIRuleBuilder builder = new RBBIRuleBuilder(rules);
|
||||
builder.fScanner.parse();
|
||||
builder.build(os);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compile rules to the binary form, write that to an output stream.
|
||||
*
|
||||
*/
|
||||
void build(OutputStream os) throws IOException {
|
||||
fScanner.parse();
|
||||
|
||||
//
|
||||
// UnicodeSet processing.
|
||||
|
@ -338,31 +290,30 @@ class RBBIRuleBuilder {
|
|||
// Generate the mapping tables (TRIE) from input code points to
|
||||
// the character categories.
|
||||
//
|
||||
builder.fSetBuilder.buildRanges();
|
||||
fSetBuilder.buildRanges();
|
||||
|
||||
//
|
||||
// Generate the DFA state transition table.
|
||||
//
|
||||
builder.fForwardTables = new RBBITableBuilder(builder, fForwardTree);
|
||||
builder.fReverseTables = new RBBITableBuilder(builder, fReverseTree);
|
||||
builder.fSafeFwdTables = new RBBITableBuilder(builder, fSafeFwdTree);
|
||||
builder.fSafeRevTables = new RBBITableBuilder(builder, fSafeRevTree);
|
||||
builder.fForwardTables.build();
|
||||
builder.fReverseTables.build();
|
||||
builder.fSafeFwdTables.build();
|
||||
builder.fSafeRevTables.build();
|
||||
if (builder.fDebugEnv != null
|
||||
&& builder.fDebugEnv.indexOf("states") >= 0) {
|
||||
builder.fForwardTables.printRuleStatusTable();
|
||||
fForwardTable = new RBBITableBuilder(this, fForwardTree);
|
||||
fForwardTable.buildForwardTable();
|
||||
optimizeTables();
|
||||
fForwardTable.buildSafeReverseTable();
|
||||
|
||||
|
||||
if (fDebugEnv != null
|
||||
&& fDebugEnv.indexOf("states") >= 0) {
|
||||
fForwardTable.printStates();
|
||||
fForwardTable.printRuleStatusTable();
|
||||
fForwardTable.printReverseTable();
|
||||
}
|
||||
|
||||
builder.optimizeTables();
|
||||
builder.fSetBuilder.buildTrie();
|
||||
fSetBuilder.buildTrie();
|
||||
//
|
||||
// Package up the compiled data, writing it to an output stream
|
||||
// in the serialization format. This is the same as the ICU4C runtime format.
|
||||
//
|
||||
builder.flattenData(os);
|
||||
flattenData(os);
|
||||
}
|
||||
|
||||
static class IntPair {
|
||||
|
@ -377,17 +328,10 @@ class RBBIRuleBuilder {
|
|||
|
||||
void optimizeTables() {
|
||||
IntPair duplPair = new IntPair(3, 0);
|
||||
while (fForwardTables.findDuplCharClassFrom(duplPair)) {
|
||||
while (fForwardTable.findDuplCharClassFrom(duplPair)) {
|
||||
fSetBuilder.mergeCategories(duplPair.first, duplPair.second);
|
||||
fForwardTables.removeColumn(duplPair.second);
|
||||
fReverseTables.removeColumn(duplPair.second);
|
||||
fSafeFwdTables.removeColumn(duplPair.second);
|
||||
fSafeRevTables.removeColumn(duplPair.second);
|
||||
fForwardTable.removeColumn(duplPair.second);
|
||||
}
|
||||
|
||||
fForwardTables.removeDuplicateStates();
|
||||
fReverseTables.removeDuplicateStates();
|
||||
fSafeFwdTables.removeDuplicateStates();
|
||||
fSafeRevTables.removeDuplicateStates();
|
||||
fForwardTable.removeDuplicateStates();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -95,7 +95,7 @@ class RBBITableBuilder {
|
|||
// table from the RBBI rules parse tree.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
void build() {
|
||||
void buildForwardTable() {
|
||||
// If there were no rules, just return. This situation can easily arise
|
||||
// for the reverse rules.
|
||||
if (fRB.fTreeRoots[fRootIx]==null) {
|
||||
|
|
|
@ -12,7 +12,6 @@ package com.ibm.icu.text;
|
|||
import static com.ibm.icu.impl.CharacterIteration.DONE32;
|
||||
import static com.ibm.icu.impl.CharacterIteration.next32;
|
||||
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;
|
||||
import static com.ibm.icu.impl.CharacterIteration.previous32;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
@ -510,7 +509,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
checkOffset(offset, fText);
|
||||
|
||||
// Adjust offset to be on a code point boundary and not beyond the end of the text.
|
||||
// Note that isBoundary() is always be false for offsets that are not on code point boundaries.
|
||||
// Note that isBoundary() is always false for offsets that are not on code point boundaries.
|
||||
// But we still need the side effect of leaving iteration at the following boundary.
|
||||
int adjustedOffset = CISetIndex32(fText, offset);
|
||||
|
||||
|
@ -966,142 +965,71 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
* This locates a "Safe Position" from which the forward break rules
|
||||
* will operate correctly. A Safe Position is not necessarily a boundary itself.
|
||||
*
|
||||
* The logic of this function is very similar to handleNext(), above.
|
||||
* The logic of this function is very similar to handleNext(), above, but simpler
|
||||
* because the safe table does not require as many options.
|
||||
*
|
||||
* @param fromPosition the position in the input text to begin the iteration.
|
||||
* @internal
|
||||
*/
|
||||
private int handlePrevious(int fromPosition) {
|
||||
if (fText == null) {
|
||||
return 0;
|
||||
private int handleSafePrevious(int fromPosition) {
|
||||
int state;
|
||||
short category = 0;
|
||||
int result = 0;
|
||||
|
||||
// caches for quicker access
|
||||
CharacterIterator text = fText;
|
||||
Trie2 trie = fRData.fTrie;
|
||||
short[] stateTable = fRData.fRTable.fTable;
|
||||
|
||||
CISetIndex32(text, fromPosition);
|
||||
if (TRACE) {
|
||||
System.out.print("Handle Previous pos char state category");
|
||||
}
|
||||
|
||||
int state;
|
||||
int category = 0;
|
||||
int mode;
|
||||
int row;
|
||||
int c;
|
||||
int result = 0;
|
||||
int initialPosition = fromPosition;
|
||||
fLookAheadMatches.reset();
|
||||
short[] stateTable = fRData.fSRTable.fTable;
|
||||
CISetIndex32(fText, fromPosition);
|
||||
if (fromPosition == fText.getBeginIndex()) {
|
||||
// if we're already at the start of the text, return DONE.
|
||||
if (text.getIndex() == text.getBeginIndex()) {
|
||||
return BreakIterator.DONE;
|
||||
}
|
||||
|
||||
// set up the starting char
|
||||
result = initialPosition;
|
||||
c = previous32(fText);
|
||||
|
||||
// Set up the initial state for the state machine
|
||||
// Set the initial state for the state machine
|
||||
int c = CharacterIteration.previous32(text);
|
||||
state = START_STATE;
|
||||
row = fRData.getRowIndex(state);
|
||||
category = 3; // TODO: obsolete? from the old start/run mode scheme?
|
||||
mode = RBBI_RUN;
|
||||
if ((fRData.fSRTable.fFlags & RBBIDataWrapper.RBBI_BOF_REQUIRED) != 0) {
|
||||
category = 2;
|
||||
mode = RBBI_START;
|
||||
}
|
||||
int row = fRData.getRowIndex(state);
|
||||
|
||||
if (TRACE) {
|
||||
System.out.println("Handle Prev pos char state category ");
|
||||
}
|
||||
|
||||
// loop until we reach the beginning of the text or transition to state 0
|
||||
// loop until we reach the start of the text or transition to state 0
|
||||
//
|
||||
mainLoop: for (;;) {
|
||||
if (c == DONE32) {
|
||||
// Reached end of input string.
|
||||
if (mode == RBBI_END) {
|
||||
// We have already done the {eof} iteration. Now is the time
|
||||
// to unconditionally bail out.
|
||||
break mainLoop;
|
||||
}
|
||||
mode = RBBI_END;
|
||||
category = 1;
|
||||
}
|
||||
|
||||
if (mode == RBBI_RUN) {
|
||||
// look up the current character's category, which tells us
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = (short) fRData.fTrie.get(c);
|
||||
category &= ~0x4000;
|
||||
}
|
||||
for (; c != DONE32; c = CharacterIteration.previous32(text)) {
|
||||
|
||||
// look up the current character's character category, which tells us
|
||||
// which column in the state table to look at.
|
||||
//
|
||||
// And off the dictionary flag bit. For reverse iteration it is not used.
|
||||
category = (short) trie.get(c);
|
||||
category &= ~0x4000;
|
||||
if (TRACE) {
|
||||
System.out.print(" " + fText.getIndex() + " ");
|
||||
if (0x20 <= c && c < 0x7f) {
|
||||
System.out.print(" " + c + " ");
|
||||
} else {
|
||||
System.out.print(" " + Integer.toHexString(c) + " ");
|
||||
}
|
||||
System.out.println(" " + state + " " + category + " ");
|
||||
System.out.print(" " + RBBIDataWrapper.intToString(text.getIndex(), 5));
|
||||
System.out.print(RBBIDataWrapper.intToHexString(c, 10));
|
||||
System.out.println(RBBIDataWrapper.intToString(state,7) + RBBIDataWrapper.intToString(category,6));
|
||||
}
|
||||
|
||||
// State Transition - move machine to its next state
|
||||
//
|
||||
assert(category < fRData.fHeader.fCatCount);
|
||||
state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] == -1) {
|
||||
// Match found, common case, could have lookahead so we move
|
||||
// on to check it
|
||||
result = fText.getIndex();
|
||||
}
|
||||
|
||||
|
||||
int completedRule = stateTable[row + RBBIDataWrapper.ACCEPTING];
|
||||
if (completedRule > 0) {
|
||||
// Lookahead match is completed.
|
||||
int lookaheadResult = fLookAheadMatches.getPosition(completedRule);
|
||||
if (lookaheadResult >= 0) {
|
||||
result = lookaheadResult;
|
||||
break mainLoop;
|
||||
}
|
||||
}
|
||||
int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
|
||||
if (rule != 0) {
|
||||
// At the position of a '/' in a look-ahead match. Record it.
|
||||
int pos = fText.getIndex();
|
||||
fLookAheadMatches.setPosition(rule, pos);
|
||||
}
|
||||
row = fRData.getRowIndex(state);
|
||||
|
||||
if (state == STOP_STATE) {
|
||||
// Normal loop exit is here
|
||||
break mainLoop;
|
||||
// This is the normal exit from the lookup state machine.
|
||||
// Transition to state zero means we have found a safe point.
|
||||
break;
|
||||
}
|
||||
|
||||
// then move iterator position backwards one character
|
||||
//
|
||||
if (mode == RBBI_RUN) {
|
||||
c = previous32(fText);
|
||||
} else {
|
||||
if (mode == RBBI_START) {
|
||||
mode = RBBI_RUN;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // End of the main loop.
|
||||
|
||||
// The state machine is done. Check whether it found a match...
|
||||
//
|
||||
// If the iterator failed to move in the match engine, force it back by one code point.
|
||||
// (This really indicates a defect in the break rules. They should always match
|
||||
// at least one character.)
|
||||
if (result == initialPosition) {
|
||||
CISetIndex32(fText, initialPosition);
|
||||
previous32(fText);
|
||||
result = fText.getIndex();
|
||||
}
|
||||
|
||||
// The state machine is done.
|
||||
result = text.getIndex();
|
||||
if (TRACE) {
|
||||
System.out.println("Result = " + result);
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -1493,11 +1421,26 @@ class BreakCache {
|
|||
if ((position < fBoundaries[fStartBufIdx] - 15) || position > (fBoundaries[fEndBufIdx] + 15)) {
|
||||
int aBoundary = fText.getBeginIndex();
|
||||
int ruleStatusIndex = 0;
|
||||
// TODO: check for position == length of text. Although may still need to back up to get rule status.
|
||||
if (position > aBoundary + 20) {
|
||||
int backupPos = handlePrevious(position);
|
||||
fPosition = backupPos;
|
||||
aBoundary = handleNext(); // Ignore dictionary, just finding a rule based boundary.
|
||||
int backupPos = handleSafePrevious(position);
|
||||
if (backupPos > aBoundary) {
|
||||
// Advance to the boundary following the backup position.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
//
|
||||
fPosition = backupPos;
|
||||
aBoundary = handleNext();
|
||||
if (aBoundary == backupPos + 1 ||
|
||||
(aBoundary == backupPos + 2 &&
|
||||
Character.isHighSurrogate(fText.setIndex(backupPos)) &&
|
||||
Character.isLowSurrogate(fText.next()))) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
// Safe rules identify safe pairs.
|
||||
aBoundary = handleNext();
|
||||
}
|
||||
}
|
||||
ruleStatusIndex = fRuleStatusIndex;
|
||||
}
|
||||
reset(aBoundary, ruleStatusIndex); // Reset cache to hold aBoundary as a single starting point.
|
||||
|
@ -1628,21 +1571,34 @@ class BreakCache {
|
|||
if (backupPosition <= textBegin) {
|
||||
backupPosition = textBegin;
|
||||
} else {
|
||||
backupPosition = handlePrevious(backupPosition);
|
||||
backupPosition = handleSafePrevious(backupPosition);
|
||||
}
|
||||
if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
|
||||
position = textBegin;
|
||||
positionStatusIdx = 0;
|
||||
} else {
|
||||
// Advance to the boundary following the backup position.
|
||||
// There is a complication: the safe reverse rules identify pairs of code points
|
||||
// that are safe. If advancing from the safe point moves forwards by less than
|
||||
// two code points, we need to advance one more time to ensure that the boundary
|
||||
// is good, including a correct rules status value.
|
||||
//
|
||||
fPosition = backupPosition; // TODO: pass starting position in a clearer way.
|
||||
position = handleNext();
|
||||
if (position == backupPosition + 1 ||
|
||||
(position == backupPosition + 2 &&
|
||||
Character.isHighSurrogate(fText.setIndex(backupPosition)) &&
|
||||
Character.isLowSurrogate(fText.next()))) {
|
||||
// The initial handleNext() only advanced by a single code point. Go again.
|
||||
// Safe rules identify safe pairs.
|
||||
position = handleNext();
|
||||
}
|
||||
positionStatusIdx = fRuleStatusIndex;
|
||||
|
||||
}
|
||||
} while (position >= fromPosition);
|
||||
|
||||
// Find boundaries between the one we just located and the first already-cached boundary
|
||||
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer..
|
||||
// Put them in a side buffer, because we don't yet know where they will fall in the circular cache buffer.
|
||||
|
||||
fSideBuffer.removeAllElements();
|
||||
fSideBuffer.push(position);
|
||||
|
|
Loading…
Add table
Reference in a new issue