ICU-3295 rbbi rt port to Java.

X-SVN-Rev: 14986
This commit is contained in:
Andy Heninger 2004-04-16 01:15:04 +00:00
parent c957f85632
commit 77136a5b24
3 changed files with 513 additions and 30 deletions

View file

@ -219,7 +219,7 @@ RuleBasedBreakIterator::operator==(const BreakIterator& that) const {
const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that;
if (fText == that2.fText ||
(fText != NULL && that2.fText != NULL && *that2.fText == *fText)) {
if (that2.fData == fData ||
if (that2.fData == fData ||
(fData != NULL && that2.fData != NULL && *that2.fData == *fData)) {
r = TRUE;
}
@ -475,12 +475,12 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
// otherwise, set our internal iteration position (temporarily)
// to the position passed in. If this is the _beginning_ position,
// then we can just use next() to get our return value
int32_t result = 0;
if (fData->fSafeRevTable != NULL) {
// new rule syntax
/// todo synwee
/// todo synwee
fText->setIndex(offset);
// move forward one codepoint to prepare for moving back to a
// safe point.
@ -500,9 +500,9 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
fText->previous32();
// handle next will give result >= offset
handleNext(fData->fSafeFwdTable);
// previous will give result 0 or 1 boundary away from offset,
// previous will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
// we have to
int32_t oldresult = previous();
while (oldresult > offset) {
int32_t result = previous();
@ -584,9 +584,9 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
// handle previous will give result <= offset
handlePrevious(fData->fSafeRevTable);
// next will give result 0 or 1 boundary away from offset,
// next will give result 0 or 1 boundary away from offset,
// most of the time
// we have to
// we have to
int32_t oldresult = next();
while (oldresult < offset) {
int32_t result = next();
@ -779,8 +779,8 @@ int32_t RuleBasedBreakIterator::handleNext(const RBBIStateTable *statetable) {
}
if (row->fLookAhead != 0) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
@ -817,7 +817,7 @@ continueOn:
// We have advanced through the string until it is certain that no
// longer match is possible, no matter what characters follow.
break;
}
}
}
// The state machine is done. Check whether it found a match...
@ -839,6 +839,13 @@ continueOn:
return result;
}
//----------------------------------------------------------------
//
// handlePrevious(void) This is the variant used with old style rules
// (Overshoot to a safe point, then move forward)
//
//----------------------------------------------------------------
int32_t RuleBasedBreakIterator::handlePrevious(void) {
if (fText == NULL || fData == NULL) {
return 0;
@ -991,10 +998,10 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
int32_t state = START_STATE;
int32_t category;
int32_t lastCategory = 0;
UBool hasPassedStartText = !fText->hasPrevious();
UBool hasPassedStartText = !fText->hasPrevious();
UChar32 c = fText->previous32();
// previous character
int32_t result = fText->getIndex();
int32_t result = fText->getIndex();
int32_t lookaheadStatus = 0;
int32_t lookaheadResult = 0;
int32_t lookaheadTagIdx = 0;
@ -1017,7 +1024,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
// loop until we reach the beginning of the text or transition to state 0
for (;;) {
// if (c == CharacterIterator::DONE && fText->hasPrevious()==FALSE) {
if (hasPassedStartText) {
if (hasPassedStartText) {
// if we have already considered the start of the text
if (row->fLookAhead != 0 && lookaheadResult == 0) {
result = 0;
@ -1052,7 +1059,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
state = row->fNextState[category];
row = (RBBIStateTableRow *)
(statetable->fTableData + (state * statetable->fRowLen));
if (row->fAccepting == -1) {
// Match found, common case, could have lookahead so we move on to check it
result = fText->getIndex();
@ -1061,8 +1068,8 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
}
if (row->fLookAhead != 0) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
if (lookaheadStatus != 0
&& row->fAccepting == lookaheadStatus) {
// Lookahead match is completed. Set the result accordingly, but only
// if no other rule has matched further in the mean time.
result = lookaheadResult;
@ -1085,7 +1092,7 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
}
category = lastCategory;
fText->setIndex(result);
goto continueOn;
}
@ -1105,12 +1112,12 @@ int32_t RuleBasedBreakIterator::handlePrevious(const RBBIStateTable *statetable)
lookaheadStatus = 0; // clear out any pending look-ahead matches.
continueOn:
if (state == STOP_STATE) {
if (state == STOP_STATE) {
break;
}
// then advance one character backwards
hasPassedStartText = !fText->hasPrevious();
hasPassedStartText = !fText->hasPrevious();
c = fText->previous32();
}
@ -1186,7 +1193,7 @@ int32_t RuleBasedBreakIterator::getRuleStatus() const {
int32_t RuleBasedBreakIterator::getRuleStatusVec(
int32_t *fillInVec, int32_t capacity, UErrorCode &status)
int32_t *fillInVec, int32_t capacity, UErrorCode &status)
{
if (U_FAILURE(status)) {
return 0;
@ -1197,7 +1204,7 @@ int32_t RuleBasedBreakIterator::getRuleStatusVec(
int32_t numVals = fData->fRuleStatusTable[fLastRuleStatusIndex];
int32_t numValsToCopy = numVals;
if (numVals > capacity) {
status = U_BUFFER_OVERFLOW_ERROR;
status = U_BUFFER_OVERFLOW_ERROR;
numValsToCopy = capacity;
}
int i;

View file

@ -0,0 +1,284 @@
/**
*******************************************************************************
* Copyright (C) 1996-2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.InputStream;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.Locale;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.icu.util.VersionInfo;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.impl.ICUData;
import com.ibm.icu.impl.Trie;
import com.ibm.icu.impl.CharTrie;
/**
* <p>Internal class used for Rule Based Break Iterators</p>
* <p>This class provides access to the compiled break rule data, as
* it is stored in a .brk file.
*
*/
public class RBBIDataWrapper {
    //
    // These fields are the ready-to-use compiled rule data, as
    // read from the file.
    //
    public RBBIDataHeader fHeader;
    public short          fFTable[];      // forward state table
    public short          fRTable[];      // reverse state table
    public short          fSFTable[];     // safe-point forward table; stays null when its length is 0
    public short          fSRTable[];     // safe-point reverse table; stays null when its length is 0
    public CharTrie       fTrie;
    public String         fRuleSource;
    public int            fStatusTable[];

    // Message used for every structural inconsistency found while parsing.
    private static final String CORRUPT_DATA = "Break iterator Rule data corrupt";

    //
    // Data Header.  A struct-like class with the fields from the RBBI data file header.
    //
    static class RBBIDataHeader {
        int fMagic;            //  == 0xb1a0
        int fVersion;          //  == 1
        int fLength;           //  Total length in bytes of this RBBI Data,
                               //     including all sections, not just the header.
        int fCatCount;         //  Number of character categories.

        //
        //  Offsets and sizes of each of the subsections within the RBBI data.
        //  All offsets are bytes from the start of the RBBIDataHeader.
        //  All sizes are in bytes.
        //
        int fFTable;           //  forward state transition table.
        int fFTableLen;
        int fRTable;           //  Offset to the reverse state transition table.
        int fRTableLen;
        int fSFTable;          //  safe point forward transition table
        int fSFTableLen;
        int fSRTable;          //  safe point reverse transition table
        int fSRTableLen;
        int fTrie;             //  Offset to Trie data for character categories
        int fTrieLen;
        int fRuleSource;       //  Offset to the source for the break
        int fRuleSourceLen;    //     rules.  Stored UChar *.
        int fStatusTable;      //  Offset to the table of rule status values
        int fStatusTableLen;

        public RBBIDataHeader() {
            fMagic = 0;
        }
    }

    // Folding-offset callback handed to the CharTrie constructor.
    static class TrieFoldingFunc implements Trie.DataManipulate {
        public int getFoldingOffset(int data) {
            if ((data & 0x8000) == 0) {
                return data & 0x7fff;
            } else {
                return 0;
            }
        }
    }

    // Must be a real instance: it is passed to the CharTrie constructor in
    // get(InputStream), and leaving it unset would pass null there.
    static TrieFoldingFunc fTrieFoldingFunc = new TrieFoldingFunc();

    RBBIDataWrapper() {
    }

    /**
     * Skips exactly <code>count</code> bytes.  InputStream.skip() is allowed
     * to skip fewer bytes than requested, which would silently leave every
     * later read misaligned, so keep going until the full count is consumed.
     *
     * @throws IOException if count is negative (sections out of order) or
     *                     the stream ends before count bytes were skipped.
     */
    private static void skipFully(DataInputStream dis, int count) throws IOException {
        if (count < 0) {
            throw new IOException(CORRUPT_DATA);
        }
        int remaining = count;
        while (remaining > 0) {
            int skipped = dis.skipBytes(remaining);
            if (skipped <= 0) {
                // No progress reported: consume one byte by reading instead.
                // readByte() throws EOFException at end of stream, so a
                // truncated file can not spin this loop forever.
                dis.readByte();
                skipped = 1;
            }
            remaining -= skipped;
        }
    }

    /**
     * Reads a state table of <code>byteLength</code> bytes as an array of
     * 16 bit values.
     */
    private static short[] readShortTable(DataInputStream dis, int byteLength) throws IOException {
        if (byteLength < 0) {
            throw new IOException(CORRUPT_DATA);
        }
        short[] table = new short[byteLength / 2];
        for (int i = 0; i < table.length; i++) {
            table[i] = dis.readShort();
        }
        return table;
    }

    /**
     * Get an RBBIDataWrapper for the named data item.  The name is looked
     * up beneath "data/".  The stream opened here is closed before returning.
     */
    static RBBIDataWrapper get(String name) throws IOException {
        String      fullName = "data/" + name;
        InputStream is       = ICUData.getRequiredStream(fullName);
        try {
            return get(is);
        } finally {
            is.close();
        }
    }

    /**
     * Get an RBBIDataWrapper from an InputStream onto a pre-compiled set
     * of RBBI rules.  The caller keeps ownership of the stream.
     *
     * @throws IOException if the magic number is wrong, the section offsets
     *                     are inconsistent, or the stream ends early.
     */
    static RBBIDataWrapper get(InputStream is) throws IOException {
        // The trie is read between mark() and reset(); make sure the stream
        // we read from actually supports that.
        InputStream     markable = is.markSupported() ? is : new BufferedInputStream(is);
        DataInputStream dis      = new DataInputStream(markable);
        RBBIDataWrapper This     = new RBBIDataWrapper();

        // Seek past the ICU data header.
        //   TODO:  verify that it looks good.
        skipFully(dis, 0x80);

        // Read in the RBBI data header...
        RBBIDataHeader header = new RBBIDataHeader();
        This.fHeader = header;
        header.fMagic          = dis.readInt();
        header.fVersion        = dis.readInt();
        header.fLength         = dis.readInt();
        header.fCatCount       = dis.readInt();
        header.fFTable         = dis.readInt();
        header.fFTableLen      = dis.readInt();
        header.fRTable         = dis.readInt();
        header.fRTableLen      = dis.readInt();
        header.fSFTable        = dis.readInt();
        header.fSFTableLen     = dis.readInt();
        header.fSRTable        = dis.readInt();
        header.fSRTableLen     = dis.readInt();
        header.fTrie           = dis.readInt();
        header.fTrieLen        = dis.readInt();
        header.fRuleSource     = dis.readInt();
        header.fRuleSourceLen  = dis.readInt();
        header.fStatusTable    = dis.readInt();
        header.fStatusTableLen = dis.readInt();
        skipFully(dis, 6 * 4);    // uint32_t  fReserved[6];

        if (header.fMagic != 0xb1a0) {
            throw new IOException("Break Iterator Rule Data Magic Number Incorrect");
        }

        // Current position in input stream, in bytes from the start of the
        // RBBI header, which has 24 fields, all int32_t (4 bytes).
        int pos = 24 * 4;

        //
        // Read in the Forward state transition table as an array of shorts.
        //
        //   Quick Sanity Check
        if (header.fFTable < pos || header.fFTable > header.fLength) {
            throw new IOException(CORRUPT_DATA);
        }
        skipFully(dis, header.fFTable - pos);      // padding preceding the table
        This.fFTable = readShortTable(dis, header.fFTableLen);
        pos = header.fFTable + This.fFTable.length * 2;

        //
        // Read in the Reverse state table
        //
        skipFully(dis, header.fRTable - pos);
        This.fRTable = readShortTable(dis, header.fRTableLen);
        pos = header.fRTable + This.fRTable.length * 2;

        //
        // Read in the Safe Forward state table (optional)
        //
        if (header.fSFTableLen > 0) {
            skipFully(dis, header.fSFTable - pos);
            This.fSFTable = readShortTable(dis, header.fSFTableLen);
            pos = header.fSFTable + This.fSFTable.length * 2;
        }

        //
        // Read in the Safe Reverse state table (optional)
        //
        if (header.fSRTableLen > 0) {
            skipFully(dis, header.fSRTable - pos);
            This.fSRTable = readShortTable(dis, header.fSRTableLen);
            pos = header.fSRTable + This.fSRTable.length * 2;
        }

        //
        // Unserialize the Character categories TRIE
        //     Because we can't be absolutely certain where the Trie deserialize will
        //     leave the input stream, leave position unchanged.
        //     The seek to the start of the next item following the TRIE will get us
        //     back in sync.
        //
        skipFully(dis, header.fTrie - pos);
        pos = header.fTrie;
        dis.mark(header.fTrieLen + 100);
        This.fTrie = new CharTrie(dis, fTrieFoldingFunc);
        dis.reset();

        //
        // Read the Rule Status Table
        //
        if (pos > header.fStatusTable || header.fStatusTableLen < 0) {
            throw new IOException(CORRUPT_DATA);
        }
        skipFully(dis, header.fStatusTable - pos);
        pos = header.fStatusTable;
        This.fStatusTable = new int[header.fStatusTableLen / 4];
        for (int i = 0; i < This.fStatusTable.length; i++) {
            This.fStatusTable[i] = dis.readInt();
            pos += 4;
        }

        //
        // Put the break rule source into a String
        //
        if (pos > header.fRuleSource || header.fRuleSourceLen < 0) {
            throw new IOException(CORRUPT_DATA);
        }
        skipFully(dis, header.fRuleSource - pos);
        pos = header.fRuleSource;
        StringBuffer sb = new StringBuffer(header.fRuleSourceLen / 2);
        for (int i = 0; i < header.fRuleSourceLen; i += 2) {
            sb.append(dis.readChar());
            pos += 2;
        }
        This.fRuleSource = sb.toString();

        return This;
    }

    /** Debug function to display the break iterator data. */
    void dump() {
        System.out.println("RBBI Data Wrapper dump ...");
        System.out.println("Source Rules: " + fRuleSource);
    }

    /**
     * Debugging entry point: loads the data item named by the first argument
     * (or a built-in default name) and dumps it.
     */
    public static void main(String[] args) {
        String s;
        if (args.length == 0) {
            s = "icudt28b_char.brk";
        } else {
            s = args[0];
        }
        System.out.println("RBBIDataWrapper.main(" + s + ") ");

        try {
            RBBIDataWrapper This = RBBIDataWrapper.get(s);
            This.dump();
        }
        catch (Exception e) {
            System.out.println("Exception: " + e.toString());
        }
    }
}

View file

@ -16,6 +16,42 @@ import java.text.StringCharacterIterator;
* Window - Preferences - Java - Code Generation - Code and Comments
*/
public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
private static final int START_STATE = 1; // The state number of the starting state
private static final int STOP_STATE = 0; // The state-transition value indicating "stop"
/**
* The character iterator through which this BreakIterator accesses the text
* @internal
*/
private CharacterIterator fText;
/**
* The rule data for this BreakIterator instance
* @internal
*/
private RBBIDataWrapper fData;
/** Index of the Rule {tag} values for the most recent match.
* @internal
*/
private int fLastRuleStatusIndex;
/**
* Rule tag value valid flag.
* Some iterator operations don't intrinsically set the correct tag value.
* This flag lets us lazily compute the value if we are ever asked for it.
* @internal
*/
private boolean fLastStatusIndexValid;
/**
* Debugging flag. Trace operation of state machine when true.
* @internal
*/
public static boolean fTrace;
//=======================================================================
// boilerplate
//=======================================================================
@ -42,11 +78,16 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
}
/**
* Returns the description (rules) used to create this iterator.
* (In ICU4C, the same function is RuleBasedBreakIterator::getRules())
* @stable ICU 2.0
*/
public String toString() {
    // The leftover stub "return \"\";" preceded this code and made it
    // unreachable; it has been dropped.
    // The rule source lives in the compiled rule data, so without any data
    // attached there is nothing to report and null is returned.
    String retStr = null;
    if (fData != null) {
        retStr = fData.fRuleSource;
    }
    return retStr;
}
/**
@ -70,8 +111,16 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0
*/
public int first() {
    // The leftover stub "return 0;" preceded this code and made it
    // unreachable; it has been dropped.
    // Reset the rule status to index 0 and mark it valid.
    fLastRuleStatusIndex  = 0;
    fLastStatusIndexValid = true;
    if (fText == null) {
        // No text has been set, so there is no position to move to.
        return BreakIterator.DONE;
    }
    fText.first();
    return fText.getIndex();
}
/**
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
@ -79,8 +128,26 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0
*/
public int last() {
    // The leftover stub "return 0;" preceded this code and made it
    // unreachable; it has been dropped.
    if (fText == null) {
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = true;
        return BreakIterator.DONE;
    }

    // CharacterIterator.last() positions on the last character rather than
    // on the past-the-end offset (so that a loop such as
    //     for (p = it.last(); p != DONE; p = it.previous()) ...
    // works).  A break iterator needs the past-the-end offset, so position
    // on getEndIndex() explicitly instead of calling fText.last().
    fLastStatusIndexValid = false;   // status for this position is not known here
    int pos = fText.getEndIndex();
    fText.setIndex(pos);
    return pos;
}
/**
* Advances the iterator either forward or backward the specified number of steps.
* Negative values move backward, and positive values move forward. This is
@ -92,23 +159,88 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
* @stable ICU 2.0
*/
public int next(int n) {
    // The leftover stub "return 0;" preceded this code and made it
    // unreachable; it has been dropped.
    // With n == 0 neither loop runs and the current position is returned.
    int result = current();
    // Positive n: step forward n boundaries.
    while (n > 0) {
        result = handleNext();
        --n;
    }
    // Negative n: step backward -n boundaries.
    while (n < 0) {
        result = previous();
        ++n;
    }
    return result;
}
/**
* Advances the iterator to the next boundary position.
* @return The position of the first boundary after this one.
* @stable ICU 2.0
*/
public int next() {
    // The leftover stub "return 0;" preceded this call and made it
    // unreachable; it has been dropped.
    // Forward iteration is delegated entirely to handleNext().
    return handleNext();
}
/**
* Moves the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
* @stable ICU 2.0
*/
public int previous() {
    // The leftover stub "return 0;" preceded this code and made it
    // unreachable; it has been dropped.

    // if we're already sitting at the beginning of the text, return DONE
    if (fText == null || current() == fText.getBeginIndex()) {
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = true;
        return BreakIterator.DONE;
    }

    // When either safe-point table is present, the reverse table is run
    // directly.
    if (fData.fSRTable != null || fData.fSFTable != null) {
        return handlePrevious(fData.fRTable);
    }

    // old rule syntax
    // set things up.  handlePrevious() will back us up to some valid
    // break position before the current position (we back our internal
    // iterator up one step to prevent handlePrevious() from returning
    // the current position), but not necessarily the last one before
    // where we started
    int start = current();

    CIPrevious32(fText);
    int     lastResult    = handlePrevious();
    int     lastTag       = 0;
    boolean breakTagValid = false;

    // iterate forward from the known break position until we pass our
    // starting point.  The last break position before the starting
    // point is our return value
    for (;;) {
        int result = handleNext();
        if (result == BreakIterator.DONE || result >= start) {
            break;
        }
        lastResult    = result;
        lastTag       = fLastRuleStatusIndex;
        breakTagValid = true;
    }

    // fLastRuleStatusIndex wants to have the value for the section of text
    // preceding the result position that we are to return (in lastResult.)  If
    // the backwards rules overshot and the above loop had to do two or more
    // handleNext()s to move up to the desired return position, we will have a valid
    // tag value. But, if handlePrevious() took us to exactly the correct result
    // position, we won't have a tag value for that position, which is only set
    // by handleNext().

    // set the current iteration position to be the last break position
    // before where we started, and then return that value
    fText.setIndex(lastResult);
    fLastRuleStatusIndex  = lastTag;       // for use by getRuleStatus()
    fLastStatusIndexValid = breakTagValid;
    return lastResult;
}
/**
* Sets the iterator to refer to the first boundary position following
@ -218,7 +350,7 @@ public int getRuleStatusVec(int[] fillInArray) {
* @stable ICU 2.0
*/
public CharacterIterator getText() {
    // The leftover stub returning an empty StringCharacterIterator preceded
    // this return and made it unreachable; it has been dropped.
    // Hands back the iterator stored in fText itself, not a copy.
    return fText;
}
@ -229,6 +361,66 @@ public int getRuleStatusVec(int[] fillInArray) {
* @stable ICU 2.0
*/
public void setText(CharacterIterator newText) {
    // Adopt the caller's iterator as the text to analyze ...
    this.fText = newText;
    // ... and start iteration from the beginning of it.
    first();
}
/**
 * Advances <code>ci</code> with next() and returns what it finds as a
 * 32 bit code point.  If next() yields a lead surrogate that is followed by
 * a trail surrogate, both are consumed and the supplementary code point
 * they encode is returned.  If a lead surrogate is not followed by a trail
 * surrogate, the iterator is moved back one step and the lead surrogate's
 * own value is returned.
 */
private static int CINext32(CharacterIterator ci) {
    char cLead  = ci.next();
    int  retVal = (int)cLead;
    if (UTF16.isLeadSurrogate(cLead)) {
        char cTrail = ci.next();
        if (UTF16.isTrailSurrogate(cTrail)) {
            // The shift must be parenthesized: in Java '+' binds tighter
            // than '<<', so "a << 10 + b" means "a << (10 + b)".  The
            // 0x10000 term is the start of the supplementary range that
            // surrogate pairs address.
            retVal = (((int)cLead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
                   + ((int)cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
                   + 0x10000;
        } else {
            // Unpaired lead surrogate: un-read the character that followed it.
            ci.previous();
        }
    }
    return retVal;
}
/**
 * Moves <code>ci</code> back with previous() and returns what it finds as a
 * 32 bit code point.  If previous() yields a trail surrogate that is
 * preceded by a lead surrogate, both are consumed and the supplementary
 * code point they encode is returned.  If a trail surrogate is not preceded
 * by a lead surrogate, the iterator is moved forward one step again and the
 * trail surrogate's own value is returned.
 */
private static int CIPrevious32(CharacterIterator ci) {
    char cTrail = ci.previous();
    int  retVal = (int)cTrail;
    if (UTF16.isTrailSurrogate(cTrail)) {
        char cLead = ci.previous();
        if (UTF16.isLeadSurrogate(cLead)) {
            // The shift must be parenthesized: in Java '+' binds tighter
            // than '<<', so "a << 10 + b" means "a << (10 + b)".  The
            // 0x10000 term is the start of the supplementary range that
            // surrogate pairs address.
            retVal = (((int)cLead - UTF16.LEAD_SURROGATE_MIN_VALUE) << 10)
                   + ((int)cTrail - UTF16.TRAIL_SURROGATE_MIN_VALUE)
                   + 0x10000;
        } else {
            // Unpaired trail surrogate: undo the extra step backwards.
            ci.next();
        }
    }
    return retVal;
}
/**
* Internal implementation of next() for RBBI.
* @internal
*/
private int handleNext() {
// TODO: not implemented yet.  This stub ignores the text and the rule data
// and always returns 0.
return 0;
}
// No-argument variant, not implemented yet.  It always returns 0 and never
// moves the text iterator.
private int handlePrevious() {
// TODO:
return 0;
}
// State-table variant, not implemented yet.  The statetable argument is
// ignored and 0 is always returned.
private int handlePrevious(short statetable[]) {
// TODO:
return 0;
}
}