ICU-3295 rbbi rt port to Java.

X-SVN-Rev: 15241
This commit is contained in:
Andy Heninger 2004-05-10 22:51:38 +00:00
parent 51f4d6a8a2
commit f4f77062d8
3 changed files with 64 additions and 49 deletions

View file

@ -21,23 +21,22 @@ import com.ibm.icu.impl.CharTrie;
* <p>Internal class used for Rule Based Break Iterators</p>
* <p>This class provides access to the compiled break rule data, as
* it is stored in a .brk file.</p>
* @internal
*
*/
public class RBBIDataWrapper {
class RBBIDataWrapper {
//
// These fields are the ready-to-use compiled rule data, as
// read from the file.
//
public RBBIDataHeader fHeader;
public short fFTable[];
public short fRTable[];
public short fSFTable[];
public short fSRTable[];
public CharTrie fTrie;
public String fRuleSource;
public int fStatusTable[];
RBBIDataHeader fHeader;
short fFTable[];
short fRTable[];
short fSFTable[];
short fSRTable[];
CharTrie fTrie;
String fRuleSource;
int fStatusTable[];
// Index offsets to the fields in a state table row.
// Corresponds to struct RBBIStateTableRow in the C version.
@ -148,7 +147,7 @@ public class RBBIDataWrapper {
RBBIDataWrapper This = new RBBIDataWrapper();
// Seek past the ICU data header.
// TODO: verify that it looks good.
// TODO: verify that the header looks good.
dis.skip(0x80);
// Read in the RBBI data header...
@ -295,7 +294,9 @@ public class RBBIDataWrapper {
/** Debug function to display the break iterator data. */
/** Debug function to display the break iterator data.
* @internal
*/
void dump() {
System.out.println("RBBI Data Wrapper dump ...");
System.out.println();
@ -314,7 +315,6 @@ public class RBBIDataWrapper {
}
/** Fixed width int-to-string conversion.
* TODO: there must be an easy built-in way to do this
* @internal
*
*/
@ -328,7 +328,6 @@ public class RBBIDataWrapper {
}
/** Fixed width int-to-string conversion.
* TODO: there must be an easy built-in way to do this
* @internal
*
*/

View file

@ -13,6 +13,7 @@ import java.io.IOException;
/**
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
* @stable ICU 2.0
*/
public class RuleBasedBreakIterator extends BreakIterator {
@ -57,10 +58,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
* produced by the ICU4C tool "genbrk".
* @return A RuleBasedBreakIterator based on the supplied break rules.
* @throws IOException
* @draft ICU 3.0
*/
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
// TODO:
return null;
return RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
}
@ -114,24 +115,33 @@ public class RuleBasedBreakIterator extends BreakIterator {
/** Tag value for "words" that do not fit into any of the other categories.
* Includes spaces and most punctuation. */
public static final int WORD_NONE = 0;
/** Upper bound for tags for uncategorized words. */
/** Upper bound for tags for uncategorized words.
* @draft ICU 3.0 */
public static final int WORD_NONE_LIMIT = 100;
/** Tag value for words that appear to be numbers, lower limit. */
/** Tag value for words that appear to be numbers, lower limit.
* @draft ICU 3.0 */
public static final int WORD_NUMBER = 100;
/** Tag value for words that appear to be numbers, upper limit. */
/** Tag value for words that appear to be numbers, upper limit.
* @draft ICU 3.0 */
public static final int WORD_NUMBER_LIMIT = 200;
/** Tag value for words that contain letters, excluding
* hiragana, katakana or ideographic characters, lower limit. */
* hiragana, katakana or ideographic characters, lower limit.
* @draft ICU 3.0 */
public static final int WORD_LETTER = 200;
/** Tag value for words containing letters, upper limit */
/** Tag value for words containing letters, upper limit
* @draft ICU 3.0 */
public static final int WORD_LETTER_LIMIT = 300;
/** Tag value for words containing kana characters, lower limit */
/** Tag value for words containing kana characters, lower limit
* @draft ICU 3.0 */
public static final int WORD_KANA = 300;
/** Tag value for words containing kana characters, upper limit */
/** Tag value for words containing kana characters, upper limit
* @draft ICU 3.0 */
public static final int WORD_KANA_LIMIT = 400;
/** Tag value for words containing ideographic characters, lower limit */
/** Tag value for words containing ideographic characters, lower limit
* @draft ICU 3.0 */
public static final int WORD_IDEO = 400;
/** Tag value for words containing ideographic characters, upper limit */
/** Tag value for words containing ideographic characters, upper limit
* @draft ICU 3.0 */
public static final int WORD_IDEO_LIMIT = 500;
//=======================================================================

View file

@ -13,6 +13,7 @@ import java.io.InputStream;
/**
* Rule Based Break Iterator implementation.
* @internal
*/
public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
@ -133,6 +134,11 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
//=======================================================================
// Constructors & Factories
//=======================================================================
/**
* Create a break iterator from a precompiled set of rules.
* @internal
*/
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
RuleBasedBreakIterator_New This = new RuleBasedBreakIterator_New();
This.fRData = RBBIDataWrapper.get(is);
@ -506,28 +512,6 @@ public int current() {
/**
* Return the status tag from the break rule that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned. If more than one rule applies,
* the numerically largest of the possible status values is returned.
* <p>
* Of the standard types of ICU break iterators, only the word break
* iterator provides status values. The values are defined in
* <code>enum UWordBreak</code>, and allow distinguishing between words
* that contain alphabetic letters, "words" that appear to be numbers,
* punctuation and spaces, words containing ideographic characters, and
* more. Call <code>getRuleStatus</code> after obtaining a boundary
* position from <code>next()</code>, <code>previous()</code>, or
* any other break iterator function that returns a boundary position.
* <p>
* @return the status from the break rule that determined the most recently
* returned break position.
*
* @draft ICU 3.0
*/
private void makeRuleStatusValid() {
if (fLastStatusIndexValid == false) {
// No cached status is available.
@ -553,6 +537,28 @@ private void makeRuleStatusValid() {
/**
* Return the status tag from the break rule that determined the most recently
* returned break position. The values appear in the rule source
* within brackets, {123}, for example. For rules that do not specify a
* status, a default value of 0 is returned. If more than one rule applies,
* the numerically largest of the possible status values is returned.
* <p>
* Of the standard types of ICU break iterators, only the word break
* iterator provides status values. The values are defined in
* <code>enum UWordBreak</code>, and allow distinguishing between words
* that contain alphabetic letters, "words" that appear to be numbers,
* punctuation and spaces, words containing ideographic characters, and
* more. Call <code>getRuleStatus</code> after obtaining a boundary
* position from <code>next()</code>, <code>previous()</code>, or
* any other break iterator function that returns a boundary position.
* <p>
* @return the status from the break rule that determined the most recently
* returned break position.
*
* @draft ICU 3.0
*/
public int getRuleStatus() {
makeRuleStatusValid();
// Status records have this form:
@ -943,7 +949,7 @@ public int getRuleStatusVec(int[] fillInArray) {
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != lookaheadStatus) {
// TODO: handle this case of overlapping lookahead matches.
// With correctly written rules, we won't get here.
System.out.println("Trouble in handlePrevious()"); // comment out
// System.out.println("Trouble in handlePrevious()");
}
result = lookaheadResult;
fLastRuleStatusIndex = lookaheadTagIdx;