mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-14 09:21:03 +00:00
ICU-3295 rbbi rt port to Java.
X-SVN-Rev: 15241
This commit is contained in:
parent
51f4d6a8a2
commit
f4f77062d8
3 changed files with 64 additions and 49 deletions
|
@ -21,23 +21,22 @@ import com.ibm.icu.impl.CharTrie;
|
|||
* <p>Internal class used for Rule Based Break Iterators</p>
|
||||
* <p>This class provides access to the compiled break rule data, as
|
||||
* it is stored in a .brk file.
|
||||
* @internal
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
public class RBBIDataWrapper {
|
||||
class RBBIDataWrapper {
|
||||
//
|
||||
// These fields are the ready-to-use compiled rule data, as
|
||||
// read from the file.
|
||||
//
|
||||
public RBBIDataHeader fHeader;
|
||||
public short fFTable[];
|
||||
public short fRTable[];
|
||||
public short fSFTable[];
|
||||
public short fSRTable[];
|
||||
public CharTrie fTrie;
|
||||
public String fRuleSource;
|
||||
public int fStatusTable[];
|
||||
RBBIDataHeader fHeader;
|
||||
short fFTable[];
|
||||
short fRTable[];
|
||||
short fSFTable[];
|
||||
short fSRTable[];
|
||||
CharTrie fTrie;
|
||||
String fRuleSource;
|
||||
int fStatusTable[];
|
||||
|
||||
// Index offsets to the fields in a state table row.
|
||||
// Corresponds to struct RBBIStateTableRow in the C version.
|
||||
|
@ -148,7 +147,7 @@ public class RBBIDataWrapper {
|
|||
RBBIDataWrapper This = new RBBIDataWrapper();
|
||||
|
||||
// Seek past the ICU data header.
|
||||
// TODO: verify that it looks good.
|
||||
// TODO: verify that the header looks good.
|
||||
dis.skip(0x80);
|
||||
|
||||
// Read in the RBBI data header...
|
||||
|
@ -295,7 +294,9 @@ public class RBBIDataWrapper {
|
|||
|
||||
|
||||
|
||||
/** Debug function to display the break iterator data. */
|
||||
/** Debug function to display the break iterator data.
|
||||
* @internal
|
||||
*/
|
||||
void dump() {
|
||||
System.out.println("RBBI Data Wrapper dump ...");
|
||||
System.out.println();
|
||||
|
@ -314,7 +315,6 @@ public class RBBIDataWrapper {
|
|||
}
|
||||
|
||||
/** Fixed width int-to-string conversion.
|
||||
* TODO: there must be an easy built-in way to do this
|
||||
* @internal
|
||||
*
|
||||
*/
|
||||
|
@ -328,7 +328,6 @@ public class RBBIDataWrapper {
|
|||
}
|
||||
|
||||
/** Fixed width int-to-string conversion.
|
||||
* TODO: there must be an easy built-in way to do this
|
||||
* @internal
|
||||
*
|
||||
*/
|
||||
|
|
|
@ -13,6 +13,7 @@ import java.io.IOException;
|
|||
|
||||
/**
|
||||
* <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
|
||||
public class RuleBasedBreakIterator extends BreakIterator {
|
||||
|
@ -57,10 +58,10 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
* produced by the ICU4C tool "genbrk".
|
||||
* @return A RuleBasedBreakIterator based on the supplied break rules.
|
||||
* @throws IOException
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
|
||||
// TODO:
|
||||
return null;
|
||||
return RuleBasedBreakIterator_New.getInstanceFromCompiledRules(is);
|
||||
}
|
||||
|
||||
|
||||
|
@ -114,24 +115,33 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
/** Tag value for "words" that do not fit into any of the other categories.
|
||||
* Includes spaces and most punctuation. */
|
||||
public static final int WORD_NONE = 0;
|
||||
/** Upper bound for tags for uncategorized words. */
|
||||
/** Upper bound for tags for uncategorized words.
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_NONE_LIMIT = 100;
|
||||
/** Tag value for words that appear to be numbers, lower limit. */
|
||||
/** Tag value for words that appear to be numbers, lower limit.
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_NUMBER = 100;
|
||||
/** Tag value for words that appear to be numbers, upper limit. */
|
||||
/** Tag value for words that appear to be numbers, upper limit.
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_NUMBER_LIMIT = 200;
|
||||
/** Tag value for words that contain letters, excluding
|
||||
* hiragana, katakana or ideographic characters, lower limit. */
|
||||
* hiragana, katakana or ideographic characters, lower limit.
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_LETTER = 200;
|
||||
/** Tag value for words containing letters, upper limit */
|
||||
/** Tag value for words containing letters, upper limit
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_LETTER_LIMIT = 300;
|
||||
/** Tag value for words containing kana characters, lower limit */
|
||||
/** Tag value for words containing kana characters, lower limit
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_KANA = 300;
|
||||
/** Tag value for words containing kana characters, upper limit */
|
||||
/** Tag value for words containing kana characters, upper limit
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_KANA_LIMIT = 400;
|
||||
/** Tag value for words containing ideographic characters, lower limit */
|
||||
/** Tag value for words containing ideographic characters, lower limit
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_IDEO = 400;
|
||||
/** Tag value for words containing ideographic characters, upper limit */
|
||||
/** Tag value for words containing ideographic characters, upper limit
|
||||
* @draft ICU 3.0 */
|
||||
public static final int WORD_IDEO_LIMIT = 500;
|
||||
|
||||
//=======================================================================
|
||||
|
|
|
@ -13,6 +13,7 @@ import java.io.InputStream;
|
|||
|
||||
/**
|
||||
* Rule Based Break Iterator implementation.
|
||||
* @internal
|
||||
*/
|
||||
public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
|
||||
|
@ -133,6 +134,11 @@ public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
|||
//=======================================================================
|
||||
// Constructors & Factories
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Create a break iterator from a precompiled set of rules.
|
||||
* @internal
|
||||
*/
|
||||
public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is) throws IOException {
|
||||
RuleBasedBreakIterator_New This = new RuleBasedBreakIterator_New();
|
||||
This.fRData = RBBIDataWrapper.get(is);
|
||||
|
@ -506,28 +512,6 @@ public int current() {
|
|||
|
||||
|
||||
|
||||
/**
|
||||
* Return the status tag from the break rule that determined the most recently
|
||||
* returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. For rules that do not specify a
|
||||
* status, a default value of 0 is returned. If more than one rule applies,
|
||||
* the numerically largest of the possible status values is returned.
|
||||
* <p>
|
||||
* Of the standard types of ICU break iterators, only the word break
|
||||
* iterator provides status values. The values are defined in
|
||||
* <code>enum UWordBreak</code>, and allow distinguishing between words
|
||||
* that contain alphabetic letters, "words" that appear to be numbers,
|
||||
* punctuation and spaces, words containing ideographic characters, and
|
||||
* more. Call <code>getRuleStatus</code> after obtaining a boundary
|
||||
* position from <code>next()</code>, <code>previous()</code>, or
|
||||
* any other break iterator function that returns a boundary position.
|
||||
* <p>
|
||||
* @return the status from the break rule that determined the most recently
|
||||
* returned break position.
|
||||
*
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
|
||||
private void makeRuleStatusValid() {
|
||||
if (fLastStatusIndexValid == false) {
|
||||
// No cached status is available.
|
||||
|
@ -553,6 +537,28 @@ private void makeRuleStatusValid() {
|
|||
|
||||
|
||||
|
||||
/**
|
||||
* Return the status tag from the break rule that determined the most recently
|
||||
* returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. For rules that do not specify a
|
||||
* status, a default value of 0 is returned. If more than one rule applies,
|
||||
* the numerically largest of the possible status values is returned.
|
||||
* <p>
|
||||
* Of the standard types of ICU break iterators, only the word break
|
||||
* iterator provides status values. The values are defined in
|
||||
* <code>enum UWordBreak</code>, and allow distinguishing between words
|
||||
* that contain alphabetic letters, "words" that appear to be numbers,
|
||||
* punctuation and spaces, words containing ideographic characters, and
|
||||
* more. Call <code>getRuleStatus</code> after obtaining a boundary
|
||||
* position from <code>next()</code>, <code>previous()</code>, or
|
||||
* any other break iterator function that returns a boundary position.
|
||||
* <p>
|
||||
* @return the status from the break rule that determined the most recently
|
||||
* returned break position.
|
||||
*
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
|
||||
public int getRuleStatus() {
|
||||
makeRuleStatusValid();
|
||||
// Status records have this form:
|
||||
|
@ -943,7 +949,7 @@ public int getRuleStatusVec(int[] fillInArray) {
|
|||
if (stateTable[row + RBBIDataWrapper.ACCEPTING] != lookaheadStatus) {
|
||||
// TODO: handle this case of overlapping lookahead matches.
|
||||
// With correctly written rules, we won't get here.
|
||||
System.out.println("Trouble in handlePrevious()"); // comment out
|
||||
// System.out.println("Trouble in handlePrevious()");
|
||||
}
|
||||
result = lookaheadResult;
|
||||
fLastRuleStatusIndex = lookaheadTagIdx;
|
||||
|
|
Loading…
Add table
Reference in a new issue