mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-16 02:07:15 +00:00
ICU-3295 rbbi rt port to Java. Stubs for new classes.
X-SVN-Rev: 14935
This commit is contained in:
parent
6a69915b58
commit
a7fafffd6e
6 changed files with 3633 additions and 3088 deletions
|
@ -20,32 +20,32 @@ public class WriteTablesToFiles {
|
|||
String suffix = (littleEndian ? "LE" : "BE");
|
||||
|
||||
bi = BreakIterator.getCharacterInstance();
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
((RuleBasedBreakIterator_Old)bi).writeTablesToFile(new FileOutputStream(
|
||||
"char" + suffix + ".brk"), littleEndian);
|
||||
|
||||
bi = BreakIterator.getWordInstance();
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
((RuleBasedBreakIterator_Old)bi).writeTablesToFile(new FileOutputStream(
|
||||
"word" + suffix + ".brk"), littleEndian);
|
||||
|
||||
bi = BreakIterator.getLineInstance();
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
((RuleBasedBreakIterator_Old)bi).writeTablesToFile(new FileOutputStream(
|
||||
"line" + suffix + ".brk"), littleEndian);
|
||||
|
||||
bi = BreakIterator.getSentenceInstance();
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
((RuleBasedBreakIterator_Old)bi).writeTablesToFile(new FileOutputStream(
|
||||
"sent" + suffix + ".brk"), littleEndian);
|
||||
|
||||
bi = BreakIterator.getTitleInstance();
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
((RuleBasedBreakIterator_Old)bi).writeTablesToFile(new FileOutputStream(
|
||||
"title" + suffix + ".brk"), littleEndian);
|
||||
|
||||
java.util.Locale thai = new java.util.Locale("th", "", "");
|
||||
bi = BreakIterator.getWordInstance(thai);
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
((RuleBasedBreakIterator_Old)bi).writeTablesToFile(new FileOutputStream(
|
||||
"word_th" + suffix + ".brk"), littleEndian);
|
||||
|
||||
bi = BreakIterator.getLineInstance(thai);
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
((RuleBasedBreakIterator_Old)bi).writeTablesToFile(new FileOutputStream(
|
||||
"line_th" + suffix + ".brk"), littleEndian);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -97,7 +97,7 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
|||
String[] classNames = bundle.getStringArray("BreakIteratorClasses");
|
||||
String rules = bundle.getString(rulesName);
|
||||
if (classNames[kind].equals("RuleBasedBreakIterator")) {
|
||||
iter = new RuleBasedBreakIterator(rules);
|
||||
iter = new RuleBasedBreakIterator_Old(rules);
|
||||
}
|
||||
else if (classNames[kind].equals("DictionaryBasedBreakIterator")) {
|
||||
try {
|
||||
|
@ -121,7 +121,7 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
|||
// in our current tests.
|
||||
///CLOVER:OFF
|
||||
if (iter == null) {
|
||||
iter = new RuleBasedBreakIterator(rules);
|
||||
iter = new RuleBasedBreakIterator_Old(rules);
|
||||
}
|
||||
///CLOVER:ON
|
||||
}
|
||||
|
|
|
@ -17,16 +17,16 @@ import java.io.IOException;
|
|||
import java.io.*;
|
||||
|
||||
/**
|
||||
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
|
||||
* A subclass of RuleBasedBreakIterator_Old that adds the ability to use a dictionary
|
||||
* to further subdivide ranges of text beyond what is possible using just the
|
||||
* state-table-based algorithm. This is necessary, for example, to handle
|
||||
* word and line breaking in Thai, which doesn't use spaces between words. The
|
||||
* state-table-based algorithm used by RuleBasedBreakIterator is used to divide
|
||||
* state-table-based algorithm used by RuleBasedBreakIterator_Old is used to divide
|
||||
* up text as far as possible, and then contiguous ranges of letters are
|
||||
* repeatedly compared against a list of known words (i.e., the dictionary)
|
||||
* to divide them up into words.
|
||||
*
|
||||
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
|
||||
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator_Old,
|
||||
* but adds one more special substitution name: _dictionary_. This substitution
|
||||
* name is used to identify characters in words in the dictionary. The idea is that
|
||||
* if the iterator passes over a chunk of text that includes two or more characters
|
||||
|
@ -41,7 +41,7 @@ import java.io.*;
|
|||
*
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
|
||||
public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator_Old {
|
||||
|
||||
/**
|
||||
* a list of known words that is used to divide up contiguous ranges of letters,
|
||||
|
@ -83,9 +83,9 @@ public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
|
|||
|
||||
/**
|
||||
* Constructs a DictionaryBasedBreakIterator.
|
||||
* @param description Same as the description parameter on RuleBasedBreakIterator,
|
||||
* @param description Same as the description parameter on RuleBasedBreakIterator_Old,
|
||||
* except for the special meaning of DICTIONARY_VAR. This parameter is just
|
||||
* passed through to RuleBasedBreakIterator's constructor.
|
||||
* passed through to RuleBasedBreakIterator_Old's constructor.
|
||||
* @param dictionaryStream the stream containing the dictionary data
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
|
@ -97,11 +97,11 @@ public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
|
|||
|
||||
/**
|
||||
* Returns a Builder that is customized to build a DictionaryBasedBreakIterator.
|
||||
* This is the same as RuleBasedBreakIterator.Builder, except for the extra code
|
||||
* This is the same as RuleBasedBreakIterator_Old.Builder, except for the extra code
|
||||
* to handle the DICTIONARY_VAR tag.
|
||||
* @internal
|
||||
*/
|
||||
protected RuleBasedBreakIterator.Builder makeBuilder() {
|
||||
protected RuleBasedBreakIterator_Old.Builder makeBuilder() {
|
||||
return new Builder();
|
||||
}
|
||||
|
||||
|
@ -313,7 +313,7 @@ switch (categoryFlags.length % 4) {
|
|||
// categories represented in the dictionary. If it is, bump the dictionary-
|
||||
// character count.
|
||||
int result = super.lookupCategory(c);
|
||||
if (result != RuleBasedBreakIterator.IGNORE && categoryFlags[result]) {
|
||||
if (result != RuleBasedBreakIterator_Old.IGNORE && categoryFlags[result]) {
|
||||
++dictionaryCharCount;
|
||||
}
|
||||
return result;
|
||||
|
@ -514,11 +514,11 @@ switch (categoryFlags.length % 4) {
|
|||
|
||||
/**
|
||||
* The Builder class for DictionaryBasedBreakIterator inherits almost all of
|
||||
* its functionality from the Builder class for RuleBasedBreakIterator, but
|
||||
* its functionality from the Builder class for RuleBasedBreakIterator_Old, but
|
||||
* extends it with extra logic to handle the DICTIONARY_VAR token
|
||||
* @internal
|
||||
*/
|
||||
protected class Builder extends RuleBasedBreakIterator.Builder {
|
||||
protected class Builder extends RuleBasedBreakIterator_Old.Builder {
|
||||
|
||||
/**
|
||||
* A UnicodeSet that contains all the characters represented in the dictionary
|
||||
|
|
File diff suppressed because it is too large
Load diff
234
icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java
Normal file
234
icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_New.java
Normal file
|
@ -0,0 +1,234 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2004 International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.text.StringCharacterIterator;
|
||||
|
||||
/**
|
||||
* @author andy
|
||||
*
|
||||
* To change the template for this generated type comment go to
|
||||
* Window - Preferences - Java - Code Generation - Code and Comments
|
||||
*/
|
||||
public class RuleBasedBreakIterator_New extends RuleBasedBreakIterator {
|
||||
//=======================================================================
|
||||
// boilerplate
|
||||
//=======================================================================
|
||||
/**
|
||||
* Clones this iterator.
|
||||
* @return A newly-constructed RuleBasedBreakIterator with the same
|
||||
* behavior as this one.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public Object clone()
|
||||
{
|
||||
RuleBasedBreakIterator_New result = (RuleBasedBreakIterator_New) super.clone();
|
||||
// TODO: real clone code
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if both BreakIterators are of the same class, have the same
|
||||
* rules, and iterate over the same text.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public boolean equals(Object that) {
|
||||
return false; // TODO:
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the description used to create this iterator
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public String toString() {
|
||||
return ""; // TODO:
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute a hashcode for this BreakIterator
|
||||
* @return A hash code
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int hashCode()
|
||||
{
|
||||
return 0; // TODO
|
||||
}
|
||||
|
||||
//=======================================================================
|
||||
// BreakIterator overrides
|
||||
//=======================================================================
|
||||
|
||||
/**
|
||||
* Sets the current iteration position to the beginning of the text.
|
||||
* (i.e., the CharacterIterator's starting offset).
|
||||
* @return The offset of the beginning of the text.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int first() {
|
||||
return 0; // TODO;
|
||||
}
|
||||
/**
|
||||
* Sets the current iteration position to the end of the text.
|
||||
* (i.e., the CharacterIterator's ending offset).
|
||||
* @return The text's past-the-end offset.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int last() {
|
||||
return 0; // TODO:
|
||||
}
|
||||
/**
|
||||
* Advances the iterator either forward or backward the specified number of steps.
|
||||
* Negative values move backward, and positive values move forward. This is
|
||||
* equivalent to repeatedly calling next() or previous().
|
||||
* @param n The number of steps to move. The sign indicates the direction
|
||||
* (negative is backwards, and positive is forwards).
|
||||
* @return The character offset of the boundary position n boundaries away from
|
||||
* the current one.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int next(int n) {
|
||||
return 0; // TODO:
|
||||
}
|
||||
/**
|
||||
* Advances the iterator to the next boundary position.
|
||||
* @return The position of the first boundary after this one.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int next() {
|
||||
return 0; // TODO:
|
||||
}
|
||||
/**
|
||||
* Advances the iterator backwards, to the last boundary preceding this one.
|
||||
* @return The position of the last boundary position preceding this one.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int previous() {
|
||||
return 0; // TODO:
|
||||
}
|
||||
/**
|
||||
* Sets the iterator to refer to the first boundary position following
|
||||
* the specified position.
|
||||
* @param offset The position from which to begin searching for a break position.
|
||||
* @return The position of the first break after the current position.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int following(int offset) {
|
||||
return 0; // TODO:
|
||||
}
|
||||
/**
|
||||
* Sets the iterator to refer to the last boundary position before the
|
||||
* specified position.
|
||||
* @param offset The position to begin searching for a break from.
|
||||
* @return The position of the last boundary before the starting position.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int preceding(int offset) {
|
||||
return 0; // TODO:
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if the specfied position is a boundary position. As a side
|
||||
* effect, leaves the iterator pointing to the first boundary position at
|
||||
* or after "offset".
|
||||
* @param offset the offset to check.
|
||||
* @return True if "offset" is a boundary position.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public boolean isBoundary(int offset) {
|
||||
return true; // TODO:
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the current iteration position.
|
||||
* @return The current iteration position.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public int current() {
|
||||
return 0; // TODO:
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Return the status tag from the break rule that determined the most recently
|
||||
* returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. For rules that do not specify a
|
||||
* status, a default value of 0 is returned. If more than one rule applies,
|
||||
* the numerically largest of the possible status values is returned.
|
||||
* <p>
|
||||
* Of the standard types of ICU break iterators, only the word break
|
||||
* iterator provides status values. The values are defined in
|
||||
* <code>enum UWordBreak</code>, and allow distinguishing between words
|
||||
* that contain alphabetic letters, "words" that appear to be numbers,
|
||||
* punctuation and spaces, words containing ideographic characters, and
|
||||
* more. Call <code>getRuleStatus</code> after obtaining a boundary
|
||||
* position from <code>next()<code>, <code>previous()</code>, or
|
||||
* any other break iterator functions that returns a boundary position.
|
||||
* <p>
|
||||
* @return the status from the break rule that determined the most recently
|
||||
* returned break position.
|
||||
*
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
public int getRuleStatus() {
|
||||
return 0; // TODO:
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Get the status (tag) values from the break rule(s) that determined the most
|
||||
* recently returned break position. The values appear in the rule source
|
||||
* within brackets, {123}, for example. The default status value for rules
|
||||
* that do not explicitly provide one is zero.
|
||||
* <p>
|
||||
* For word break iterators, the possible values are defined in enum UWordBreak.
|
||||
* <p>
|
||||
* If the size of the output array is insufficient to hold the data,
|
||||
* the output will be truncated to the available length. No exception
|
||||
* will be thrown.
|
||||
*
|
||||
* @param fillInArray an array to be filled in with the status values.
|
||||
* @return The number of rule status values from rules that determined
|
||||
* the most recent boundary returned by the break iterator.
|
||||
* In the event that the array is too small, the return value
|
||||
* is the total number of status values that were available,
|
||||
* not the reduced number that were actually returned.
|
||||
* @draft ICU 3.0
|
||||
*/
|
||||
public int getRuleStatusVec(int[] fillInArray) {
|
||||
if (fillInArray != null && fillInArray.length >= 1) { // TODO:
|
||||
fillInArray[0] = 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a CharacterIterator over the text being analyzed. This version
|
||||
* of this method returns the actual CharacterIterator we're using internally.
|
||||
* Changing the state of this iterator can have undefined consequences. If
|
||||
* you need to change it, clone it first.
|
||||
* @return An iterator over the text being analyzed.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public CharacterIterator getText() {
|
||||
return new StringCharacterIterator("");
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Set the iterator to analyze a new piece of text. This function resets
|
||||
* the current iteration position to the beginning of the text.
|
||||
* @param newText An iterator over the text to analyze.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public void setText(CharacterIterator newText) {
|
||||
}
|
||||
|
||||
}
|
3255
icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_Old.java
Normal file
3255
icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator_Old.java
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue