mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-10688 Remove break iterator type logic. It's implicit from the rules.
X-SVN-Rev: 40687
This commit is contained in:
parent
ca7b62180e
commit
023e8b289f
10 changed files with 63 additions and 120 deletions
|
@ -29,8 +29,7 @@ U_NAMESPACE_BEGIN
|
|||
******************************************************************
|
||||
*/
|
||||
|
||||
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
|
||||
fTypes = breakTypes;
|
||||
DictionaryBreakEngine::DictionaryBreakEngine() {
|
||||
}
|
||||
|
||||
DictionaryBreakEngine::~DictionaryBreakEngine() {
|
||||
|
@ -45,7 +44,6 @@ int32_t
|
|||
DictionaryBreakEngine::findBreaks( UText *text,
|
||||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
int32_t breakType,
|
||||
UVector32 &foundBreaks ) const {
|
||||
(void)startPos; // TODO: remove this param?
|
||||
int32_t result = 0;
|
||||
|
@ -65,10 +63,8 @@ DictionaryBreakEngine::findBreaks( UText *text,
|
|||
}
|
||||
rangeStart = start;
|
||||
rangeEnd = current;
|
||||
if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
|
||||
utext_setNativeIndex(text, current);
|
||||
}
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
|
||||
utext_setNativeIndex(text, current);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -193,7 +189,7 @@ static const int32_t THAI_MIN_WORD = 2;
|
|||
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
|
||||
|
||||
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
|
||||
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
|
||||
: DictionaryBreakEngine(),
|
||||
fDictionary(adoptDictionary)
|
||||
{
|
||||
fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
|
||||
|
@ -435,7 +431,7 @@ static const int32_t LAO_MIN_WORD = 2;
|
|||
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
|
||||
|
||||
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
|
||||
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
|
||||
: DictionaryBreakEngine(),
|
||||
fDictionary(adoptDictionary)
|
||||
{
|
||||
fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
|
||||
|
@ -631,7 +627,7 @@ static const int32_t BURMESE_MIN_WORD = 2;
|
|||
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
|
||||
|
||||
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
|
||||
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
|
||||
: DictionaryBreakEngine(),
|
||||
fDictionary(adoptDictionary)
|
||||
{
|
||||
fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
|
||||
|
@ -824,7 +820,7 @@ static const int32_t KHMER_MIN_WORD = 2;
|
|||
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
|
||||
|
||||
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
|
||||
: DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
|
||||
: DictionaryBreakEngine(),
|
||||
fDictionary(adoptDictionary)
|
||||
{
|
||||
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
|
||||
|
@ -1046,7 +1042,7 @@ foundBest:
|
|||
*/
|
||||
static const uint32_t kuint32max = 0xFFFFFFFF;
|
||||
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
|
||||
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
|
||||
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
|
||||
// Korean dictionary only includes Hangul syllables
|
||||
fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
|
||||
fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
|
||||
|
|
|
@ -1241,7 +1241,7 @@ static void U_CALLCONV initLanguageFactories() {
|
|||
|
||||
|
||||
static const LanguageBreakEngine*
|
||||
getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
|
||||
getLanguageBreakEngineFromFactory(UChar32 c)
|
||||
{
|
||||
umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
|
||||
if (gLanguageBreakFactories == NULL) {
|
||||
|
@ -1252,7 +1252,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
|
|||
const LanguageBreakEngine *lbe = NULL;
|
||||
while (--i >= 0) {
|
||||
LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
|
||||
lbe = factory->getEngineFor(c, breakType);
|
||||
lbe = factory->getEngineFor(c);
|
||||
if (lbe != NULL) {
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -885,7 +885,6 @@ s */
|
|||
iterCache[kind] = CacheValue.getInstance(cache);
|
||||
if (result instanceof RuleBasedBreakIterator) {
|
||||
RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
|
||||
rbbi.setBreakType(kind);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
|
@ -161,7 +161,6 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
|
|||
// TODO: Determine valid and actual locale correctly.
|
||||
ULocale uloc = ULocale.forLocale(rb.getLocale());
|
||||
iter.setLocale(uloc, uloc);
|
||||
iter.setBreakType(kind);
|
||||
|
||||
// filtered break
|
||||
if (kind == BreakIterator.KIND_SENTENCE) {
|
||||
|
|
|
@ -80,12 +80,9 @@ class BurmeseBreakEngine extends DictionaryBreakEngine {
|
|||
}
|
||||
|
||||
@Override
|
||||
public boolean handles(int c, int breakType) {
|
||||
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.MYANMAR);
|
||||
}
|
||||
return false;
|
||||
public boolean handles(int c) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.MYANMAR);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -182,14 +182,13 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
|
|||
}
|
||||
|
||||
@Override
|
||||
public boolean handles(int c, int breakType) {
|
||||
return fTypes.get(breakType) && // this type can use us
|
||||
fSet.contains(c); // we recognize the character
|
||||
public boolean handles(int c) {
|
||||
return fSet.contains(c); // we recognize the character
|
||||
}
|
||||
|
||||
@Override
|
||||
public int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
int breakType, DequeI foundBreaks) {
|
||||
DequeI foundBreaks) {
|
||||
int result = 0;
|
||||
|
||||
// Find the span of characters included in the set.
|
||||
|
@ -208,8 +207,6 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
|
|||
rangeStart = start;
|
||||
rangeEnd = current;
|
||||
|
||||
// if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
|
||||
// TODO: Why does icu4c have this?
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
|
||||
text.setIndex(current);
|
||||
|
||||
|
|
|
@ -17,11 +17,9 @@ import java.text.CharacterIterator;
|
|||
interface LanguageBreakEngine {
|
||||
/**
|
||||
* @param c A Unicode codepoint value
|
||||
* @param breakType The kind of break iterator that is wanting to make use
|
||||
* of this engine - character, word, line, sentence
|
||||
* @return true if the engine can handle this character, false otherwise
|
||||
*/
|
||||
boolean handles(int c, int breakType);
|
||||
boolean handles(int c);
|
||||
|
||||
/**
|
||||
* Implements the actual breaking logic. Find any breaks within a run in the supplied text.
|
||||
|
@ -30,13 +28,11 @@ interface LanguageBreakEngine {
|
|||
* @param startPos The index of the beginning of the range
|
||||
* @param endPos The index of the possible end of our range. It is possible,
|
||||
* however, that the range ends earlier
|
||||
* @param breakType The kind of break iterator that is wanting to make use
|
||||
* of this engine - character, word, line, sentence
|
||||
* @param foundBreaks A data structure to receive the break positions.
|
||||
* @return the number of breaks found
|
||||
*/
|
||||
int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
int breakType, DictionaryBreakEngine.DequeI foundBreaks);
|
||||
DictionaryBreakEngine.DequeI foundBreaks);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@ import com.ibm.icu.lang.UProperty;
|
|||
import com.ibm.icu.lang.UScript;
|
||||
|
||||
class LaoBreakEngine extends DictionaryBreakEngine {
|
||||
|
||||
|
||||
// Constants for LaoBreakIterator
|
||||
// How many words in a row are "good enough"?
|
||||
private static final byte LAO_LOOKAHEAD = 3;
|
||||
|
@ -27,13 +27,13 @@ class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
private static final byte LAO_PREFIX_COMBINE_THRESHOLD = 3;
|
||||
// Minimum word size
|
||||
private static final byte LAO_MIN_WORD = 2;
|
||||
|
||||
|
||||
private DictionaryMatcher fDictionary;
|
||||
private static UnicodeSet fLaoWordSet;
|
||||
private static UnicodeSet fEndWordSet;
|
||||
private static UnicodeSet fBeginWordSet;
|
||||
private static UnicodeSet fMarkSet;
|
||||
|
||||
|
||||
static {
|
||||
// Initialize UnicodeSets
|
||||
fLaoWordSet = new UnicodeSet();
|
||||
|
@ -55,14 +55,14 @@ class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
fMarkSet.compact();
|
||||
fEndWordSet.compact();
|
||||
fBeginWordSet.compact();
|
||||
|
||||
|
||||
// Freeze the static UnicodeSet
|
||||
fLaoWordSet.freeze();
|
||||
fMarkSet.freeze();
|
||||
fEndWordSet.freeze();
|
||||
fBeginWordSet.freeze();
|
||||
}
|
||||
|
||||
|
||||
public LaoBreakEngine() throws IOException {
|
||||
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
|
||||
setCharacters(fLaoWordSet);
|
||||
|
@ -70,28 +70,29 @@ class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
fDictionary = DictionaryData.loadDictionaryFor("Laoo");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
// Normally is a singleton, but it's possible to have duplicates
|
||||
// during initialization. All are equivalent.
|
||||
return obj instanceof LaoBreakEngine;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return getClass().hashCode();
|
||||
}
|
||||
|
||||
public boolean handles(int c, int breakType) {
|
||||
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.LAO);
|
||||
}
|
||||
return false;
|
||||
|
||||
@Override
|
||||
public boolean handles(int c) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
return (script == UScript.LAO);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
|
||||
DequeI foundBreaks) {
|
||||
|
||||
|
||||
|
||||
|
||||
if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
|
||||
return 0; // Not enough characters for word
|
||||
}
|
||||
|
@ -162,7 +163,7 @@ class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
// no preceding word, or the non-word shares less than the minimum threshold
|
||||
// of characters with a dictionary word, then scan to resynchronize
|
||||
if (words[wordsFound%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
|
||||
(wordLength == 0 ||
|
||||
(wordLength == 0 ||
|
||||
words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
|
||||
// Look for a plausible word boundary
|
||||
int remaining = rangeEnd - (current + wordLength);
|
||||
|
@ -208,7 +209,7 @@ class LaoBreakEngine extends DictionaryBreakEngine {
|
|||
|
||||
// Look ahead for possible suffixes if a dictionary word does not follow.
|
||||
// We do this in code rather than using a rule so that the heuristic
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
// resynch continues to function. For example, one of the suffix characters
|
||||
// could be a typo in the middle of a word.
|
||||
// NOT CURRENTLY APPLICABLE TO LAO
|
||||
|
||||
|
|
|
@ -275,13 +275,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
|
||||
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
|
||||
|
||||
/**
|
||||
* What kind of break iterator this is.
|
||||
* Defaulting BreakType to word gives reasonable dictionary behavior for
|
||||
* Break Iterators that are built from rules.
|
||||
*/
|
||||
private int fBreakType = KIND_WORD;
|
||||
|
||||
/**
|
||||
* The "default" break engine - just skips over ranges of dictionary words,
|
||||
* producing no breaks. Should only be used if characters need to be handled
|
||||
|
@ -646,21 +639,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
this.first();
|
||||
}
|
||||
|
||||
/**
|
||||
* package private
|
||||
*/
|
||||
void setBreakType(int type) {
|
||||
fBreakType = type;
|
||||
}
|
||||
|
||||
/**
|
||||
* package private
|
||||
*/
|
||||
int getBreakType() {
|
||||
return fBreakType;
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Control debug, trace and dump options.
|
||||
* @internal
|
||||
*/
|
||||
|
@ -673,7 +652,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
// We have a dictionary character.
|
||||
// Does an already instantiated break engine handle it?
|
||||
for (LanguageBreakEngine candidate : fBreakEngines) {
|
||||
if (candidate.handles(c, fBreakType)) {
|
||||
if (candidate.handles(c)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
@ -683,7 +662,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
// Check the global list, another break iterator may have instantiated the
|
||||
// desired engine.
|
||||
for (LanguageBreakEngine candidate : gAllBreakEngines) {
|
||||
if (candidate.handles(c, fBreakType)) {
|
||||
if (candidate.handles(c)) {
|
||||
fBreakEngines.add(candidate);
|
||||
return candidate;
|
||||
}
|
||||
|
@ -713,24 +692,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
eng = new KhmerBreakEngine();
|
||||
break;
|
||||
case UScript.HAN:
|
||||
if (getBreakType() == KIND_WORD) {
|
||||
eng = new CjkBreakEngine(false);
|
||||
}
|
||||
else {
|
||||
gUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = gUnhandledBreakEngine;
|
||||
}
|
||||
break;
|
||||
eng = new CjkBreakEngine(false);
|
||||
break;
|
||||
case UScript.HANGUL:
|
||||
if (getBreakType() == KIND_WORD) {
|
||||
eng = new CjkBreakEngine(true);
|
||||
} else {
|
||||
gUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = gUnhandledBreakEngine;
|
||||
}
|
||||
eng = new CjkBreakEngine(true);
|
||||
break;
|
||||
default:
|
||||
gUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
gUnhandledBreakEngine.handleChar(c);
|
||||
eng = gUnhandledBreakEngine;
|
||||
break;
|
||||
}
|
||||
|
@ -1306,7 +1274,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != null) {
|
||||
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks);
|
||||
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
|
|
|
@ -8,17 +8,14 @@
|
|||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import static com.ibm.icu.impl.CharacterIteration.DONE32;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.concurrent.atomic.AtomicReferenceArray;
|
||||
|
||||
import com.ibm.icu.impl.CharacterIteration;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
|
||||
final class UnhandledBreakEngine implements LanguageBreakEngine {
|
||||
// TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
|
||||
// TODO: Use two UnicodeSets, one with all frozen sets, one with unfrozen.
|
||||
// in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
|
||||
|
||||
// Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
|
||||
|
@ -35,49 +32,42 @@ final class UnhandledBreakEngine implements LanguageBreakEngine {
|
|||
// on which scripts have been previously seen by handleChar(). (This is not a
|
||||
// threading specific issue). Possibly stop on script boundaries?
|
||||
|
||||
final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
|
||||
volatile UnicodeSet fHandled = new UnicodeSet();
|
||||
public UnhandledBreakEngine() {
|
||||
for (int i = 0; i < fHandled.length(); i++) {
|
||||
fHandled.set(i, new UnicodeSet());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean handles(int c, int breakType) {
|
||||
return (breakType >= 0 && breakType < fHandled.length()) &&
|
||||
(fHandled.get(breakType).contains(c));
|
||||
public boolean handles(int c) {
|
||||
return fHandled.contains(c);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
|
||||
if (breakType >= 0 && breakType < fHandled.length()) {
|
||||
UnicodeSet uniset = fHandled.get(breakType);
|
||||
int c = CharacterIteration.current32(text);
|
||||
while (text.getIndex() < endPos && uniset.contains(c)) {
|
||||
CharacterIteration.next32(text);
|
||||
c = CharacterIteration.current32(text);
|
||||
}
|
||||
DictionaryBreakEngine.DequeI foundBreaks) {
|
||||
|
||||
UnicodeSet uniset = fHandled;
|
||||
int c = CharacterIteration.current32(text);
|
||||
while (text.getIndex() < endPos && uniset.contains(c)) {
|
||||
CharacterIteration.next32(text);
|
||||
c = CharacterIteration.current32(text);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update the set of unhandled characters for the specified breakType to include
|
||||
* Update the set of unhandled characters to include
|
||||
* all that have the same script as c.
|
||||
* May be called concurrently with handles() or findBreaks().
|
||||
* Must not be called concurrently with itself.
|
||||
*/
|
||||
public void handleChar(int c, int breakType) {
|
||||
if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
|
||||
UnicodeSet originalSet = fHandled.get(breakType);
|
||||
if (!originalSet.contains(c)) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
UnicodeSet newSet = new UnicodeSet();
|
||||
newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
newSet.addAll(originalSet);
|
||||
fHandled.set(breakType, newSet);
|
||||
}
|
||||
public void handleChar(int c) {
|
||||
UnicodeSet originalSet = fHandled;
|
||||
if (!originalSet.contains(c)) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
UnicodeSet newSet = new UnicodeSet();
|
||||
newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
newSet.addAll(originalSet);
|
||||
fHandled = newSet;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue