ICU-10688 Remove break iterator type logic. It's implicit from the rules.

X-SVN-Rev: 40687
This commit is contained in:
Andy Heninger 2017-12-04 02:14:32 +00:00
parent ca7b62180e
commit 023e8b289f
10 changed files with 63 additions and 120 deletions

View file

@ -29,8 +29,7 @@ U_NAMESPACE_BEGIN
******************************************************************
*/
DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
fTypes = breakTypes;
DictionaryBreakEngine::DictionaryBreakEngine() {
}
DictionaryBreakEngine::~DictionaryBreakEngine() {
@ -45,7 +44,6 @@ int32_t
DictionaryBreakEngine::findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
int32_t breakType,
UVector32 &foundBreaks ) const {
(void)startPos; // TODO: remove this param?
int32_t result = 0;
@ -65,10 +63,8 @@ DictionaryBreakEngine::findBreaks( UText *text,
}
rangeStart = start;
rangeEnd = current;
if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
utext_setNativeIndex(text, current);
}
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
utext_setNativeIndex(text, current);
return result;
}
@ -193,7 +189,7 @@ static const int32_t THAI_MIN_WORD = 2;
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
@ -435,7 +431,7 @@ static const int32_t LAO_MIN_WORD = 2;
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
@ -631,7 +627,7 @@ static const int32_t BURMESE_MIN_WORD = 2;
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
: DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
@ -824,7 +820,7 @@ static const int32_t KHMER_MIN_WORD = 2;
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
: DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
: DictionaryBreakEngine(),
fDictionary(adoptDictionary)
{
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
@ -1046,7 +1042,7 @@ foundBest:
*/
static const uint32_t kuint32max = 0xFFFFFFFF;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
// Korean dictionary only includes Hangul syllables
fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);

View file

@ -1241,7 +1241,7 @@ static void U_CALLCONV initLanguageFactories() {
static const LanguageBreakEngine*
getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
getLanguageBreakEngineFromFactory(UChar32 c)
{
umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
if (gLanguageBreakFactories == NULL) {
@ -1252,7 +1252,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
const LanguageBreakEngine *lbe = NULL;
while (--i >= 0) {
LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
lbe = factory->getEngineFor(c, breakType);
lbe = factory->getEngineFor(c);
if (lbe != NULL) {
break;
}

View file

@ -885,7 +885,6 @@ s */
iterCache[kind] = CacheValue.getInstance(cache);
if (result instanceof RuleBasedBreakIterator) {
RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
rbbi.setBreakType(kind);
}
return result;

View file

@ -161,7 +161,6 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
// TODO: Determine valid and actual locale correctly.
ULocale uloc = ULocale.forLocale(rb.getLocale());
iter.setLocale(uloc, uloc);
iter.setBreakType(kind);
// filtered break
if (kind == BreakIterator.KIND_SENTENCE) {

View file

@ -80,12 +80,9 @@ class BurmeseBreakEngine extends DictionaryBreakEngine {
}
@Override
public boolean handles(int c, int breakType) {
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.MYANMAR);
}
return false;
public boolean handles(int c) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.MYANMAR);
}
@Override

View file

@ -182,14 +182,13 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
}
@Override
public boolean handles(int c, int breakType) {
return fTypes.get(breakType) && // this type can use us
fSet.contains(c); // we recognize the character
public boolean handles(int c) {
return fSet.contains(c); // we recognize the character
}
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
int breakType, DequeI foundBreaks) {
DequeI foundBreaks) {
int result = 0;
// Find the span of characters included in the set.
@ -208,8 +207,6 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
rangeStart = start;
rangeEnd = current;
// if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
// TODO: Why does icu4c have this?
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
text.setIndex(current);

View file

@ -17,11 +17,9 @@ import java.text.CharacterIterator;
interface LanguageBreakEngine {
/**
* @param c A Unicode codepoint value
* @param breakType The kind of break iterator that is wanting to make use
* of this engine - character, word, line, sentence
* @return true if the engine can handle this character, false otherwise
*/
boolean handles(int c, int breakType);
boolean handles(int c);
/**
* Implements the actual breaking logic. Find any breaks within a run in the supplied text.
@ -30,13 +28,11 @@ interface LanguageBreakEngine {
* @param startPos The index of the beginning of the range
* @param endPos The index of the possible end of our range. It is possible,
* however, that the range ends earlier
* @param breakType The kind of break iterator that is wanting to make use
* of this engine - character, word, line, sentence
* @param foundBreaks A data structure to receive the break positions.
* @return the number of breaks found
*/
int findBreaks(CharacterIterator text, int startPos, int endPos,
int breakType, DictionaryBreakEngine.DequeI foundBreaks);
DictionaryBreakEngine.DequeI foundBreaks);
}

View file

@ -16,7 +16,7 @@ import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
class LaoBreakEngine extends DictionaryBreakEngine {
// Constants for LaoBreakIterator
// How many words in a row are "good enough"?
private static final byte LAO_LOOKAHEAD = 3;
@ -27,13 +27,13 @@ class LaoBreakEngine extends DictionaryBreakEngine {
private static final byte LAO_PREFIX_COMBINE_THRESHOLD = 3;
// Minimum word size
private static final byte LAO_MIN_WORD = 2;
private DictionaryMatcher fDictionary;
private static UnicodeSet fLaoWordSet;
private static UnicodeSet fEndWordSet;
private static UnicodeSet fBeginWordSet;
private static UnicodeSet fMarkSet;
static {
// Initialize UnicodeSets
fLaoWordSet = new UnicodeSet();
@ -55,14 +55,14 @@ class LaoBreakEngine extends DictionaryBreakEngine {
fMarkSet.compact();
fEndWordSet.compact();
fBeginWordSet.compact();
// Freeze the static UnicodeSet
fLaoWordSet.freeze();
fMarkSet.freeze();
fEndWordSet.freeze();
fBeginWordSet.freeze();
}
public LaoBreakEngine() throws IOException {
super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
setCharacters(fLaoWordSet);
@ -70,28 +70,29 @@ class LaoBreakEngine extends DictionaryBreakEngine {
fDictionary = DictionaryData.loadDictionaryFor("Laoo");
}
@Override
public boolean equals(Object obj) {
// Normally is a singleton, but it's possible to have duplicates
// during initialization. All are equivalent.
return obj instanceof LaoBreakEngine;
}
@Override
public int hashCode() {
return getClass().hashCode();
}
public boolean handles(int c, int breakType) {
if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.LAO);
}
return false;
@Override
public boolean handles(int c) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
return (script == UScript.LAO);
}
@Override
public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
DequeI foundBreaks) {
if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
return 0; // Not enough characters for word
}
@ -162,7 +163,7 @@ class LaoBreakEngine extends DictionaryBreakEngine {
// no preceding word, or the non-word shares less than the minimum threshold
// of characters with a dictionary word, then scan to resynchronize
if (words[wordsFound%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
(wordLength == 0 ||
(wordLength == 0 ||
words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
// Look for a plausible word boundary
int remaining = rangeEnd - (current + wordLength);
@ -208,7 +209,7 @@ class LaoBreakEngine extends DictionaryBreakEngine {
// Look ahead for possible suffixes if a dictionary word does not follow.
// We do this in code rather than using a rule so that the heuristic
// resynch continues to function. For example, one of the suffix characters
// resynch continues to function. For example, one of the suffix characters
// could be a typo in the middle of a word.
// NOT CURRENTLY APPLICABLE TO LAO

View file

@ -275,13 +275,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
&& ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;
/**
* What kind of break iterator this is.
* Defaulting BreakType to word gives reasonable dictionary behavior for
* Break Iterators that are built from rules.
*/
private int fBreakType = KIND_WORD;
/**
* The "default" break engine - just skips over ranges of dictionary words,
* producing no breaks. Should only be used if characters need to be handled
@ -646,21 +639,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
this.first();
}
/**
* package private
*/
void setBreakType(int type) {
fBreakType = type;
}
/**
* package private
*/
int getBreakType() {
return fBreakType;
}
/**
/**
* Control debug, trace and dump options.
* @internal
*/
@ -673,7 +652,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// We have a dictionary character.
// Does an already instantiated break engine handle it?
for (LanguageBreakEngine candidate : fBreakEngines) {
if (candidate.handles(c, fBreakType)) {
if (candidate.handles(c)) {
return candidate;
}
}
@ -683,7 +662,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Check the global list, another break iterator may have instantiated the
// desired engine.
for (LanguageBreakEngine candidate : gAllBreakEngines) {
if (candidate.handles(c, fBreakType)) {
if (candidate.handles(c)) {
fBreakEngines.add(candidate);
return candidate;
}
@ -713,24 +692,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
eng = new KhmerBreakEngine();
break;
case UScript.HAN:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(false);
}
else {
gUnhandledBreakEngine.handleChar(c, getBreakType());
eng = gUnhandledBreakEngine;
}
break;
eng = new CjkBreakEngine(false);
break;
case UScript.HANGUL:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(true);
} else {
gUnhandledBreakEngine.handleChar(c, getBreakType());
eng = gUnhandledBreakEngine;
}
eng = new CjkBreakEngine(true);
break;
default:
gUnhandledBreakEngine.handleChar(c, getBreakType());
gUnhandledBreakEngine.handleChar(c);
eng = gUnhandledBreakEngine;
break;
}
@ -1306,7 +1274,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != null) {
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks);
foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
}
// Reload the loop variables for the next go-round

View file

@ -8,17 +8,14 @@
*/
package com.ibm.icu.text;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import java.text.CharacterIterator;
import java.util.concurrent.atomic.AtomicReferenceArray;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
final class UnhandledBreakEngine implements LanguageBreakEngine {
// TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
// TODO: Use two UnicodeSets, one with all frozen sets, one with unfrozen.
// in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
// Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
@ -35,49 +32,42 @@ final class UnhandledBreakEngine implements LanguageBreakEngine {
// on which scripts have been previously seen by handleChar(). (This is not a
// threading specific issue). Possibly stop on script boundaries?
final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
volatile UnicodeSet fHandled = new UnicodeSet();
public UnhandledBreakEngine() {
for (int i = 0; i < fHandled.length(); i++) {
fHandled.set(i, new UnicodeSet());
}
}
@Override
public boolean handles(int c, int breakType) {
return (breakType >= 0 && breakType < fHandled.length()) &&
(fHandled.get(breakType).contains(c));
public boolean handles(int c) {
return fHandled.contains(c);
}
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
if (breakType >= 0 && breakType < fHandled.length()) {
UnicodeSet uniset = fHandled.get(breakType);
int c = CharacterIteration.current32(text);
while (text.getIndex() < endPos && uniset.contains(c)) {
CharacterIteration.next32(text);
c = CharacterIteration.current32(text);
}
DictionaryBreakEngine.DequeI foundBreaks) {
UnicodeSet uniset = fHandled;
int c = CharacterIteration.current32(text);
while (text.getIndex() < endPos && uniset.contains(c)) {
CharacterIteration.next32(text);
c = CharacterIteration.current32(text);
}
return 0;
}
/**
* Update the set of unhandled characters for the specified breakType to include
* Update the set of unhandled characters to include
* all that have the same script as c.
* May be called concurrently with handles() or findBreaks().
* Must not be called concurrently with itself.
*/
public void handleChar(int c, int breakType) {
if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
UnicodeSet originalSet = fHandled.get(breakType);
if (!originalSet.contains(c)) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
UnicodeSet newSet = new UnicodeSet();
newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
newSet.addAll(originalSet);
fHandled.set(breakType, newSet);
}
public void handleChar(int c) {
UnicodeSet originalSet = fHandled;
if (!originalSet.contains(c)) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
UnicodeSet newSet = new UnicodeSet();
newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
newSet.addAll(originalSet);
fHandled = newSet;
}
}
}