ICU-10688 Remove break iterator type logic. It's implicit from the rules.

X-SVN-Rev: 40687
2025-04-07 22:44:49 +00:00 · 2017-12-04 02:14:32 +00:00 · 2017-12-04 02:14:32 +00:00 · 023e8b289f
commit 023e8b289f
parent ca7b62180e
10 changed files with 63 additions and 120 deletions
--- a/icu4c/source/common/dictbe.cpp
+++ b/icu4c/source/common/dictbe.cpp
@ -29,8 +29,7 @@ U_NAMESPACE_BEGIN
 ******************************************************************
 */

-DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
-    fTypes = breakTypes;
+DictionaryBreakEngine::DictionaryBreakEngine() {
 }

 DictionaryBreakEngine::~DictionaryBreakEngine() {
@ -45,7 +44,6 @@ int32_t
 DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
-                                 int32_t breakType,
                                 UVector32 &foundBreaks ) const {
    (void)startPos;            // TODO: remove this param?
    int32_t result = 0;
@ -65,10 +63,8 @@ DictionaryBreakEngine::findBreaks( UText *text,
    }
    rangeStart = start;
    rangeEnd = current;
-    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
-        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
-        utext_setNativeIndex(text, current);
-    }
+    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
+    utext_setNativeIndex(text, current);
    
    return result;
 }
@ -193,7 +189,7 @@ static const int32_t THAI_MIN_WORD = 2;
 static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;

 ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
-    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
+    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
 {
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
@ -435,7 +431,7 @@ static const int32_t LAO_MIN_WORD = 2;
 static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;

 LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
-    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
+    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
 {
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
@ -631,7 +627,7 @@ static const int32_t BURMESE_MIN_WORD = 2;
 static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;

 BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
-    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
+    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
 {
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
@ -824,7 +820,7 @@ static const int32_t KHMER_MIN_WORD = 2;
 static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;

 KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
-    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
+    : DictionaryBreakEngine(),
      fDictionary(adoptDictionary)
 {
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
@ -1046,7 +1042,7 @@ foundBest:
 */
 static const uint32_t kuint32max = 0xFFFFFFFF;
 CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
-: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
+: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
    // Korean dictionary only includes Hangul syllables
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -1241,7 +1241,7 @@ static void U_CALLCONV initLanguageFactories() {


 static const LanguageBreakEngine*
-getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
+getLanguageBreakEngineFromFactory(UChar32 c)
 {
    umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories);
    if (gLanguageBreakFactories == NULL) {
@ -1252,7 +1252,7 @@ getLanguageBreakEngineFromFactory(UChar32 c, int32_t breakType)
    const LanguageBreakEngine *lbe = NULL;
    while (--i >= 0) {
        LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i));
-        lbe = factory->getEngineFor(c, breakType);
+        lbe = factory->getEngineFor(c);
        if (lbe != NULL) {
            break;
        }
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIterator.java
@ -885,7 +885,6 @@ s     */
        iterCache[kind] = CacheValue.getInstance(cache);
        if (result instanceof RuleBasedBreakIterator) {
            RuleBasedBreakIterator rbbi = (RuleBasedBreakIterator)result;
-            rbbi.setBreakType(kind);
        }

        return result;
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BreakIteratorFactory.java
@ -161,7 +161,6 @@ final class BreakIteratorFactory extends BreakIterator.BreakIteratorServiceShim
        // TODO: Determine valid and actual locale correctly.
        ULocale uloc = ULocale.forLocale(rb.getLocale());
        iter.setLocale(uloc, uloc);
-        iter.setBreakType(kind);

        // filtered break
        if (kind == BreakIterator.KIND_SENTENCE) {
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/BurmeseBreakEngine.java
@ -80,12 +80,9 @@ class BurmeseBreakEngine extends DictionaryBreakEngine {
    }

    @Override
-    public boolean handles(int c, int breakType) {
-        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
-            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
-            return (script == UScript.MYANMAR);
-        }
-        return false;
+    public boolean handles(int c) {
+        int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+        return (script == UScript.MYANMAR);
    }

    @Override
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/DictionaryBreakEngine.java
@ -182,14 +182,13 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
    }

    @Override
-    public boolean handles(int c, int breakType) {
-        return fTypes.get(breakType) &&  // this type can use us
-                fSet.contains(c);        // we recognize the character
+    public boolean handles(int c) {
+        return fSet.contains(c);        // we recognize the character
    }

    @Override
    public int findBreaks(CharacterIterator text, int startPos, int endPos,
-            int breakType, DequeI foundBreaks) {
+            DequeI foundBreaks) {
        int result = 0;

         // Find the span of characters included in the set.
@ -208,8 +207,6 @@ abstract class DictionaryBreakEngine implements LanguageBreakEngine {
        rangeStart = start;
        rangeEnd = current;

-        // if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
-        // TODO: Why does icu4c have this?
        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
        text.setIndex(current);

--- a/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/LanguageBreakEngine.java
@ -17,11 +17,9 @@ import java.text.CharacterIterator;
 interface LanguageBreakEngine {
    /**
     * @param c A Unicode codepoint value
-     * @param breakType The kind of break iterator that is wanting to make use
-     *  of this engine - character, word, line, sentence
     * @return true if the engine can handle this character, false otherwise
     */
-    boolean handles(int c, int breakType);
+    boolean handles(int c);

    /**
     * Implements the actual breaking logic. Find any breaks within a run in the supplied text.
@ -30,13 +28,11 @@ interface LanguageBreakEngine {
     * @param startPos The index of the beginning of the range
     * @param endPos The index of the possible end of our range. It is possible,
     *  however, that the range ends earlier
-     * @param breakType The kind of break iterator that is wanting to make use
-     *  of this engine - character, word, line, sentence
     * @param foundBreaks A data structure to receive the break positions.
     * @return the number of breaks found
     */
    int findBreaks(CharacterIterator text, int startPos, int endPos,
-            int breakType, DictionaryBreakEngine.DequeI foundBreaks);
+            DictionaryBreakEngine.DequeI foundBreaks);
 }


--- a/icu4j/main/classes/core/src/com/ibm/icu/text/LaoBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/LaoBreakEngine.java
@ -16,7 +16,7 @@ import com.ibm.icu.lang.UProperty;
 import com.ibm.icu.lang.UScript;

 class LaoBreakEngine extends DictionaryBreakEngine {
-    
+
    // Constants for LaoBreakIterator
    // How many words in a row are "good enough"?
    private static final byte LAO_LOOKAHEAD = 3;
@ -27,13 +27,13 @@ class LaoBreakEngine extends DictionaryBreakEngine {
    private static final byte LAO_PREFIX_COMBINE_THRESHOLD = 3;
    // Minimum word size
    private static final byte LAO_MIN_WORD = 2;
-    
+
    private DictionaryMatcher fDictionary;
    private static UnicodeSet fLaoWordSet;
    private static UnicodeSet fEndWordSet;
    private static UnicodeSet fBeginWordSet;
    private static UnicodeSet fMarkSet;
-    
+
    static {
        // Initialize UnicodeSets
        fLaoWordSet = new UnicodeSet();
@ -55,14 +55,14 @@ class LaoBreakEngine extends DictionaryBreakEngine {
        fMarkSet.compact();
        fEndWordSet.compact();
        fBeginWordSet.compact();
-        
+
        // Freeze the static UnicodeSet
        fLaoWordSet.freeze();
        fMarkSet.freeze();
        fEndWordSet.freeze();
        fBeginWordSet.freeze();
    }
-    
+
    public LaoBreakEngine() throws IOException {
        super(BreakIterator.KIND_WORD, BreakIterator.KIND_LINE);
        setCharacters(fLaoWordSet);
@ -70,28 +70,29 @@ class LaoBreakEngine extends DictionaryBreakEngine {
        fDictionary = DictionaryData.loadDictionaryFor("Laoo");
    }

+    @Override
    public boolean equals(Object obj) {
        // Normally is a singleton, but it's possible to have duplicates
        //   during initialization. All are equivalent.
        return obj instanceof LaoBreakEngine;
    }

+    @Override
    public int hashCode() {
        return getClass().hashCode();
    }
-    
-    public boolean handles(int c, int breakType) {
-        if (breakType == BreakIterator.KIND_WORD || breakType == BreakIterator.KIND_LINE) {
-            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
-            return (script == UScript.LAO);
-        }
-        return false;
+
+    @Override
+    public boolean handles(int c) {
+        int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+        return (script == UScript.LAO);
    }

+    @Override
    public int divideUpDictionaryRange(CharacterIterator fIter, int rangeStart, int rangeEnd,
            DequeI foundBreaks) {
-        
-        
+
+
        if ((rangeEnd - rangeStart) < LAO_MIN_WORD) {
            return 0;  // Not enough characters for word
        }
@ -162,7 +163,7 @@ class LaoBreakEngine extends DictionaryBreakEngine {
                // no preceding word, or the non-word shares less than the minimum threshold
                // of characters with a dictionary word, then scan to resynchronize
                if (words[wordsFound%LAO_LOOKAHEAD].candidates(fIter, fDictionary, rangeEnd) <= 0 &&
-                        (wordLength == 0 || 
+                        (wordLength == 0 ||
                                words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
                    // Look for a plausible word boundary
                    int remaining = rangeEnd - (current + wordLength);
@ -208,7 +209,7 @@ class LaoBreakEngine extends DictionaryBreakEngine {

            // Look ahead for possible suffixes if a dictionary word does not follow.
            // We do this in code rather than using a rule so that the heuristic
-            // resynch continues to function. For example, one of the suffix characters 
+            // resynch continues to function. For example, one of the suffix characters
            // could be a typo in the middle of a word.
            // NOT CURRENTLY APPLICABLE TO LAO

--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@ -275,13 +275,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
    private static final boolean TRACE = ICUDebug.enabled(RBBI_DEBUG_ARG)
            && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

-    /**
-     * What kind of break iterator this is.
-     * Defaulting BreakType to word gives reasonable dictionary behavior for
-     * Break Iterators that are built from rules.
-     */
-    private int fBreakType = KIND_WORD;
-
    /**
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
@ -646,21 +639,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
        this.first();
    }

-    /**
-     * package private
-     */
-    void setBreakType(int type) {
-        fBreakType = type;
-    }
-
-    /**
-     * package private
-     */
-    int getBreakType() {
-        return fBreakType;
-    }
-
-    /**
+    /**
     * Control debug, trace and dump options.
     * @internal
     */
@ -673,7 +652,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
        // We have a dictionary character.
        // Does an already instantiated break engine handle it?
        for (LanguageBreakEngine candidate : fBreakEngines) {
-            if (candidate.handles(c, fBreakType)) {
+            if (candidate.handles(c)) {
                return candidate;
            }
        }
@ -683,7 +662,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
            // Check the global list, another break iterator may have instantiated the
            // desired engine.
            for (LanguageBreakEngine candidate : gAllBreakEngines) {
-                if (candidate.handles(c, fBreakType)) {
+                if (candidate.handles(c)) {
                    fBreakEngines.add(candidate);
                    return candidate;
                }
@ -713,24 +692,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
                    eng = new KhmerBreakEngine();
                    break;
                case UScript.HAN:
-                    if (getBreakType() == KIND_WORD) {
-                        eng = new CjkBreakEngine(false);
-                    }
-                    else {
-                        gUnhandledBreakEngine.handleChar(c, getBreakType());
-                        eng = gUnhandledBreakEngine;
-                    }
-                    break;
+                    eng = new CjkBreakEngine(false);
+                     break;
                case UScript.HANGUL:
-                    if (getBreakType() == KIND_WORD) {
-                        eng = new CjkBreakEngine(true);
-                    } else {
-                        gUnhandledBreakEngine.handleChar(c, getBreakType());
-                        eng = gUnhandledBreakEngine;
-                    }
+                    eng = new CjkBreakEngine(true);
                    break;
                default:
-                    gUnhandledBreakEngine.handleChar(c, getBreakType());
+                    gUnhandledBreakEngine.handleChar(c);
                    eng = gUnhandledBreakEngine;
                    break;
                }
@ -1306,7 +1274,7 @@ public class RuleBasedBreakIterator extends BreakIterator {
                // Ask the language object if there are any breaks. It will add them to the cache and
                // leave the text pointer on the other side of its range, ready to search for the next one.
                if (lbe != null) {
-                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreakType, fBreaks);
+                    foundBreakCount += lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks);
                }

                // Reload the loop variables for the next go-round
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
@ -8,17 +8,14 @@
 */
 package com.ibm.icu.text;

-import static com.ibm.icu.impl.CharacterIteration.DONE32;
-
 import java.text.CharacterIterator;
-import java.util.concurrent.atomic.AtomicReferenceArray;

 import com.ibm.icu.impl.CharacterIteration;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.lang.UProperty;

 final class UnhandledBreakEngine implements LanguageBreakEngine {
-    // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
+    // TODO: Use two UnicodeSets, one frozen and one unfrozen.
    // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.

    // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
@ -35,49 +32,42 @@ final class UnhandledBreakEngine implements LanguageBreakEngine {
    // on which scripts have been previously seen by handleChar(). (This is not a
    // threading specific issue). Possibly stop on script boundaries?

-    final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
+    volatile UnicodeSet fHandled = new UnicodeSet();
    public UnhandledBreakEngine() {
-        for (int i = 0; i < fHandled.length(); i++) {
-            fHandled.set(i, new UnicodeSet());
-        }
    }

    @Override
-    public boolean handles(int c, int breakType) {
-        return (breakType >= 0 && breakType < fHandled.length()) &&
-                (fHandled.get(breakType).contains(c));
+    public boolean handles(int c) {
+        return fHandled.contains(c);
    }

    @Override
    public int findBreaks(CharacterIterator text, int startPos, int endPos,
-            int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
-        if (breakType >= 0 && breakType < fHandled.length()) {
-            UnicodeSet uniset = fHandled.get(breakType);
-            int c = CharacterIteration.current32(text);
-            while (text.getIndex() < endPos && uniset.contains(c)) {
-                CharacterIteration.next32(text);
-                c = CharacterIteration.current32(text);
-            }
+            DictionaryBreakEngine.DequeI foundBreaks) {
+
+        UnicodeSet uniset = fHandled;
+        int c = CharacterIteration.current32(text);
+        while (text.getIndex() < endPos && uniset.contains(c)) {
+            CharacterIteration.next32(text);
+            c = CharacterIteration.current32(text);
        }
        return 0;
    }

    /**
-     * Update the set of unhandled characters for the specified breakType to include
+     * Update the set of unhandled characters to include
     * all that have the same script as c.
     * May be called concurrently with handles() or findBreaks().
     * Must not be called concurrently with itself.
     */
-    public void handleChar(int c, int breakType) {
-        if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
-            UnicodeSet originalSet = fHandled.get(breakType);
-            if (!originalSet.contains(c)) {
-                int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
-                UnicodeSet newSet = new UnicodeSet();
-                newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
-                newSet.addAll(originalSet);
-                fHandled.set(breakType, newSet);
-            }
+    public void handleChar(int c) {
+        UnicodeSet originalSet = fHandled;
+        if (!originalSet.contains(c)) {
+            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+            UnicodeSet newSet = new UnicodeSet();
+            newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
+            newSet.addAll(originalSet);
+            fHandled = newSet;
        }
    }
 }