ICU-12873 Race Conditions in RuleBasedBreakIterator.

X-SVN-Rev: 39572
2025-04-07 22:44:49 +00:00 · 2017-01-17 23:13:25 +00:00 · 2017-01-17 23:13:25 +00:00 · 82dfd8b26e
commit 82dfd8b26e
parent 32ca386c27
3 changed files with 205 additions and 61 deletions
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@ -20,7 +20,8 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.nio.ByteBuffer;
 import java.text.CharacterIterator;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.ArrayList;
+import java.util.List;

 import com.ibm.icu.impl.Assert;
 import com.ibm.icu.impl.CharTrie;
@ -48,7 +49,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
    private RuleBasedBreakIterator() {
        fLastStatusIndexValid = true;
        fDictionaryCharCount  = 0;
-        fBreakEngines.put(-1, fUnhandledBreakEngine);
+        synchronized(gAllBreakEngines) {
+            fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
+        }
    }

    /**
@ -134,6 +137,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
        if (fText != null) {
            result.fText = (CharacterIterator)(fText.clone());
        }
+        synchronized (gAllBreakEngines)  {
+            result.fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
+        }
+        result.fLookAheadMatches = new LookAheadResults();
+        if (fCachedBreakPositions != null) {
+            result.fCachedBreakPositions = fCachedBreakPositions.clone();
+        }
        return result;
    }

@ -260,10 +270,34 @@ public class RuleBasedBreakIterator extends BreakIterator {
     * The "default" break engine - just skips over ranges of dictionary words,
     * producing no breaks. Should only be used if characters need to be handled
     * by a dictionary but we have no dictionary implementation for them.
+     *
+     * Only one instance; shared by all break iterators.
     */
-    private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
+    private static final UnhandledBreakEngine gUnhandledBreakEngine;

    /**
+     * List of all known break engines, common for all break iterators.
+     * Lazily updated as break engines are needed, because instantiation of
+     * break engines is expensive.
+     *
+     * Because gAllBreakEngines can be referenced concurrently from different
+     * BreakIterator instances, all access is synchronized.
+     */
+    private static final List<LanguageBreakEngine> gAllBreakEngines;
+
+    static {
+        gUnhandledBreakEngine = new UnhandledBreakEngine();
+        gAllBreakEngines = new ArrayList<LanguageBreakEngine>();
+        gAllBreakEngines.add(gUnhandledBreakEngine);
+    }
+
+    /**
+     * List of all known break engines. Similar to gAllBreakEngines, but local to a
+     * break iterator, allowing it to be used without synchronization.
+     */
+    private List<LanguageBreakEngine> fBreakEngines;
+
+  /**
     * when a range of characters is divided up using the dictionary, the break
     * positions that are discovered are stored here, preventing us from having
     * to use either the dictionary or the state table again until the iterator
@ -277,9 +311,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
     */
    private int fPositionInCache;

-
-    private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
-            new ConcurrentHashMap<Integer, LanguageBreakEngine>();
    /**
     * Dumps caches and performs other actions associated with a complete change
     * in text or iteration position.
@ -1107,26 +1138,32 @@ public class RuleBasedBreakIterator extends BreakIterator {

        // We have a dictionary character.
        // Does an already instantiated break engine handle it?
-        for (LanguageBreakEngine candidate : fBreakEngines.values()) {
+        for (LanguageBreakEngine candidate : fBreakEngines) {
            if (candidate.handles(c, fBreakType)) {
                return candidate;
            }
        }

-        // if we don't have an existing engine, build one.
-        int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
-        if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
-            // Katakana, Hiragana and Han are handled by the same dictionary engine.
-            // Fold them together for mapping from script -> engine.
-            script = UScript.HAN;
-        }
+        synchronized (gAllBreakEngines) {
+            // This break iterator's list of break engines didn't handle the character.
+            // Check the global list, another break iterator may have instantiated the
+            // desired engine.
+            for (LanguageBreakEngine candidate : gAllBreakEngines) {
+                if (candidate.handles(c, fBreakType)) {
+                    fBreakEngines.add(candidate);
+                    return candidate;
+                }
+            }

-        LanguageBreakEngine eng = fBreakEngines.get(script);
-        /*
-        if (eng != null && !eng.handles(c, fBreakType)) {
-            fUnhandledBreakEngine.handleChar(c, getBreakType());
-            eng = fUnhandledBreakEngine;
-        } else  */  {
+            // The global list doesn't have an existing engine, build one.
+            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
+            if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
+                // Katakana, Hiragana and Han are handled by the same dictionary engine.
+                // Fold them together for mapping from script -> engine.
+                script = UScript.HAN;
+            }
+
+            LanguageBreakEngine eng;
            try {
                switch (script) {
                case UScript.THAI:
@ -1146,38 +1183,33 @@ public class RuleBasedBreakIterator extends BreakIterator {
                        eng = new CjkBreakEngine(false);
                    }
                    else {
-                        fUnhandledBreakEngine.handleChar(c, getBreakType());
-                        eng = fUnhandledBreakEngine;
+                        gUnhandledBreakEngine.handleChar(c, getBreakType());
+                        eng = gUnhandledBreakEngine;
                    }
                    break;
                case UScript.HANGUL:
                    if (getBreakType() == KIND_WORD) {
                        eng = new CjkBreakEngine(true);
                    } else {
-                        fUnhandledBreakEngine.handleChar(c, getBreakType());
-                        eng = fUnhandledBreakEngine;
+                        gUnhandledBreakEngine.handleChar(c, getBreakType());
+                        eng = gUnhandledBreakEngine;
                    }
                    break;
                default:
-                    fUnhandledBreakEngine.handleChar(c, getBreakType());
-                    eng = fUnhandledBreakEngine;
+                    gUnhandledBreakEngine.handleChar(c, getBreakType());
+                    eng = gUnhandledBreakEngine;
                    break;
                }
            } catch (IOException e) {
                eng = null;
            }
-        }

-        if (eng != null && eng != fUnhandledBreakEngine) {
-            LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
-            if (existingEngine != null) {
-                // There was a race & another thread was first to register an engine for this script.
-                // Use theirs and discard the one we just created.
-                eng = existingEngine;
+            if (eng != null && eng != gUnhandledBreakEngine) {
+                gAllBreakEngines.add(eng);
+                fBreakEngines.add(eng);
            }
-            // assert eng.handles(c, fBreakType);
-        }
-        return eng;
+            return eng;
+        }   // end synchronized(gAllBreakEngines)
    }

    private static final int kMaxLookaheads = 8;
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/UnhandledBreakEngine.java
@ -11,6 +11,7 @@ package com.ibm.icu.text;
 import static com.ibm.icu.impl.CharacterIteration.DONE32;

 import java.text.CharacterIterator;
+import java.util.concurrent.atomic.AtomicReferenceArray;

 import com.ibm.icu.impl.CharacterIteration;
 import com.ibm.icu.lang.UCharacter;
@ -19,42 +20,70 @@ import com.ibm.icu.lang.UProperty;
 final class UnhandledBreakEngine implements LanguageBreakEngine {
    // TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
    // in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
-    private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
+
+    // Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
+    // RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls.
+    // If handleChar() is updating the set of unhandled characters at the same time
+    // findBreaks() or handles() is referencing it, the referencing functions must see
+    // a consistent set. It doesn't matter whether they see it before or after the update,
+    // but they should not see an inconsistent, changing set.
+    //
+    // To do this, an update is made by cloning the old set, updating the clone, then
+    // replacing the old with the new. Once made visible, each set remains constant.
+
+    // TODO: it's odd that findBreaks() can produce different results, depending
+    // on which scripts have been previously seen by handleChar(). (This is not a
+    // threading specific issue). Possibly stop on script boundaries?
+
+    final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
    public UnhandledBreakEngine() {
-        for (int i = 0; i < fHandled.length; i++) {
-            fHandled[i] = new UnicodeSet();
+        for (int i = 0; i < fHandled.length(); i++) {
+            fHandled.set(i, new UnicodeSet());
        }
    }
-    
+
+    @Override
    public boolean handles(int c, int breakType) {
-        return (breakType >= 0 && breakType < fHandled.length) && 
-                (fHandled[breakType].contains(c));
+        return (breakType >= 0 && breakType < fHandled.length()) &&
+                (fHandled.get(breakType).contains(c));
    }

+    @Override
    public int findBreaks(CharacterIterator text, int startPos, int endPos,
            boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
-        if (breakType >= 0 && breakType < fHandled.length) { 
-            int c = CharacterIteration.current32(text); 
-            if (reverse) { 
-                while (text.getIndex() > startPos && fHandled[breakType].contains(c)) { 
-                    CharacterIteration.previous32(text); 
-                    c = CharacterIteration.current32(text); 
-                } 
-            } else { 
-                while (text.getIndex() < endPos && fHandled[breakType].contains(c)) { 
-                    CharacterIteration.next32(text); 
-                    c = CharacterIteration.current32(text); 
-                } 
-            } 
-        } 
+        if (breakType >= 0 && breakType < fHandled.length()) {
+            UnicodeSet uniset = fHandled.get(breakType);
+            int c = CharacterIteration.current32(text);
+            if (reverse) {
+                while (text.getIndex() > startPos && uniset.contains(c)) {
+                    CharacterIteration.previous32(text);
+                    c = CharacterIteration.current32(text);
+                }
+            } else {
+                while (text.getIndex() < endPos && uniset.contains(c)) {
+                    CharacterIteration.next32(text);
+                    c = CharacterIteration.current32(text);
+                }
+            }
+        }
        return 0;
    }

-    public synchronized void handleChar(int c, int breakType) {
-        if (breakType >= 0 && breakType < fHandled.length && c != DONE32) {
-            if (!fHandled[breakType].contains(c)) {
+    /**
+     * Update the set of unhandled characters for the specified breakType to include
+     * all that have the same script as c.
+     * May be called concurrently with handles() or findBreaks().
+     * Must not be called concurrently with itself.
+     */
+    public void handleChar(int c, int breakType) {
+        if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
+            UnicodeSet originalSet = fHandled.get(breakType);
+            if (!originalSet.contains(c)) {
                int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
-                fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script);
+                UnicodeSet newSet = new UnicodeSet();
+                newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
+                newSet.addAll(originalSet);
+                fHandled.set(breakType, newSet);
            }
        }
    }
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/RBBITest.java
@ -851,6 +851,89 @@ public class RBBITest extends TestFmwk {
        bi.setText("abc");
        bi.first();
        assertEquals("Rule chaining test", 3,  bi.next());
-         }
    }

+
+    @Test
+    public void TestBug12873() {
+        // Bug with RuleBasedBreakIterator's internal structure for recording potential look-ahead
+        // matches not being cloned when a break iterator is cloned. This resulted in usage
+        // collisions if the original break iterator and its clone were used concurrently.
+
+        // The Line Break rules for Regional Indicators make use of look-ahead rules, and
+        // show the bug. 1F1E6 = \uD83C\uDDE6 = REGIONAL INDICATOR SYMBOL LETTER A
+        // Regional indicators group into pairs, expect breaks after two code points, which
+        // is after four 16 bit code units.
+
+        final String dataToBreak = "\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6";
+        final RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance();
+        final AssertionError[] assertErr = new AssertionError[1];  // saves an error found from within a thread
+
+        class WorkerThread implements Runnable {
+            @Override
+            public void run() {
+                try {
+                    RuleBasedBreakIterator localBI = (RuleBasedBreakIterator)bi.clone();
+                    localBI.setText(dataToBreak);
+                    for (int loop=0; loop<100; loop++) {
+                        int nextExpectedBreak = 0;
+                        for (int actualBreak = localBI.first(); actualBreak != BreakIterator.DONE;
+                                actualBreak = localBI.next(), nextExpectedBreak+= 4) {
+                            assertEquals("", nextExpectedBreak, actualBreak);
+                        }
+                        assertEquals("", dataToBreak.length()+4, nextExpectedBreak);
+                    }
+                } catch (AssertionError e) {
+                    assertErr[0] = e;
+                }
+            }
+        }
+
+        List<Thread> threads = new ArrayList<Thread>();
+        for (int n = 0; n<4; ++n) {
+            threads.add(new Thread(new WorkerThread()));
+        }
+        for (Thread thread: threads) {
+            thread.start();
+        }
+        for (Thread thread: threads) {
+            try {
+                thread.join();
+            } catch (InterruptedException e) {
+                fail(e.toString());
+            }
+        }
+
+        // JUnit wont see failures from within the worker threads, so
+        // check again if one occurred.
+        if (assertErr[0] != null) {
+            throw assertErr[0];
+        }
+    }
+
+    @Test
+    public void TestBreakAllChars() {
+        // Make a "word" from each code point, separated by spaces.
+        // For dictionary based breaking, runs the start-of-range
+        // logic with all possible dictionary characters.
+        StringBuilder sb = new StringBuilder();
+        for (int c=0; c<0x110000; ++c) {
+            sb.appendCodePoint(c);
+            sb.appendCodePoint(c);
+            sb.appendCodePoint(c);
+            sb.appendCodePoint(c);
+            sb.append(' ');
+        }
+        String s = sb.toString();
+
+        for (int breakKind=BreakIterator.KIND_CHARACTER; breakKind<=BreakIterator.KIND_TITLE; ++breakKind) {
+            RuleBasedBreakIterator bi =
+                    (RuleBasedBreakIterator)BreakIterator.getBreakInstance(ULocale.ENGLISH, breakKind);
+            bi.setText(s);
+            int lastb = -1;
+            for (int b = bi.first(); b != BreakIterator.DONE; b = bi.next()) {
+                assertTrue("(lastb < b) : (" + lastb + " < " + b + ")", lastb < b);
+            }
+        }
+    }
+}