ICU-12873 Race Conditions in RuleBasedBreakIterator.

X-SVN-Rev: 39572
This commit is contained in:
Andy Heninger 2017-01-17 23:13:25 +00:00
parent 32ca386c27
commit 82dfd8b26e
3 changed files with 205 additions and 61 deletions

View file

@ -20,7 +20,8 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.concurrent.ConcurrentHashMap;
import java.util.ArrayList;
import java.util.List;
import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.CharTrie;
@ -48,7 +49,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
private RuleBasedBreakIterator() {
fLastStatusIndexValid = true;
fDictionaryCharCount = 0;
fBreakEngines.put(-1, fUnhandledBreakEngine);
synchronized(gAllBreakEngines) {
fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
}
}
/**
@ -134,6 +137,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
if (fText != null) {
result.fText = (CharacterIterator)(fText.clone());
}
synchronized (gAllBreakEngines) {
result.fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
}
result.fLookAheadMatches = new LookAheadResults();
if (fCachedBreakPositions != null) {
result.fCachedBreakPositions = fCachedBreakPositions.clone();
}
return result;
}
@ -260,10 +270,34 @@ public class RuleBasedBreakIterator extends BreakIterator {
* The "default" break engine - just skips over ranges of dictionary words,
* producing no breaks. Should only be used if characters need to be handled
* by a dictionary but we have no dictionary implementation for them.
*
* Only one instance; shared by all break iterators.
*/
private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
private static final UnhandledBreakEngine gUnhandledBreakEngine;
/**
* List of all known break engines, common for all break iterators.
* Lazily updated as break engines are needed, because instantiation of
* break engines is expensive.
*
* Because gAllBreakEngines can be referenced concurrently from different
* BreakIterator instances, all access is synchronized.
*/
private static final List<LanguageBreakEngine> gAllBreakEngines;
static {
gUnhandledBreakEngine = new UnhandledBreakEngine();
gAllBreakEngines = new ArrayList<LanguageBreakEngine>();
gAllBreakEngines.add(gUnhandledBreakEngine);
}
/**
* List of all known break engines. Similar to gAllBreakEngines, but local to a
* break iterator, allowing it to be used without synchronization.
*/
private List<LanguageBreakEngine> fBreakEngines;
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
@ -277,9 +311,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
*/
private int fPositionInCache;
private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
new ConcurrentHashMap<Integer, LanguageBreakEngine>();
/**
* Dumps caches and performs other actions associated with a complete change
* in text or iteration position.
@ -1107,26 +1138,32 @@ public class RuleBasedBreakIterator extends BreakIterator {
// We have a dictionary character.
// Does an already instantiated break engine handle it?
for (LanguageBreakEngine candidate : fBreakEngines.values()) {
for (LanguageBreakEngine candidate : fBreakEngines) {
if (candidate.handles(c, fBreakType)) {
return candidate;
}
}
// if we don't have an existing engine, build one.
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
// Katakana, Hiragana and Han are handled by the same dictionary engine.
// Fold them together for mapping from script -> engine.
script = UScript.HAN;
}
synchronized (gAllBreakEngines) {
// This break iterator's list of break engines didn't handle the character.
// Check the global list, another break iterator may have instantiated the
// desired engine.
for (LanguageBreakEngine candidate : gAllBreakEngines) {
if (candidate.handles(c, fBreakType)) {
fBreakEngines.add(candidate);
return candidate;
}
}
LanguageBreakEngine eng = fBreakEngines.get(script);
/*
if (eng != null && !eng.handles(c, fBreakType)) {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
} else */ {
// The global list doesn't have an existing engine, build one.
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
// Katakana, Hiragana and Han are handled by the same dictionary engine.
// Fold them together for mapping from script -> engine.
script = UScript.HAN;
}
LanguageBreakEngine eng;
try {
switch (script) {
case UScript.THAI:
@ -1146,38 +1183,33 @@ public class RuleBasedBreakIterator extends BreakIterator {
eng = new CjkBreakEngine(false);
}
else {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
gUnhandledBreakEngine.handleChar(c, getBreakType());
eng = gUnhandledBreakEngine;
}
break;
case UScript.HANGUL:
if (getBreakType() == KIND_WORD) {
eng = new CjkBreakEngine(true);
} else {
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
gUnhandledBreakEngine.handleChar(c, getBreakType());
eng = gUnhandledBreakEngine;
}
break;
default:
fUnhandledBreakEngine.handleChar(c, getBreakType());
eng = fUnhandledBreakEngine;
gUnhandledBreakEngine.handleChar(c, getBreakType());
eng = gUnhandledBreakEngine;
break;
}
} catch (IOException e) {
eng = null;
}
}
if (eng != null && eng != fUnhandledBreakEngine) {
LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
if (existingEngine != null) {
// There was a race & another thread was first to register an engine for this script.
// Use theirs and discard the one we just created.
eng = existingEngine;
if (eng != null && eng != gUnhandledBreakEngine) {
gAllBreakEngines.add(eng);
fBreakEngines.add(eng);
}
// assert eng.handles(c, fBreakType);
}
return eng;
return eng;
} // end synchronized(gAllBreakEngines)
}
private static final int kMaxLookaheads = 8;

View file

@ -11,6 +11,7 @@ package com.ibm.icu.text;
import static com.ibm.icu.impl.CharacterIteration.DONE32;
import java.text.CharacterIterator;
import java.util.concurrent.atomic.AtomicReferenceArray;
import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.lang.UCharacter;
@ -19,42 +20,70 @@ import com.ibm.icu.lang.UProperty;
final class UnhandledBreakEngine implements LanguageBreakEngine {
// TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
// in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
// Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
// RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls.
// If handleChar() is updating the set of unhandled characters at the same time
// findBreaks() or handles() is referencing it, the referencing functions must see
// a consistent set. It doesn't matter whether they see it before or after the update,
// but they should not see an inconsistent, changing set.
//
// To do this, an update is made by cloning the old set, updating the clone, then
// replacing the old with the new. Once made visible, each set remains constant.
// TODO: it's odd that findBreaks() can produce different results, depending
// on which scripts have been previously seen by handleChar(). (This is not a
// threading specific issue). Possibly stop on script boundaries?
final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
public UnhandledBreakEngine() {
for (int i = 0; i < fHandled.length; i++) {
fHandled[i] = new UnicodeSet();
for (int i = 0; i < fHandled.length(); i++) {
fHandled.set(i, new UnicodeSet());
}
}
@Override
public boolean handles(int c, int breakType) {
return (breakType >= 0 && breakType < fHandled.length) &&
(fHandled[breakType].contains(c));
return (breakType >= 0 && breakType < fHandled.length()) &&
(fHandled.get(breakType).contains(c));
}
@Override
public int findBreaks(CharacterIterator text, int startPos, int endPos,
boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
if (breakType >= 0 && breakType < fHandled.length) {
int c = CharacterIteration.current32(text);
if (reverse) {
while (text.getIndex() > startPos && fHandled[breakType].contains(c)) {
CharacterIteration.previous32(text);
c = CharacterIteration.current32(text);
}
} else {
while (text.getIndex() < endPos && fHandled[breakType].contains(c)) {
CharacterIteration.next32(text);
c = CharacterIteration.current32(text);
}
}
}
if (breakType >= 0 && breakType < fHandled.length()) {
UnicodeSet uniset = fHandled.get(breakType);
int c = CharacterIteration.current32(text);
if (reverse) {
while (text.getIndex() > startPos && uniset.contains(c)) {
CharacterIteration.previous32(text);
c = CharacterIteration.current32(text);
}
} else {
while (text.getIndex() < endPos && uniset.contains(c)) {
CharacterIteration.next32(text);
c = CharacterIteration.current32(text);
}
}
}
return 0;
}
public synchronized void handleChar(int c, int breakType) {
if (breakType >= 0 && breakType < fHandled.length && c != DONE32) {
if (!fHandled[breakType].contains(c)) {
/**
* Update the set of unhandled characters for the specified breakType to include
* all that have the same script as c.
* May be called concurrently with handles() or findBreaks().
* Must not be called concurrently with itself.
*/
public void handleChar(int c, int breakType) {
if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
UnicodeSet originalSet = fHandled.get(breakType);
if (!originalSet.contains(c)) {
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script);
UnicodeSet newSet = new UnicodeSet();
newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
newSet.addAll(originalSet);
fHandled.set(breakType, newSet);
}
}
}

View file

@ -851,6 +851,89 @@ public class RBBITest extends TestFmwk {
bi.setText("abc");
bi.first();
assertEquals("Rule chaining test", 3, bi.next());
}
}
@Test
public void TestBug12873() {
// Bug with RuleBasedBreakIterator's internal structure for recording potential look-ahead
// matches not being cloned when a break iterator is cloned. This resulted in usage
// collisions if the original break iterator and its clone were used concurrently.
// The Line Break rules for Regional Indicators make use of look-ahead rules, and
// show the bug. 1F1E6 = \uD83C\uDDE6 = REGIONAL INDICATOR SYMBOL LETTER A
// Regional indicators group into pairs, expect breaks after two code points, which
// is after four 16 bit code units.
final String dataToBreak = "\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6";
final RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance();
final AssertionError[] assertErr = new AssertionError[1]; // saves an error found from within a thread
class WorkerThread implements Runnable {
@Override
public void run() {
try {
RuleBasedBreakIterator localBI = (RuleBasedBreakIterator)bi.clone();
localBI.setText(dataToBreak);
for (int loop=0; loop<100; loop++) {
int nextExpectedBreak = 0;
for (int actualBreak = localBI.first(); actualBreak != BreakIterator.DONE;
actualBreak = localBI.next(), nextExpectedBreak+= 4) {
assertEquals("", nextExpectedBreak, actualBreak);
}
assertEquals("", dataToBreak.length()+4, nextExpectedBreak);
}
} catch (AssertionError e) {
assertErr[0] = e;
}
}
}
List<Thread> threads = new ArrayList<Thread>();
for (int n = 0; n<4; ++n) {
threads.add(new Thread(new WorkerThread()));
}
for (Thread thread: threads) {
thread.start();
}
for (Thread thread: threads) {
try {
thread.join();
} catch (InterruptedException e) {
fail(e.toString());
}
}
// JUnit wont see failures from within the worker threads, so
// check again if one occurred.
if (assertErr[0] != null) {
throw assertErr[0];
}
}
@Test
public void TestBreakAllChars() {
// Make a "word" from each code point, separated by spaces.
// For dictionary based breaking, runs the start-of-range
// logic with all possible dictionary characters.
StringBuilder sb = new StringBuilder();
for (int c=0; c<0x110000; ++c) {
sb.appendCodePoint(c);
sb.appendCodePoint(c);
sb.appendCodePoint(c);
sb.appendCodePoint(c);
sb.append(' ');
}
String s = sb.toString();
for (int breakKind=BreakIterator.KIND_CHARACTER; breakKind<=BreakIterator.KIND_TITLE; ++breakKind) {
RuleBasedBreakIterator bi =
(RuleBasedBreakIterator)BreakIterator.getBreakInstance(ULocale.ENGLISH, breakKind);
bi.setText(s);
int lastb = -1;
for (int b = bi.first(); b != BreakIterator.DONE; b = bi.next()) {
assertTrue("(lastb < b) : (" + lastb + " < " + b + ")", lastb < b);
}
}
}
}