mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-12873 Race Conditions in RuleBasedBreakIterator.
X-SVN-Rev: 39572
This commit is contained in:
parent
32ca386c27
commit
82dfd8b26e
3 changed files with 205 additions and 61 deletions
|
@ -20,7 +20,8 @@ import java.io.InputStream;
|
|||
import java.io.OutputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.ibm.icu.impl.Assert;
|
||||
import com.ibm.icu.impl.CharTrie;
|
||||
|
@ -48,7 +49,9 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
private RuleBasedBreakIterator() {
|
||||
fLastStatusIndexValid = true;
|
||||
fDictionaryCharCount = 0;
|
||||
fBreakEngines.put(-1, fUnhandledBreakEngine);
|
||||
synchronized(gAllBreakEngines) {
|
||||
fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -134,6 +137,13 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
if (fText != null) {
|
||||
result.fText = (CharacterIterator)(fText.clone());
|
||||
}
|
||||
synchronized (gAllBreakEngines) {
|
||||
result.fBreakEngines = new ArrayList<LanguageBreakEngine>(gAllBreakEngines);
|
||||
}
|
||||
result.fLookAheadMatches = new LookAheadResults();
|
||||
if (fCachedBreakPositions != null) {
|
||||
result.fCachedBreakPositions = fCachedBreakPositions.clone();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -260,10 +270,34 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
* The "default" break engine - just skips over ranges of dictionary words,
|
||||
* producing no breaks. Should only be used if characters need to be handled
|
||||
* by a dictionary but we have no dictionary implementation for them.
|
||||
*
|
||||
* Only one instance; shared by all break iterators.
|
||||
*/
|
||||
private final UnhandledBreakEngine fUnhandledBreakEngine = new UnhandledBreakEngine();
|
||||
private static final UnhandledBreakEngine gUnhandledBreakEngine;
|
||||
|
||||
/**
|
||||
* List of all known break engines, common for all break iterators.
|
||||
* Lazily updated as break engines are needed, because instantiation of
|
||||
* break engines is expensive.
|
||||
*
|
||||
* Because gAllBreakEngines can be referenced concurrently from different
|
||||
* BreakIterator instances, all access is synchronized.
|
||||
*/
|
||||
private static final List<LanguageBreakEngine> gAllBreakEngines;
|
||||
|
||||
static {
|
||||
gUnhandledBreakEngine = new UnhandledBreakEngine();
|
||||
gAllBreakEngines = new ArrayList<LanguageBreakEngine>();
|
||||
gAllBreakEngines.add(gUnhandledBreakEngine);
|
||||
}
|
||||
|
||||
/**
|
||||
* List of all known break engines. Similar to gAllBreakEngines, but local to a
|
||||
* break iterator, allowing it to be used without synchronization.
|
||||
*/
|
||||
private List<LanguageBreakEngine> fBreakEngines;
|
||||
|
||||
/**
|
||||
* when a range of characters is divided up using the dictionary, the break
|
||||
* positions that are discovered are stored here, preventing us from having
|
||||
* to use either the dictionary or the state table again until the iterator
|
||||
|
@ -277,9 +311,6 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
*/
|
||||
private int fPositionInCache;
|
||||
|
||||
|
||||
private final ConcurrentHashMap<Integer, LanguageBreakEngine> fBreakEngines =
|
||||
new ConcurrentHashMap<Integer, LanguageBreakEngine>();
|
||||
/**
|
||||
* Dumps caches and performs other actions associated with a complete change
|
||||
* in text or iteration position.
|
||||
|
@ -1107,26 +1138,32 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
|
||||
// We have a dictionary character.
|
||||
// Does an already instantiated break engine handle it?
|
||||
for (LanguageBreakEngine candidate : fBreakEngines.values()) {
|
||||
for (LanguageBreakEngine candidate : fBreakEngines) {
|
||||
if (candidate.handles(c, fBreakType)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
// if we don't have an existing engine, build one.
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
|
||||
// Katakana, Hiragana and Han are handled by the same dictionary engine.
|
||||
// Fold them together for mapping from script -> engine.
|
||||
script = UScript.HAN;
|
||||
}
|
||||
synchronized (gAllBreakEngines) {
|
||||
// This break iterator's list of break engines didn't handle the character.
|
||||
// Check the global list, another break iterator may have instantiated the
|
||||
// desired engine.
|
||||
for (LanguageBreakEngine candidate : gAllBreakEngines) {
|
||||
if (candidate.handles(c, fBreakType)) {
|
||||
fBreakEngines.add(candidate);
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
LanguageBreakEngine eng = fBreakEngines.get(script);
|
||||
/*
|
||||
if (eng != null && !eng.handles(c, fBreakType)) {
|
||||
fUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = fUnhandledBreakEngine;
|
||||
} else */ {
|
||||
// The global list doesn't have an existing engine, build one.
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
|
||||
// Katakana, Hiragana and Han are handled by the same dictionary engine.
|
||||
// Fold them together for mapping from script -> engine.
|
||||
script = UScript.HAN;
|
||||
}
|
||||
|
||||
LanguageBreakEngine eng;
|
||||
try {
|
||||
switch (script) {
|
||||
case UScript.THAI:
|
||||
|
@ -1146,38 +1183,33 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
|||
eng = new CjkBreakEngine(false);
|
||||
}
|
||||
else {
|
||||
fUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = fUnhandledBreakEngine;
|
||||
gUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = gUnhandledBreakEngine;
|
||||
}
|
||||
break;
|
||||
case UScript.HANGUL:
|
||||
if (getBreakType() == KIND_WORD) {
|
||||
eng = new CjkBreakEngine(true);
|
||||
} else {
|
||||
fUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = fUnhandledBreakEngine;
|
||||
gUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = gUnhandledBreakEngine;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
fUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = fUnhandledBreakEngine;
|
||||
gUnhandledBreakEngine.handleChar(c, getBreakType());
|
||||
eng = gUnhandledBreakEngine;
|
||||
break;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
eng = null;
|
||||
}
|
||||
}
|
||||
|
||||
if (eng != null && eng != fUnhandledBreakEngine) {
|
||||
LanguageBreakEngine existingEngine = fBreakEngines.putIfAbsent(script, eng);
|
||||
if (existingEngine != null) {
|
||||
// There was a race & another thread was first to register an engine for this script.
|
||||
// Use theirs and discard the one we just created.
|
||||
eng = existingEngine;
|
||||
if (eng != null && eng != gUnhandledBreakEngine) {
|
||||
gAllBreakEngines.add(eng);
|
||||
fBreakEngines.add(eng);
|
||||
}
|
||||
// assert eng.handles(c, fBreakType);
|
||||
}
|
||||
return eng;
|
||||
return eng;
|
||||
} // end synchronized(gAllBreakEngines)
|
||||
}
|
||||
|
||||
private static final int kMaxLookaheads = 8;
|
||||
|
|
|
@ -11,6 +11,7 @@ package com.ibm.icu.text;
|
|||
import static com.ibm.icu.impl.CharacterIteration.DONE32;
|
||||
|
||||
import java.text.CharacterIterator;
|
||||
import java.util.concurrent.atomic.AtomicReferenceArray;
|
||||
|
||||
import com.ibm.icu.impl.CharacterIteration;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
|
@ -19,42 +20,70 @@ import com.ibm.icu.lang.UProperty;
|
|||
final class UnhandledBreakEngine implements LanguageBreakEngine {
|
||||
// TODO: Use two arrays of UnicodeSet, one with all frozen sets, one with unfrozen.
|
||||
// in handleChar(), update the unfrozen version, clone, freeze, replace the frozen one.
|
||||
private final UnicodeSet[] fHandled = new UnicodeSet[BreakIterator.KIND_TITLE + 1];
|
||||
|
||||
// Note on concurrency: A single instance of UnhandledBreakEngine is shared across all
|
||||
// RuleBasedBreakIterators in a process. They may make arbitrary concurrent calls.
|
||||
// If handleChar() is updating the set of unhandled characters at the same time
|
||||
// findBreaks() or handles() is referencing it, the referencing functions must see
|
||||
// a consistent set. It doesn't matter whether they see it before or after the update,
|
||||
// but they should not see an inconsistent, changing set.
|
||||
//
|
||||
// To do this, an update is made by cloning the old set, updating the clone, then
|
||||
// replacing the old with the new. Once made visible, each set remains constant.
|
||||
|
||||
// TODO: it's odd that findBreaks() can produce different results, depending
|
||||
// on which scripts have been previously seen by handleChar(). (This is not a
|
||||
// threading specific issue). Possibly stop on script boundaries?
|
||||
|
||||
final AtomicReferenceArray<UnicodeSet> fHandled = new AtomicReferenceArray<UnicodeSet>(BreakIterator.KIND_TITLE + 1);
|
||||
public UnhandledBreakEngine() {
|
||||
for (int i = 0; i < fHandled.length; i++) {
|
||||
fHandled[i] = new UnicodeSet();
|
||||
for (int i = 0; i < fHandled.length(); i++) {
|
||||
fHandled.set(i, new UnicodeSet());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean handles(int c, int breakType) {
|
||||
return (breakType >= 0 && breakType < fHandled.length) &&
|
||||
(fHandled[breakType].contains(c));
|
||||
return (breakType >= 0 && breakType < fHandled.length()) &&
|
||||
(fHandled.get(breakType).contains(c));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int findBreaks(CharacterIterator text, int startPos, int endPos,
|
||||
boolean reverse, int breakType, DictionaryBreakEngine.DequeI foundBreaks) {
|
||||
if (breakType >= 0 && breakType < fHandled.length) {
|
||||
int c = CharacterIteration.current32(text);
|
||||
if (reverse) {
|
||||
while (text.getIndex() > startPos && fHandled[breakType].contains(c)) {
|
||||
CharacterIteration.previous32(text);
|
||||
c = CharacterIteration.current32(text);
|
||||
}
|
||||
} else {
|
||||
while (text.getIndex() < endPos && fHandled[breakType].contains(c)) {
|
||||
CharacterIteration.next32(text);
|
||||
c = CharacterIteration.current32(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (breakType >= 0 && breakType < fHandled.length()) {
|
||||
UnicodeSet uniset = fHandled.get(breakType);
|
||||
int c = CharacterIteration.current32(text);
|
||||
if (reverse) {
|
||||
while (text.getIndex() > startPos && uniset.contains(c)) {
|
||||
CharacterIteration.previous32(text);
|
||||
c = CharacterIteration.current32(text);
|
||||
}
|
||||
} else {
|
||||
while (text.getIndex() < endPos && uniset.contains(c)) {
|
||||
CharacterIteration.next32(text);
|
||||
c = CharacterIteration.current32(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
public synchronized void handleChar(int c, int breakType) {
|
||||
if (breakType >= 0 && breakType < fHandled.length && c != DONE32) {
|
||||
if (!fHandled[breakType].contains(c)) {
|
||||
/**
|
||||
* Update the set of unhandled characters for the specified breakType to include
|
||||
* all that have the same script as c.
|
||||
* May be called concurrently with handles() or findBreaks().
|
||||
* Must not be called concurrently with itself.
|
||||
*/
|
||||
public void handleChar(int c, int breakType) {
|
||||
if (breakType >= 0 && breakType < fHandled.length() && c != DONE32) {
|
||||
UnicodeSet originalSet = fHandled.get(breakType);
|
||||
if (!originalSet.contains(c)) {
|
||||
int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
|
||||
fHandled[breakType].applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
UnicodeSet newSet = new UnicodeSet();
|
||||
newSet.applyIntPropertyValue(UProperty.SCRIPT, script);
|
||||
newSet.addAll(originalSet);
|
||||
fHandled.set(breakType, newSet);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -851,6 +851,89 @@ public class RBBITest extends TestFmwk {
|
|||
bi.setText("abc");
|
||||
bi.first();
|
||||
assertEquals("Rule chaining test", 3, bi.next());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void TestBug12873() {
|
||||
// Bug with RuleBasedBreakIterator's internal structure for recording potential look-ahead
|
||||
// matches not being cloned when a break iterator is cloned. This resulted in usage
|
||||
// collisions if the original break iterator and its clone were used concurrently.
|
||||
|
||||
// The Line Break rules for Regional Indicators make use of look-ahead rules, and
|
||||
// show the bug. 1F1E6 = \uD83C\uDDE6 = REGIONAL INDICATOR SYMBOL LETTER A
|
||||
// Regional indicators group into pairs, expect breaks after two code points, which
|
||||
// is after four 16 bit code units.
|
||||
|
||||
final String dataToBreak = "\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6\uD83C\uDDE6";
|
||||
final RuleBasedBreakIterator bi = (RuleBasedBreakIterator)BreakIterator.getLineInstance();
|
||||
final AssertionError[] assertErr = new AssertionError[1]; // saves an error found from within a thread
|
||||
|
||||
class WorkerThread implements Runnable {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
RuleBasedBreakIterator localBI = (RuleBasedBreakIterator)bi.clone();
|
||||
localBI.setText(dataToBreak);
|
||||
for (int loop=0; loop<100; loop++) {
|
||||
int nextExpectedBreak = 0;
|
||||
for (int actualBreak = localBI.first(); actualBreak != BreakIterator.DONE;
|
||||
actualBreak = localBI.next(), nextExpectedBreak+= 4) {
|
||||
assertEquals("", nextExpectedBreak, actualBreak);
|
||||
}
|
||||
assertEquals("", dataToBreak.length()+4, nextExpectedBreak);
|
||||
}
|
||||
} catch (AssertionError e) {
|
||||
assertErr[0] = e;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<Thread> threads = new ArrayList<Thread>();
|
||||
for (int n = 0; n<4; ++n) {
|
||||
threads.add(new Thread(new WorkerThread()));
|
||||
}
|
||||
for (Thread thread: threads) {
|
||||
thread.start();
|
||||
}
|
||||
for (Thread thread: threads) {
|
||||
try {
|
||||
thread.join();
|
||||
} catch (InterruptedException e) {
|
||||
fail(e.toString());
|
||||
}
|
||||
}
|
||||
|
||||
// JUnit wont see failures from within the worker threads, so
|
||||
// check again if one occurred.
|
||||
if (assertErr[0] != null) {
|
||||
throw assertErr[0];
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestBreakAllChars() {
|
||||
// Make a "word" from each code point, separated by spaces.
|
||||
// For dictionary based breaking, runs the start-of-range
|
||||
// logic with all possible dictionary characters.
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int c=0; c<0x110000; ++c) {
|
||||
sb.appendCodePoint(c);
|
||||
sb.appendCodePoint(c);
|
||||
sb.appendCodePoint(c);
|
||||
sb.appendCodePoint(c);
|
||||
sb.append(' ');
|
||||
}
|
||||
String s = sb.toString();
|
||||
|
||||
for (int breakKind=BreakIterator.KIND_CHARACTER; breakKind<=BreakIterator.KIND_TITLE; ++breakKind) {
|
||||
RuleBasedBreakIterator bi =
|
||||
(RuleBasedBreakIterator)BreakIterator.getBreakInstance(ULocale.ENGLISH, breakKind);
|
||||
bi.setText(s);
|
||||
int lastb = -1;
|
||||
for (int b = bi.first(); b != BreakIterator.DONE; b = bi.next()) {
|
||||
assertTrue("(lastb < b) : (" + lastb + " < " + b + ")", lastb < b);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue