ICU-21459 properly guard BytesTrie.Result.getValue()

and clone/copy objects so that objects shared among instances are not modified
and use an atomic int for the C++ refcount
This commit is contained in:
Markus Scherer 2021-04-05 15:57:33 -07:00
parent 7fd2844d10
commit 152867f7ab
4 changed files with 103 additions and 48 deletions

View file

@ -20,6 +20,7 @@
#include "ubrkimpl.h" // U_ICUDATA_BRKITR
#include "uvector.h"
#include "cmemory.h"
#include "umutex.h"
U_NAMESPACE_BEGIN
@ -139,13 +140,30 @@ class SimpleFilteredSentenceBreakData : public UMemory {
public:
SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
: fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
virtual ~SimpleFilteredSentenceBreakData();
SimpleFilteredSentenceBreakData *incr() {
umtx_atomic_inc(&refcount);
return this;
}
SimpleFilteredSentenceBreakData *decr() {
if(umtx_atomic_dec(&refcount) <= 0) {
delete this;
}
return 0;
}
virtual ~SimpleFilteredSentenceBreakData();
LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
int32_t refcount;
bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
private:
// These tries own their data arrays.
// They are shared and must therefore not be modified.
LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
u_atomic_int32_t refcount;
};
SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
@ -244,7 +262,13 @@ SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIt
fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
fDelegate(adopt)
{
// all set..
if (fData == nullptr) {
delete forwards;
delete backwards;
if (U_SUCCESS(status)) {
status = U_MEMORY_ALLOCATION_ERROR;
}
}
}
SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
@ -261,59 +285,62 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
int32_t bestValue = -1;
// loops while 'n' points to an exception.
utext_setNativeIndex(fText.getAlias(), n); // from n..
fData->fBackwardsTrie->reset();
UChar32 uch;
//if(debug2) u_printf(" n@ %d\n", n);
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here??
// TODO only do this the 1st time?
//if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
} else {
//if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
uch = utext_next32(fText.getAlias());
utext_next32(fText.getAlias());
//if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
}
UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
bestPosn = utext_getNativeIndex(fText.getAlias());
bestValue = fData->fBackwardsTrie->getValue();
}
//if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
{
// Do not modify the shared trie!
UCharsTrie iter(fData->getBackwardsTrie());
UChar32 uch;
while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards
UStringTrieResult r = iter.nextForCodePoint(uch);
if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
bestPosn = utext_getNativeIndex(fText.getAlias());
bestValue = iter.getValue();
}
if(!USTRINGTRIE_HAS_NEXT(r)) {
break;
}
//if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
}
}
if(USTRINGTRIE_MATCHES(r)) { // exact match?
//if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
bestValue = fData->fBackwardsTrie->getValue();
bestPosn = utext_getNativeIndex(fText.getAlias());
//if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
}
//if(bestValue >= 0) {
//if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
//}
if(bestPosn>=0) {
//if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
//if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
//int32_t bestValue = fBackwardsTrie->getValue();
//int32_t bestValue = iter.getValue();
////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
if(bestValue == kMATCH) { // exact match!
//if(debug2) u_printf(" exact backward match\n");
return kExceptionHere; // See if the next is another exception.
} else if(bestValue == kPARTIAL
&& fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
&& fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
//if(debug2) u_printf(" partial backward match\n");
// We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
// to see if it matches something going forward.
fData->fForwardsPartialTrie->reset();
UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
//if(debug2) u_printf("Retrying at %d\n", bestPosn);
// Do not modify the shared trie!
UCharsTrie iter(fData->getForwardsPartialTrie());
UChar32 uch;
while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
//if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
}
if(USTRINGTRIE_MATCHES(rfwd)) {
@ -339,7 +366,7 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
int32_t
SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
if(n == UBRK_DONE || // at end or
fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
!fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
return n;
}
// OK, do we need to break here?
@ -369,7 +396,7 @@ SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
int32_t
SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
if(n == 0 || n == UBRK_DONE || // at end or
fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
!fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
return n;
}
// OK, do we need to break here?
@ -420,7 +447,7 @@ SimpleFilteredSentenceBreakIterator::previous(void) {
UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
UErrorCode status = U_ZERO_ERROR;
resetState(status);

View file

@ -62,6 +62,11 @@
<data>\
•Doctor with a D. •As in, Ph.D., you know.•</data>
# ICU-21459 logic error.
<locale en@ss=standard>
<sent>
<data>•on. •But after a day in the arena sun, the metal feels hot enough to blister my hands.•</data>
# same as root (unless some exceptions are added!)
<locale tfg@ss=standard>
<sent>

View file

@ -19,6 +19,7 @@ import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.util.BytesTrie;
import com.ibm.icu.util.CharsTrie;
import com.ibm.icu.util.CharsTrieBuilder;
import com.ibm.icu.util.ICUCloneNotSupportedException;
import com.ibm.icu.util.StringTrieBuilder;
import com.ibm.icu.util.ULocale;
@ -72,8 +73,6 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
backwardsTrie.reset();
int uch;
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
// TODO only do this the 1st time?
@ -81,20 +80,17 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
uch = text.nextCodePoint();
}
BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
while ((uch = text.previousCodePoint()) >= 0) { // more to consume backwards
BytesTrie.Result r = backwardsTrie.nextForCodePoint(uch);
if (r.hasValue()) { // remember the best match so far
bestPosn = text.getIndex();
bestValue = backwardsTrie.getValue();
}
if (!r.hasNext()) {
break;
}
}
if (r.matches()) { // exact match?
bestValue = backwardsTrie.getValue();
bestPosn = text.getIndex();
}
backwardsTrie.reset(); // for equals() & hashCode()
if (bestPosn >= 0) {
if (bestValue == Builder.MATCH) { // exact match!
@ -110,6 +106,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
while ((uch = text.nextCodePoint()) != BreakIterator.DONE
&& ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
}
forwardsPartialTrie.reset(); // for equals() & hashCode()
if (rfwd.matches()) {
// Exception here
return true;
@ -186,18 +183,39 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
if (getClass() != obj.getClass())
return false;
SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj;
return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie)
// TODO(ICU-21575): CharsTrie.equals() is not defined.
// Should compare the underlying data, and can then stop resetting after iteration.
return delegate.equals(other.delegate) && text.equals(other.text)
&& backwardsTrie.equals(other.backwardsTrie)
&& forwardsPartialTrie.equals(other.forwardsPartialTrie);
}
@Override
public int hashCode() {
return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode();
// TODO(ICU-21575): CharsTrie.hashCode() is not defined.
return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11)
+ delegate.hashCode();
}
@Override
public Object clone() {
SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone();
try {
if (delegate != null) {
other.delegate = (BreakIterator) delegate.clone();
}
if (text != null) {
other.text = (UCharacterIterator) text.clone();
}
if (backwardsTrie != null) {
other.backwardsTrie = backwardsTrie.clone();
}
if (forwardsPartialTrie != null) {
other.forwardsPartialTrie = forwardsPartialTrie.clone();
}
} catch (CloneNotSupportedException e) {
throw new ICUCloneNotSupportedException(e);
}
return other;
}
@ -273,7 +291,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
/**
* filter set to store all exceptions
*/
private HashSet<CharSequence> filterSet = new HashSet<CharSequence>();
private HashSet<CharSequence> filterSet = new HashSet<>();
static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie
static final int MATCH = (1 << 1); // < exact match - skip this one.

View file

@ -32,7 +32,7 @@
# [ICU4C] source/test/testdata/rbbitst.txt
# [ICU4J] main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
#
# ICU4C's copy is the master. If any changes are made to ICU4J's copy, make sure they
# ICU4C's copy is the primary one. If any changes are made to ICU4J's copy, make sure they
# are merged back into ICU4C's copy of the file, lest they get overwritten later.
# TODO: figure out how to have a single copy of the file for use by both C and Java.
@ -62,6 +62,11 @@
<data>\
•Doctor with a D. •As in, Ph.D., you know.•</data>
# ICU-21459 logic error.
<locale en@ss=standard>
<sent>
<data>•on. •But after a day in the arena sun, the metal feels hot enough to blister my hands.•</data>
# same as root (unless some exceptions are added!)
<locale tfg@ss=standard>
<sent>