mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-21459 properly guard BytesTrie.Result.getValue()
and clone/copy objects so that objects shared among instances are not modified and use an atomic int for the C++ refcount
This commit is contained in:
parent
7fd2844d10
commit
152867f7ab
4 changed files with 103 additions and 48 deletions
|
@ -20,6 +20,7 @@
|
|||
#include "ubrkimpl.h" // U_ICUDATA_BRKITR
|
||||
#include "uvector.h"
|
||||
#include "cmemory.h"
|
||||
#include "umutex.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -139,13 +140,30 @@ class SimpleFilteredSentenceBreakData : public UMemory {
|
|||
public:
|
||||
SimpleFilteredSentenceBreakData(UCharsTrie *forwards, UCharsTrie *backwards )
|
||||
: fForwardsPartialTrie(forwards), fBackwardsTrie(backwards), refcount(1) { }
|
||||
SimpleFilteredSentenceBreakData *incr() { refcount++; return this; }
|
||||
SimpleFilteredSentenceBreakData *decr() { if((--refcount) <= 0) delete this; return 0; }
|
||||
virtual ~SimpleFilteredSentenceBreakData();
|
||||
SimpleFilteredSentenceBreakData *incr() {
|
||||
umtx_atomic_inc(&refcount);
|
||||
return this;
|
||||
}
|
||||
SimpleFilteredSentenceBreakData *decr() {
|
||||
if(umtx_atomic_dec(&refcount) <= 0) {
|
||||
delete this;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
virtual ~SimpleFilteredSentenceBreakData();
|
||||
|
||||
LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
|
||||
LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
|
||||
int32_t refcount;
|
||||
bool hasForwardsPartialTrie() const { return fForwardsPartialTrie.isValid(); }
|
||||
bool hasBackwardsTrie() const { return fBackwardsTrie.isValid(); }
|
||||
|
||||
const UCharsTrie &getForwardsPartialTrie() const { return *fForwardsPartialTrie; }
|
||||
const UCharsTrie &getBackwardsTrie() const { return *fBackwardsTrie; }
|
||||
|
||||
private:
|
||||
// These tries own their data arrays.
|
||||
// They are shared and must therefore not be modified.
|
||||
LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
|
||||
LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
|
||||
u_atomic_int32_t refcount;
|
||||
};
|
||||
|
||||
SimpleFilteredSentenceBreakData::~SimpleFilteredSentenceBreakData() {}
|
||||
|
@ -244,7 +262,13 @@ SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIt
|
|||
fData(new SimpleFilteredSentenceBreakData(forwards, backwards)),
|
||||
fDelegate(adopt)
|
||||
{
|
||||
// all set..
|
||||
if (fData == nullptr) {
|
||||
delete forwards;
|
||||
delete backwards;
|
||||
if (U_SUCCESS(status)) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SimpleFilteredSentenceBreakIterator::~SimpleFilteredSentenceBreakIterator() {
|
||||
|
@ -261,59 +285,62 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
|
|||
int32_t bestValue = -1;
|
||||
// loops while 'n' points to an exception.
|
||||
utext_setNativeIndex(fText.getAlias(), n); // from n..
|
||||
fData->fBackwardsTrie->reset();
|
||||
UChar32 uch;
|
||||
|
||||
//if(debug2) u_printf(" n@ %d\n", n);
|
||||
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
|
||||
if((uch=utext_previous32(fText.getAlias()))==(UChar32)0x0020) { // TODO: skip a class of chars here??
|
||||
if(utext_previous32(fText.getAlias())==u' ') { // TODO: skip a class of chars here??
|
||||
// TODO only do this the 1st time?
|
||||
//if(debug2) u_printf("skipping prev: |%C| \n", (UChar)uch);
|
||||
} else {
|
||||
//if(debug2) u_printf("not skipping prev: |%C| \n", (UChar)uch);
|
||||
uch = utext_next32(fText.getAlias());
|
||||
utext_next32(fText.getAlias());
|
||||
//if(debug2) u_printf(" -> : |%C| \n", (UChar)uch);
|
||||
}
|
||||
|
||||
UStringTrieResult r = USTRINGTRIE_INTERMEDIATE_VALUE;
|
||||
|
||||
while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL && // more to consume backwards and..
|
||||
USTRINGTRIE_HAS_NEXT(r=fData->fBackwardsTrie->nextForCodePoint(uch))) {// more in the trie
|
||||
if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
|
||||
bestPosn = utext_getNativeIndex(fText.getAlias());
|
||||
bestValue = fData->fBackwardsTrie->getValue();
|
||||
}
|
||||
//if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
|
||||
{
|
||||
// Do not modify the shared trie!
|
||||
UCharsTrie iter(fData->getBackwardsTrie());
|
||||
UChar32 uch;
|
||||
while((uch=utext_previous32(fText.getAlias()))!=U_SENTINEL) { // more to consume backwards
|
||||
UStringTrieResult r = iter.nextForCodePoint(uch);
|
||||
if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
|
||||
bestPosn = utext_getNativeIndex(fText.getAlias());
|
||||
bestValue = iter.getValue();
|
||||
}
|
||||
if(!USTRINGTRIE_HAS_NEXT(r)) {
|
||||
break;
|
||||
}
|
||||
//if(debug2) u_printf("rev< /%C/ cont?%d @%d\n", (UChar)uch, r, utext_getNativeIndex(fText.getAlias()));
|
||||
}
|
||||
}
|
||||
|
||||
if(USTRINGTRIE_MATCHES(r)) { // exact match?
|
||||
//if(debug2) u_printf("rev<?/%C/?end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
|
||||
bestValue = fData->fBackwardsTrie->getValue();
|
||||
bestPosn = utext_getNativeIndex(fText.getAlias());
|
||||
//if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
|
||||
}
|
||||
//if(bestValue >= 0) {
|
||||
//if(debug2) u_printf("rev<+/%C/+end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
|
||||
//}
|
||||
|
||||
if(bestPosn>=0) {
|
||||
//if(debug2) u_printf("rev< /%C/ end of seq.. r=%d, bestPosn=%d, bestValue=%d\n", (UChar)uch, r, bestPosn, bestValue);
|
||||
|
||||
//if(USTRINGTRIE_MATCHES(r)) { // matched - so, now what?
|
||||
//int32_t bestValue = fBackwardsTrie->getValue();
|
||||
//int32_t bestValue = iter.getValue();
|
||||
////if(debug2) u_printf("rev< /%C/ matched, skip..%d bestValue=%d\n", (UChar)uch, r, bestValue);
|
||||
|
||||
if(bestValue == kMATCH) { // exact match!
|
||||
//if(debug2) u_printf(" exact backward match\n");
|
||||
return kExceptionHere; // See if the next is another exception.
|
||||
} else if(bestValue == kPARTIAL
|
||||
&& fData->fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
|
||||
&& fData->hasForwardsPartialTrie()) { // make sure there's a forward trie
|
||||
//if(debug2) u_printf(" partial backward match\n");
|
||||
// We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
|
||||
// to see if it matches something going forward.
|
||||
fData->fForwardsPartialTrie->reset();
|
||||
UStringTrieResult rfwd = USTRINGTRIE_INTERMEDIATE_VALUE;
|
||||
utext_setNativeIndex(fText.getAlias(), bestPosn); // hope that's close ..
|
||||
//if(debug2) u_printf("Retrying at %d\n", bestPosn);
|
||||
// Do not modify the shared trie!
|
||||
UCharsTrie iter(fData->getForwardsPartialTrie());
|
||||
UChar32 uch;
|
||||
while((uch=utext_next32(fText.getAlias()))!=U_SENTINEL &&
|
||||
USTRINGTRIE_HAS_NEXT(rfwd=fData->fForwardsPartialTrie->nextForCodePoint(uch))) {
|
||||
USTRINGTRIE_HAS_NEXT(rfwd=iter.nextForCodePoint(uch))) {
|
||||
//if(debug2) u_printf("fwd> /%C/ cont?%d @%d\n", (UChar)uch, rfwd, utext_getNativeIndex(fText.getAlias()));
|
||||
}
|
||||
if(USTRINGTRIE_MATCHES(rfwd)) {
|
||||
|
@ -339,7 +366,7 @@ SimpleFilteredSentenceBreakIterator::breakExceptionAt(int32_t n) {
|
|||
int32_t
|
||||
SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
|
||||
if(n == UBRK_DONE || // at end or
|
||||
fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
|
||||
!fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
|
||||
return n;
|
||||
}
|
||||
// OK, do we need to break here?
|
||||
|
@ -369,7 +396,7 @@ SimpleFilteredSentenceBreakIterator::internalNext(int32_t n) {
|
|||
int32_t
|
||||
SimpleFilteredSentenceBreakIterator::internalPrev(int32_t n) {
|
||||
if(n == 0 || n == UBRK_DONE || // at end or
|
||||
fData->fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
|
||||
!fData->hasBackwardsTrie()) { // .. no backwards table loaded == no exceptions
|
||||
return n;
|
||||
}
|
||||
// OK, do we need to break here?
|
||||
|
@ -420,7 +447,7 @@ SimpleFilteredSentenceBreakIterator::previous(void) {
|
|||
UBool SimpleFilteredSentenceBreakIterator::isBoundary(int32_t offset) {
|
||||
if (!fDelegate->isBoundary(offset)) return false; // no break to suppress
|
||||
|
||||
if (fData->fBackwardsTrie.isNull()) return true; // no data = no suppressions
|
||||
if (!fData->hasBackwardsTrie()) return true; // no data = no suppressions
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
resetState(status);
|
||||
|
|
5
icu4c/source/test/testdata/rbbitst.txt
vendored
5
icu4c/source/test/testdata/rbbitst.txt
vendored
|
@ -62,6 +62,11 @@
|
|||
<data>\
|
||||
•Doctor with a D. •As in, Ph.D., you know.•</data>
|
||||
|
||||
# ICU-21459 logic error.
|
||||
<locale en@ss=standard>
|
||||
<sent>
|
||||
<data>•on. •But after a day in the arena sun, the metal feels hot enough to blister my hands.•</data>
|
||||
|
||||
# same as root (unless some exceptions are added!)
|
||||
<locale tfg@ss=standard>
|
||||
<sent>
|
||||
|
|
|
@ -19,6 +19,7 @@ import com.ibm.icu.text.UCharacterIterator;
|
|||
import com.ibm.icu.util.BytesTrie;
|
||||
import com.ibm.icu.util.CharsTrie;
|
||||
import com.ibm.icu.util.CharsTrieBuilder;
|
||||
import com.ibm.icu.util.ICUCloneNotSupportedException;
|
||||
import com.ibm.icu.util.StringTrieBuilder;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
|
@ -72,8 +73,6 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
|
|||
backwardsTrie.reset();
|
||||
int uch;
|
||||
|
||||
|
||||
|
||||
// Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
|
||||
if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
|
||||
// TODO only do this the 1st time?
|
||||
|
@ -81,20 +80,17 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
|
|||
uch = text.nextCodePoint();
|
||||
}
|
||||
|
||||
BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
|
||||
|
||||
while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
|
||||
((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
|
||||
while ((uch = text.previousCodePoint()) >= 0) { // more to consume backwards
|
||||
BytesTrie.Result r = backwardsTrie.nextForCodePoint(uch);
|
||||
if (r.hasValue()) { // remember the best match so far
|
||||
bestPosn = text.getIndex();
|
||||
bestValue = backwardsTrie.getValue();
|
||||
}
|
||||
if (!r.hasNext()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (r.matches()) { // exact match?
|
||||
bestValue = backwardsTrie.getValue();
|
||||
bestPosn = text.getIndex();
|
||||
}
|
||||
backwardsTrie.reset(); // for equals() & hashCode()
|
||||
|
||||
if (bestPosn >= 0) {
|
||||
if (bestValue == Builder.MATCH) { // exact match!
|
||||
|
@ -110,6 +106,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
|
|||
while ((uch = text.nextCodePoint()) != BreakIterator.DONE
|
||||
&& ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
|
||||
}
|
||||
forwardsPartialTrie.reset(); // for equals() & hashCode()
|
||||
if (rfwd.matches()) {
|
||||
// Exception here
|
||||
return true;
|
||||
|
@ -186,18 +183,39 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
|
|||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj;
|
||||
return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie)
|
||||
// TODO(ICU-21575): CharsTrie.equals() is not defined.
|
||||
// Should compare the underlying data, and can then stop resetting after iteration.
|
||||
return delegate.equals(other.delegate) && text.equals(other.text)
|
||||
&& backwardsTrie.equals(other.backwardsTrie)
|
||||
&& forwardsPartialTrie.equals(other.forwardsPartialTrie);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode();
|
||||
// TODO(ICU-21575): CharsTrie.hashCode() is not defined.
|
||||
return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11)
|
||||
+ delegate.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object clone() {
|
||||
SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone();
|
||||
try {
|
||||
if (delegate != null) {
|
||||
other.delegate = (BreakIterator) delegate.clone();
|
||||
}
|
||||
if (text != null) {
|
||||
other.text = (UCharacterIterator) text.clone();
|
||||
}
|
||||
if (backwardsTrie != null) {
|
||||
other.backwardsTrie = backwardsTrie.clone();
|
||||
}
|
||||
if (forwardsPartialTrie != null) {
|
||||
other.forwardsPartialTrie = forwardsPartialTrie.clone();
|
||||
}
|
||||
} catch (CloneNotSupportedException e) {
|
||||
throw new ICUCloneNotSupportedException(e);
|
||||
}
|
||||
return other;
|
||||
}
|
||||
|
||||
|
@ -273,7 +291,7 @@ public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
|
|||
/**
|
||||
* filter set to store all exceptions
|
||||
*/
|
||||
private HashSet<CharSequence> filterSet = new HashSet<CharSequence>();
|
||||
private HashSet<CharSequence> filterSet = new HashSet<>();
|
||||
|
||||
static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie
|
||||
static final int MATCH = (1 << 1); // < exact match - skip this one.
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
# [ICU4C] source/test/testdata/rbbitst.txt
|
||||
# [ICU4J] main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
|
||||
#
|
||||
# ICU4C's copy is the master. If any changes are made to ICU4J's copy, make sure they
|
||||
# ICU4C's copy is the primary one. If any changes are made to ICU4J's copy, make sure they
|
||||
# are merged back into ICU4C's copy of the file, lest they get overwritten later.
|
||||
# TODO: figure out how to have a single copy of the file for use by both C and Java.
|
||||
|
||||
|
@ -62,6 +62,11 @@
|
|||
<data>\
|
||||
•Doctor with a D. •As in, Ph.D., you know.•</data>
|
||||
|
||||
# ICU-21459 logic error.
|
||||
<locale en@ss=standard>
|
||||
<sent>
|
||||
<data>•on. •But after a day in the arena sun, the metal feels hot enough to blister my hands.•</data>
|
||||
|
||||
# same as root (unless some exceptions are added!)
|
||||
<locale tfg@ss=standard>
|
||||
<sent>
|
||||
|
|
Loading…
Add table
Reference in a new issue