ICU-21525 UnicodeSet.hasStrings(), UnicodeSetIterator.skipToStrings() & C API

This commit is contained in:
Markus Scherer 2021-06-29 17:27:09 +00:00
parent 84595b49a6
commit e4e2ae9544
15 changed files with 364 additions and 192 deletions

View file

@ -771,8 +771,12 @@ public:
* Note that the elements of a set may include both individual
* codepoints and strings.
*
* This is slower than getRangeCount() because
* it counts the code points of all ranges.
*
* @return the number of elements in this set (its cardinality).
* @stable ICU 2.0
* @see getRangeCount
*/
virtual int32_t size(void) const;
@ -784,6 +788,14 @@ public:
*/
virtual UBool isEmpty(void) const;
#ifndef U_HIDE_DRAFT_API
/**
* @return true if this set contains multi-character strings or the empty string.
* @draft ICU 70
*/
UBool hasStrings() const;
#endif // U_HIDE_DRAFT_API
/**
* Returns true if this set contains the given character.
* This function works faster with a frozen set.
@ -1064,8 +1076,14 @@ public:
/**
* Returns the character at the given index within this set, where
* the set is ordered by ascending code point. If the index is
* out of range, return (UChar32)-1. The inverse of this method is
* <code>indexOf()</code>.
* out of range for characters, returns (UChar32)-1.
* The inverse of this method is <code>indexOf()</code>.
*
* For iteration, this is slower than UnicodeSetIterator or
* getRangeCount()/getRangeStart()/getRangeEnd(),
* because for each call it skips linearly over <code>index</code>
* characters in the ranges.
*
* @param index an index from 0..size()-1
* @return the character at the given index, or (UChar32)-1.
* @stable ICU 2.4
@ -1567,7 +1585,6 @@ private:
void swapBuffers(void);
UBool allocateStrings(UErrorCode &status);
UBool hasStrings() const;
int32_t stringsSize() const;
UBool stringsContains(const UnicodeString &s) const;

View file

@ -851,6 +851,16 @@ uset_removeAllStrings(USet* set);
U_CAPI UBool U_EXPORT2
uset_isEmpty(const USet* set);
#ifndef U_HIDE_DRAFT_API
/**
* @param set the set
* @return true if this set contains multi-character strings or the empty string.
* @draft ICU 70
*/
U_CAPI UBool U_EXPORT2
uset_hasStrings(const USet *set);
#endif // U_HIDE_DRAFT_API
/**
* Returns true if the given USet contains the given character.
* This function works faster with a frozen set.
@ -901,8 +911,13 @@ uset_indexOf(const USet* set, UChar32 c);
/**
* Returns the character at the given index within this set, where
* the set is ordered by ascending code point. If the index is
* out of range, return (UChar32)-1. The inverse of this method is
* <code>indexOf()</code>.
* out of range for characters, returns (UChar32)-1.
* The inverse of this method is <code>indexOf()</code>.
*
* For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
* with uset_getItem(), because for each call it skips linearly over <code>index</code>
* characters in the ranges.
*
* @param set the set
* @param charIndex an index from 0..size()-1 to obtain the char for
* @return the character at the given index, or (UChar32)-1.
@ -912,16 +927,34 @@ U_CAPI UChar32 U_EXPORT2
uset_charAt(const USet* set, int32_t charIndex);
/**
* Returns the number of characters and strings contained in the given
* USet.
* Returns the number of characters and strings contained in this set.
* The last (uset_getItemCount() - uset_getRangeCount()) items are strings.
*
* This is slower than uset_getRangeCount() and uset_getItemCount() because
* it counts the code points of all ranges.
*
* @param set the set
* @return a non-negative integer counting the characters and strings
* contained in set
* @stable ICU 2.4
* @see uset_getRangeCount
*/
U_CAPI int32_t U_EXPORT2
uset_size(const USet* set);
#ifndef U_HIDE_DRAFT_API
/**
* @param set the set
* @return the number of ranges in this set.
* @draft ICU 70
* @see uset_getItemCount
* @see uset_getItem
* @see uset_size
*/
U_CAPI int32_t U_EXPORT2
uset_getRangeCount(const USet *set);
#endif // U_HIDE_DRAFT_API
/**
* Returns the number of items in this set. An item is either a range
* of characters or a single multicharacter string.
@ -935,20 +968,30 @@ uset_getItemCount(const USet* set);
/**
* Returns an item of this set. An item is either a range of
* characters or a single multicharacter string.
* characters or a single multicharacter string (which can be the empty string).
*
* If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
* and the range is <code>*start</code>..<code>*end</code>.
*
* If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
* this function copies the string into <code>str[strCapacity]</code> and
* returns the length of the string (0 for the empty string).
*
* If <code>itemIndex</code> is out of range, then this function returns -1.
*
* Note that 0 is returned for each range as well as for the empty string.
*
* @param set the set
* @param itemIndex a non-negative integer in the range 0..
* uset_getItemCount(set)-1
* @param start pointer to variable to receive first character
* in range, inclusive
* @param end pointer to variable to receive last character in range,
* inclusive
* @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
* @param start pointer to variable to receive first character in range, inclusive;
* can be NULL for a string item
* @param end pointer to variable to receive last character in range, inclusive;
* can be NULL for a string item
* @param str buffer to receive the string, may be NULL
* @param strCapacity capacity of str, or 0 if str is NULL
* @param ec error code
* @return the length of the string (>= 2), or 0 if the item is a
* range, in which case it is the range *start..*end, or -1 if
* itemIndex is out of range
* @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
* @return the length of the string (0 or >= 2), or 0 if the item is a range,
* or -1 if the itemIndex is out of range
* @stable ICU 2.4
*/
U_CAPI int32_t U_EXPORT2

View file

@ -60,6 +60,9 @@ class UnicodeString;
* }
* }
* </pre>
*
* To iterate over only the strings, start with <code>skipToStrings()</code>.
*
* @author M. Davis
* @stable ICU 2.4
*/
@ -170,6 +173,25 @@ class U_COMMON_API UnicodeSetIterator : public UObject {
*/
const UnicodeString& getString();
#ifndef U_HIDE_DRAFT_API
/**
 * Ends the code point/range part of the iteration, skipping over whatever
 * code points or ranges have not been returned yet.
 * A following call to next() or nextRange() will yield a string, if there is one.
 * No-op if next() would return false, or if it would yield a string anyway.
 *
 * @return *this
 * @draft ICU 70
 * @see UnicodeSet#strings()
 */
inline UnicodeSetIterator &skipToStrings() {
    // Leave an empty [nextElement, endElement] interval (0 > -1) ...
    nextElement = 0;
    endElement = -1;
    // ... and put the current range index at the end of the ranges.
    range = endRange;
    return *this;
}
#endif // U_HIDE_DRAFT_API
/**
* Advances the iteration position to the next element in the set,
* which can be either a single code point or a string.
@ -281,13 +303,16 @@ class U_COMMON_API UnicodeSetIterator : public UObject {
*/
int32_t stringCount;
private:
/**
* Points to the string to use when the caller asks for a
* string and the current iteration item is a code point, not a string.
* @internal
*/
UnicodeString *cpString;
protected:
/** Copy constructor. Disallowed.
* @stable ICU 2.4
*/
@ -306,7 +331,7 @@ class U_COMMON_API UnicodeSetIterator : public UObject {
};
inline UBool UnicodeSetIterator::isString() const {
return codepoint == (UChar32)IS_STRING;
return codepoint < 0;
}
inline UChar32 UnicodeSetIterator::getCodepoint() const {

View file

@ -196,6 +196,11 @@ uset_isEmpty(const USet* set) {
return ((const UnicodeSet*) set)->UnicodeSet::isEmpty();
}
/* C API wrapper: forwards to UnicodeSet::hasStrings() on the underlying C++ object. */
U_CAPI UBool U_EXPORT2
uset_hasStrings(const USet* set) {
    const UnicodeSet *uniset = (const UnicodeSet*) set;
    return uniset->UnicodeSet::hasStrings();
}
U_CAPI UBool U_EXPORT2
uset_contains(const USet* set, UChar32 c) {
return ((const UnicodeSet*) set)->UnicodeSet::contains(c);
@ -296,6 +301,11 @@ private:
};
U_NAMESPACE_END
/* C API wrapper: forwards to UnicodeSet::getRangeCount() on the underlying C++ object. */
U_CAPI int32_t U_EXPORT2
uset_getRangeCount(const USet *set) {
    const UnicodeSet *uniset = (const UnicodeSet *)set;
    return uniset->UnicodeSet::getRangeCount();
}
U_CAPI int32_t U_EXPORT2
uset_getItemCount(const USet* uset) {
const UnicodeSet& set = *(const UnicodeSet*)uset;
@ -330,11 +340,6 @@ uset_getItem(const USet* uset, int32_t itemIndex,
}
}
//U_CAPI int32_t U_EXPORT2
//uset_getRangeCount(const USet* set) {
// return ((const UnicodeSet*) set)->getRangeCount();
//}
//
//U_CAPI UBool U_EXPORT2
//uset_getRange(const USet* set, int32_t rangeIndex,
// UChar32* pStart, UChar32* pEnd) {

View file

@ -6,12 +6,15 @@
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "cintltst.h"
#include "cmemory.h"
#include <stdlib.h>
#include <string.h>
#define TEST(x) addTest(root, &x, "uset/" # x)
@ -101,6 +104,9 @@ static void TestAPI() {
/* [ABC] */
set = uset_open(0x0041, 0x0043);
expect(set, "ABC", "DEF{ab}", NULL);
if(uset_hasStrings(set)) {
log_err("uset_hasStrings([ABC]) = true");
}
uset_close(set);
/* [a-c{ab}] */
@ -113,6 +119,9 @@ static void TestAPI() {
if(!uset_resemblesPattern(PAT, PAT_LEN, 0)) {
log_err("uset_resemblesPattern of PAT failed\n");
}
if(!uset_hasStrings(set)) {
log_err("uset_hasStrings([a-c{ab}]) = false");
}
expect(set, "abc{ab}", "def{bc}", &ec);
/* [a-d{ab}] */
@ -167,6 +176,9 @@ static void TestAPI() {
return;
}
expect(set, "0123456789ABCDEFabcdef", "GHIjkl{bc}", NULL);
if (uset_size(set) != 22 || uset_getRangeCount(set) != 3 || uset_getItemCount(set) != 3) {
log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__);
}
/* [ab] */
uset_clear(set);
@ -243,6 +255,9 @@ static void TestAPI() {
return;
}
expect(set, "abcdef{ch}{sch}", "", NULL);
if (uset_size(set) != 8 || uset_getRangeCount(set) != 1 || uset_getItemCount(set) != 3) {
log_err("line %d: uset_size()/uset_getRangeCount()/uset_getItemCount() wrong", __LINE__);
}
uset_retainString(set, u"sch", 3);
expect(set, "{sch}", "abcdef{ch}", NULL);
@ -400,10 +415,12 @@ static void expectItems(const USet* set,
char *pat;
UErrorCode ec;
int32_t expectedSize = 0;
int32_t rangeCount = uset_getRangeCount(set);
int32_t itemCount = uset_getItemCount(set);
int32_t itemIndex = 0;
UChar32 start = 1, end = 0;
int32_t itemLen = 0, length;
bool isString = false;
ec = U_ZERO_ERROR;
length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
@ -435,17 +452,26 @@ static void expectItems(const USet* set,
return;
}
itemLen = uset_getItem(set, itemIndex, &start, &end,
itemStr, sizeof(itemStr), &ec);
// Pass in NULL pointers where we expect them to be ok.
if (itemIndex < rangeCount) {
itemLen = uset_getItem(set, itemIndex, &start, &end, NULL, 0, &ec);
} else {
itemLen = uset_getItem(set, itemIndex, NULL, NULL,
itemStr, UPRV_LENGTHOF(itemStr), &ec);
isString = true;
}
if (U_FAILURE(ec) || itemLen < 0) {
log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
return;
}
if (itemLen == 0) {
if (!isString) {
log_verbose("Ok: %s item %d is %c-%c\n", pat,
itemIndex, oneUCharToChar(start),
oneUCharToChar(end));
if (itemLen != 0) {
log_err("FAIL: uset_getItem(%d) => length %d\n", itemIndex, itemLen);
}
} else {
itemStr[itemLen] = 0;
u_UCharsToChars(itemStr, buf, itemLen+1);
@ -469,7 +495,7 @@ static void expectItems(const USet* set,
u_charsToUChars(stringStart, ustr, stringLength);
ustr[stringLength] = 0;
if (itemLen == 0) {
if (!isString) {
log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
pat, strCopy);
return;
@ -488,18 +514,19 @@ static void expectItems(const USet* set,
u_charsToUChars(p, ustr, 1);
c = ustr[0];
if (itemLen != 0) {
if (isString) {
log_err("FAIL: for %s expect '%c' next, but got a string\n",
pat, *p);
return;
}
if (c != start++) {
if (c != start) {
log_err("FAIL: for %s expect '%c' next\n",
pat, *p);
return;
}
++start;
++p;
}
}

View file

@ -780,7 +780,7 @@ void CharsetDetectionTest::Ticket6394Test() {
return;
}
UnicodeSet setOfCharsetNames; // UnicodSets can hold strings.
UnicodeSet setOfCharsetNames; // UnicodeSets can hold strings.
int32_t i;
for (i=0; i<matchCount; i++) {
UnicodeString charSetName(ucsdet_getName(matches[i], &status));

View file

@ -262,67 +262,41 @@ UBool LegalGreek::isRho(UChar c) {
return FALSE;
}
// AbbreviatedUnicodeSetIterator Interface ---------------------------------------------
//
// Iterate over a UnicodeSet, only returning a sampling of the contained code points.
// density is the approximate total number of code points to returned for the entire set.
//
namespace {
class AbbreviatedUnicodeSetIterator : public UnicodeSetIterator {
public :
AbbreviatedUnicodeSetIterator();
virtual ~AbbreviatedUnicodeSetIterator();
void reset(UnicodeSet& set, UBool abb = FALSE, int32_t density = 100);
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*/
static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
*/
virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }
private :
UBool abbreviated;
int32_t perRange; // The maximum number of code points to be returned from each range
virtual void loadRange(int32_t range);
/**
* The address of this static class variable serves as this class's ID
* for ICU "poor man's RTTI".
*/
static const char fgClassID;
};
// AbbreviatedUnicodeSetIterator Implementation ---------------------------------------
const char AbbreviatedUnicodeSetIterator::fgClassID=0;
AbbreviatedUnicodeSetIterator::AbbreviatedUnicodeSetIterator() :
UnicodeSetIterator(), abbreviated(FALSE) {
}
AbbreviatedUnicodeSetIterator::~AbbreviatedUnicodeSetIterator() {
}
void AbbreviatedUnicodeSetIterator::reset(UnicodeSet& newSet, UBool abb, int32_t density) {
UnicodeSetIterator::reset(newSet);
abbreviated = abb;
perRange = newSet.getRangeCount();
/**
 * Optionally thins out a set so that iterating over it visits only a sampling
 * of the original code points.
 *
 * If abbreviated is false, the input set is returned as is and copy is not touched.
 * Otherwise every range is cut down to at most its first (1 + density / rangeCount)
 * code points; density is the approximate total number of code points
 * to be returned for the entire set.
 *
 * @param set         the set to sample
 * @param abbreviated whether to shorten the ranges at all
 * @param density     approximate total number of code points to keep
 * @param copy        storage for the shortened set; only written if some range
 *                    actually has to be shortened
 * @return set if nothing had to be removed, otherwise copy
 */
const UnicodeSet &abbreviateSet(const UnicodeSet &set, bool abbreviated, int density,
                                UnicodeSet &copy) {
    if (!abbreviated) {
        return set;
    }
    const int32_t rangeCount = set.getRangeCount();
    // A set without ranges must not cause a division by zero.
    const int32_t perRange = (rangeCount == 0) ? 0 : density / rangeCount;
    bool copied = false;
    for (int32_t i = 0; i < rangeCount; ++i) {
        // The ranges are always read from the original set; only the copy is modified.
        const int32_t first = set.getRangeStart(i);
        const int32_t last = set.getRangeEnd(i);
        const int32_t keepLast = first + perRange;
        if (last <= keepLast) {
            continue;  // This range is already short enough.
        }
        if (!copied) {
            // Copy lazily, the first time something needs to be removed.
            copy = set;
            copied = true;
        }
        copy.remove(keepLast + 1, last);
    }
    if (copied) {
        return copy;
    }
    return set;
}
void AbbreviatedUnicodeSetIterator::loadRange(int32_t myRange) {
UnicodeSetIterator::loadRange(myRange);
if (abbreviated && (endElement > nextElement + perRange)) {
endElement = nextElement + perRange;
}
}
} // namespace
//--------------------------------------------------------------------
// RTTest Interface
@ -587,8 +561,8 @@ void RTTest::test2(UBool quickRt, int32_t density) {
return;
}
AbbreviatedUnicodeSetIterator usi;
AbbreviatedUnicodeSetIterator usi2;
UnicodeSetIterator usi;
UnicodeSetIterator usi2;
parent->logln("Checking that at least one irrelevant character is not NFC'ed");
// string is from NFC_NO in the UCD
@ -702,13 +676,14 @@ void RTTest::test2(UBool quickRt, int32_t density) {
UnicodeSet sourceRangeMinusFailures(sourceRange);
sourceRangeMinusFailures.removeAll(failSourceTarg);
usi.reset(sourceRangeMinusFailures, quickRt, density);
UnicodeSet copy, copy2;
usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy));
for (;;) {
if (!usi.next() || usi.isString()) break;
UChar32 c = usi.getCodepoint();
usi2.reset(sourceRangeMinusFailures, quickRt, density);
usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density, copy2));
for (;;) {
if (!usi2.next() || usi2.isString()) break;
UChar32 d = usi2.getCodepoint();
@ -816,7 +791,7 @@ void RTTest::test2(UBool quickRt, int32_t density) {
targetRangeMinusFailures.removeAll(failTargSource);
targetRangeMinusFailures.removeAll(failRound);
usi.reset(targetRangeMinusFailures, quickRt, density);
usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy));
UnicodeString targ2;
UnicodeString reverse2;
UnicodeString targD;
@ -830,7 +805,7 @@ void RTTest::test2(UBool quickRt, int32_t density) {
return;
}
usi2.reset(targetRangeMinusFailures, quickRt, density);
usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density, copy2));
for (;;) {
if (!usi2.next() || usi2.isString())
break;

View file

@ -99,6 +99,7 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestUnusedCcc);
TESTCASE_AUTO(TestDeepPattern);
TESTCASE_AUTO(TestEmptyString);
TESTCASE_AUTO(TestSkipToStrings);
TESTCASE_AUTO_END;
}
@ -882,6 +883,8 @@ void UnicodeSetTest::TestStrings() {
if (U_FAILURE(ec)) {
errln("FAIL: couldn't construct test sets");
}
assertFalse("[a-c].hasStrings()", testList[0]->hasStrings());
assertTrue("[{ll}{ch}a-z].hasStrings()", testList[2]->hasStrings());
for (int32_t i = 0; testList[i] != NULL; i+=2) {
if (U_SUCCESS(ec)) {
@ -896,7 +899,7 @@ void UnicodeSetTest::TestStrings() {
}
delete testList[i];
delete testList[i+1];
}
}
}
/**
@ -4059,3 +4062,49 @@ void UnicodeSetTest::TestEmptyString() {
assertTrue("frozen containsNone", set.containsNone(u"def"));
assertFalse("frozen containsSome", set.containsSome(u"def"));
}
/**
 * Asserts that iter.next() succeeds and that the element it moved to,
 * read via getString(), equals expected.
 * The expected string is also used as the prefix of both assertion messages.
 */
void UnicodeSetTest::assertNext(UnicodeSetIterator &iter, const UnicodeString &expected) {
    UBool advanced = iter.next();
    assertTrue(expected + ".next()", advanced);
    const UnicodeString &actual = iter.getString();
    assertEquals(expected + ".getString()", expected, actual);
}
/**
 * Tests UnicodeSetIterator::skipToStrings() when called at different points
 * of an iteration over a set that has both code points and strings.
 */
void UnicodeSetTest::TestSkipToStrings() {
IcuTestErrorCode errorCode(*this, "TestSkipToStrings");
// Code points 0, 1, 8, 9 plus two strings: the empty string and "ch".
UnicodeSet set(u"[0189{}{ch}]", errorCode);
UnicodeSetIterator iter(set);
// Skip right at the start: only the two strings are returned.
assertNext(iter.skipToStrings(), u"");
assertNext(iter, u"ch");
assertFalse("no next", iter.next());
// Baseline without skipping: all code points first, then the strings.
iter.reset();
assertNext(iter, u"0");
assertNext(iter, u"1");
assertNext(iter, u"8");
assertNext(iter, u"9");
assertNext(iter, u"");
assertNext(iter, u"ch");
assertFalse("no next", iter.next());
// Skip after the first code point: 1, 8 and 9 are dropped.
iter.reset();
assertNext(iter, u"0");
iter.skipToStrings();
assertNext(iter, u"");
assertNext(iter, u"ch");
assertFalse("no next", iter.next());
// Consume the first range with nextRange(), continue with next() (which yields 8),
// then skip: the remaining 9 is dropped.
iter.reset();
iter.nextRange();
assertNext(iter, u"8");
iter.skipToStrings();
assertNext(iter, u"");
assertNext(iter, u"ch");
assertFalse("no next", iter.next());
// Three nextRange() calls move past the code point ranges (presumably 0-1 and 8-9)
// and the empty string, so skipToStrings() is a no-op here and "ch" still follows.
iter.reset();
iter.nextRange();
iter.nextRange();
iter.nextRange();
iter.skipToStrings();
assertNext(iter, u"ch");
assertFalse("no next", iter.next());
}

View file

@ -19,6 +19,7 @@
#include "unicode/unistr.h"
#include "unicode/uniset.h"
#include "unicode/ucnv_err.h"
#include "unicode/usetiter.h"
#include "intltest.h"
#include "cmemory.h"
@ -96,6 +97,9 @@ private:
void TestDeepPattern();
void TestEmptyString();
void assertNext(UnicodeSetIterator &iter, const UnicodeString &expected);
void TestSkipToStrings();
private:
UBool toPatternAux(UChar32 start, UChar32 end);

View file

@ -487,9 +487,8 @@ public final class NumberFormatter {
* <li>AUTO: 0.90, 1.00, 1.10
* <li>HIDE_IF_WHOLE: 0.90, 1, 1.10
* </ul>
*
*
* @draft ICU 69
* @provisional This API might change or be removed in a future release.
*/
public static enum TrailingZeroDisplay {
/**
@ -498,7 +497,7 @@ public final class NumberFormatter {
* @draft ICU 69
*/
AUTO,
/**
* Same as AUTO, but hide trailing zeros after the decimal separator if they are all zero.
*

View file

@ -829,10 +829,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
}
}
boolean hasStrings() {
return !strings.isEmpty();
}
/**
* Returns the number of elements in this set (its cardinality)
* Note that the elements of a set may include both individual
@ -860,6 +856,14 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
return len == 1 && !hasStrings();
}
/**
 * Tests whether this set has any string elements.
 *
 * @return true if this set contains multi-character strings or the empty string.
 * @draft ICU 70
 */
public boolean hasStrings() {
    final boolean noStrings = strings.isEmpty();
    return !noStrings;
}
/**
* Implementation of UnicodeMatcher API. Returns <tt>true</tt> if
* this set contains any character whose low byte is the given

View file

@ -14,7 +14,7 @@ import java.util.Iterator;
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
* iterates over either code points or code point ranges. After all
* code points or ranges have been returned, it returns the
* multicharacter strings of the UnicodSet, if any.
* multicharacter strings of the UnicodeSet, if any.
*
* <p>To iterate over code points and multicharacter strings,
* use a loop like this:
@ -34,10 +34,16 @@ import java.util.Iterator;
* }
* }
* </pre>
*
* <p>To iterate over only the strings, start with <code>new UnicodeSetIterator(set).skipToStrings()</code>.
*
* <p><b>Warning: </b>For speed, UnicodeSet iteration does not check for concurrent modification.
* Do not alter the UnicodeSet while iterating.
* @author M. Davis
* @stable ICU 2.0
* @see UnicodeSet#ranges()
* @see UnicodeSet#strings()
* @see UnicodeSet#iterator()
*/
public class UnicodeSetIterator {
@ -94,6 +100,23 @@ public class UnicodeSetIterator {
reset(new UnicodeSet());
}
/**
 * Ends the code point/range part of the iteration, skipping over whatever
 * code points or ranges have not been returned yet.
 * A following call to next() or nextRange() will yield a string, if there is one.
 * No-op if next() would return false, or if it would yield a string anyway.
 *
 * @return this
 * @draft ICU 70
 * @see UnicodeSet#strings()
 */
public UnicodeSetIterator skipToStrings() {
    // Leave an empty [nextElement, endElement] interval (0 > -1) ...
    nextElement = 0;
    endElement = -1;
    // ... and put the current range index at the end of the ranges.
    range = endRange;
    return this;
}
/**
* Returns the next element in the set, either a single code point
* or a string. If there are no more elements in the set, return
@ -234,39 +257,15 @@ public class UnicodeSetIterator {
private int endRange = 0;
private int range = 0;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
public UnicodeSet getSet() {
return set;
}
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int endElement;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int nextElement;
private Iterator<String> stringIterator = null;
private int endElement;
private int nextElement;
/**
* Invariant: stringIterator is null when there are no (more) strings remaining
*/
private Iterator<String> stringIterator = null;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected void loadRange(int aRange) {
private void loadRange(int aRange) {
nextElement = set.getRangeStart(aRange);
endElement = set.getRangeEnd(aRange);
}

View file

@ -819,6 +819,8 @@ public class UnicodeSetTest extends TestFmwk {
{new UnicodeSet('a','z').add('A', 'Z').retain('M','m').complement('X'),
new UnicodeSet("[[a-zA-Z]&[M-m]-[X]]")},
};
assertFalse("[a-c].hasStrings()", testList[0][0].hasStrings());
assertTrue("[{ll}{ch}a-z].hasStrings()", testList[1][0].hasStrings());
for (int i = 0; i < testList.length; ++i) {
if (!testList[i][0].equals(testList[i][1])) {
@ -2420,17 +2422,6 @@ public class UnicodeSetTest extends TestFmwk {
return Utility.unescape(s);
}
/* Test the method public UnicodeSet getSet() */
@Test
public void TestGetSet() {
UnicodeSetIterator us = new UnicodeSetIterator();
try {
us.getSet();
} catch (Exception e) {
errln("UnicodeSetIterator.getSet() was not suppose to given an " + "an exception.");
}
}
/* Tests the method public UnicodeSet add(Collection<?> source) */
@Test
public void TestAddCollection() {
@ -2840,4 +2831,50 @@ public class UnicodeSetTest extends TestFmwk {
assertTrue("frozen containsNone", set.containsNone("def"));
assertFalse("frozen containsSome", set.containsSome("def"));
}
/**
 * Asserts that iter.next() succeeds and that the element it moved to,
 * read via getString(), equals expected.
 * The expected string is also used as the prefix of both assertion messages.
 */
private void assertNext(UnicodeSetIterator iter, String expected) {
    boolean advanced = iter.next();
    assertTrue(expected + ".next()", advanced);
    String actual = iter.getString();
    assertEquals(expected + ".getString()", expected, actual);
}
/**
 * Tests UnicodeSetIterator.skipToStrings() when called at different points
 * of an iteration over a set that has both code points and strings.
 */
@Test
public void TestSkipToStrings() {
// Code points 0, 1, 8, 9 plus two strings: the empty string and "ch".
UnicodeSet set = new UnicodeSet("[0189{}{ch}]");
// Skip right at the start: only the two strings are returned.
UnicodeSetIterator iter = new UnicodeSetIterator(set).skipToStrings();
assertNext(iter, "");
assertNext(iter, "ch");
assertFalse("no next", iter.next());
// Baseline without skipping: all code points first, then the strings.
iter.reset();
assertNext(iter, "0");
assertNext(iter, "1");
assertNext(iter, "8");
assertNext(iter, "9");
assertNext(iter, "");
assertNext(iter, "ch");
assertFalse("no next", iter.next());
// Skip after the first code point: 1, 8 and 9 are dropped.
iter.reset();
assertNext(iter, "0");
iter.skipToStrings();
assertNext(iter, "");
assertNext(iter, "ch");
assertFalse("no next", iter.next());
// Consume the first range with nextRange(), continue with next() (which yields 8),
// then skip: the remaining 9 is dropped.
iter.reset();
iter.nextRange();
assertNext(iter, "8");
iter.skipToStrings();
assertNext(iter, "");
assertNext(iter, "ch");
assertFalse("no next", iter.next());
// Three nextRange() calls move past the code point ranges (presumably 0-1 and 8-9)
// and the empty string, so skipToStrings() is a no-op here and "ch" still follows.
iter.reset();
iter.nextRange();
iter.nextRange();
iter.nextRange();
iter.skipToStrings();
assertNext(iter, "ch");
assertFalse("no next", iter.next());
}
}

View file

@ -17,7 +17,7 @@ import com.ibm.icu.text.UTF16;
* UnicodeSetIterator iterates over the contents of a UnicodeSet. It
* iterates over either code points or code point ranges. After all
* code points or ranges have been returned, it returns the
* multicharacter strings of the UnicodSet, if any.
* multicharacter strings of the UnicodeSet, if any.
*
* <p>To iterate over code points, use a loop like this:
* <pre>
@ -106,7 +106,7 @@ public class UnicodeMapIterator<T> {
* false. If <tt>codepoint == IS_STRING</tt>, the value is a
* string in the <tt>string</tt> field. Otherwise the value is a
* single code point in the <tt>codepoint</tt> field.
*
*
* <p>The order of iteration is all code points in sorted order,
* followed by all strings in sorted order. <tt>codepointEnd</tt> is
* undefined after calling this method. <tt>string</tt> is
@ -135,7 +135,7 @@ public class UnicodeMapIterator<T> {
if (stringIterator == null) return false;
codepoint = IS_STRING; // signal that value is actually a string
string = (String)stringIterator.next();
string = stringIterator.next();
if (!stringIterator.hasNext()) stringIterator = null;
return true;
}
@ -147,7 +147,7 @@ public class UnicodeMapIterator<T> {
* string in the <tt>string</tt> field. Otherwise the value is a
* range of one or more code points from <tt>codepoint</tt> to
* <tt>codepointEnd</tt> inclusive.
*
*
* <p>The order of iteration is all code point ranges in sorted
* order, followed by all strings in sorted order. Ranges are
* disjoint and non-contiguous. <tt>string</tt> is undefined
@ -180,7 +180,7 @@ public class UnicodeMapIterator<T> {
if (stringIterator == null) return false;
codepoint = IS_STRING; // signal that value is actually a string
string = (String)stringIterator.next();
string = stringIterator.next();
if (!stringIterator.hasNext()) stringIterator = null;
return true;
}
@ -198,13 +198,13 @@ public class UnicodeMapIterator<T> {
/**
* Resets this iterator to the start of the set.
* @return
* @return
*/
public UnicodeMapIterator<T> reset() {
endRange = map.getRangeCount() - 1;
// both next*() methods will test: if (nextElement <= endElement)
// we set them to fail this test, which will cause them to load the first range
nextElement = 0;
nextElement = 0;
endElement = -1;
range = -1;

View file

@ -77,45 +77,33 @@ public class RoundTripTest extends TestFmwk {
static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
static String HIRAGANA_ITERATION = "[\u309D\u309E]";
//------------------------------------------------------------------
// AbbreviatedUnicodeSetIterator
//------------------------------------------------------------------
static class AbbreviatedUnicodeSetIterator extends UnicodeSetIterator {
private boolean abbreviated;
private int perRange;
public AbbreviatedUnicodeSetIterator() {
super();
abbreviated = false;
/**
* If abbreviated=true, returns a set which contains only a sampling of the original code points.
* density is the approximate total number of code points to be returned for the entire set.
*/
private static UnicodeSet abbreviateSet(UnicodeSet set, boolean abbreviated, int density) {
if (!abbreviated) {
return set;
}
@Override
public void reset(UnicodeSet newSet) {
reset(newSet, false);
int rangeCount = set.getRangeCount();
int perRange = rangeCount;
if (perRange != 0) {
perRange = density / perRange;
}
public void reset(UnicodeSet newSet, boolean abb) {
reset(newSet, abb, 100);
}
public void reset(UnicodeSet newSet, boolean abb, int density) {
super.reset(newSet);
abbreviated = abb;
perRange = newSet.getRangeCount();
if (perRange != 0) {
perRange = density / perRange;
}
}
@Override
protected void loadRange(int myRange) {
super.loadRange(myRange);
if (abbreviated && (endElement > nextElement + perRange)) {
endElement = nextElement + perRange;
boolean unchanged = true;
for (int i = 0; i < rangeCount; ++i) {
int start = set.getRangeStart(i);
int end = set.getRangeEnd(i);
int newEnd = start + perRange;
if (end > newEnd) {
if (unchanged) {
set = set.cloneAsThawed();
unchanged = false;
}
set.remove(newEnd + 1, end);
}
}
return set;
}
//--------------------------------------------------------------------
@ -1295,8 +1283,8 @@ public class RoundTripTest extends TestFmwk {
return false;
}
AbbreviatedUnicodeSetIterator usi = new AbbreviatedUnicodeSetIterator();
AbbreviatedUnicodeSetIterator usi2 = new AbbreviatedUnicodeSetIterator();
UnicodeSetIterator usi = new UnicodeSetIterator();
UnicodeSetIterator usi2 = new UnicodeSetIterator();
Transliterator sourceToTarget;
Transliterator targetToSource;
@ -1454,7 +1442,7 @@ public class RoundTripTest extends TestFmwk {
boolean quickRt = TestFmwk.getExhaustiveness() < 10;
usi.reset(sourceRangeMinusFailures, quickRt, density);
usi.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density));
while (usi.next()) {
int c = usi.codepoint;
@ -1466,7 +1454,7 @@ public class RoundTripTest extends TestFmwk {
if (failSourceTarg.get(d)) continue;
*/
TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
usi2.reset(sourceRangeMinusFailures, quickRt, density);
usi2.reset(abbreviateSet(sourceRangeMinusFailures, quickRt, density));
while (usi2.next()) {
int d = usi2.codepoint;
@ -1561,7 +1549,7 @@ public class RoundTripTest extends TestFmwk {
!targetRange.contains(c)) continue;
*/
usi.reset(targetRangeMinusFailures, quickRt, density);
usi.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density));
while (usi.next()) {
int c = usi.codepoint;
@ -1574,7 +1562,7 @@ public class RoundTripTest extends TestFmwk {
!targetRange.contains(d)) continue;
*/
TestFmwk.logln(count + "/" + pairLimit + " Checking starting with " + UTF16.valueOf(c));
usi2.reset(targetRangeMinusFailures, quickRt, density);
usi2.reset(abbreviateSet(targetRangeMinusFailures, quickRt, density));
while (usi2.next()) {