mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 06:25:30 +00:00
parent
263c735400
commit
37b41495d7
9 changed files with 1087 additions and 62 deletions
|
@ -313,7 +313,7 @@ private:
|
|||
char16_t *pat = nullptr;
|
||||
int32_t patLen = 0;
|
||||
|
||||
UVector* strings = nullptr; // maintained in sorted order
|
||||
UVector* strings_ = nullptr; // maintained in sorted order
|
||||
UnicodeSetStringSpan *stringSpan = nullptr;
|
||||
|
||||
/**
|
||||
|
@ -1102,6 +1102,118 @@ public:
|
|||
*/
|
||||
UChar32 charAt(int32_t index) const;
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Returns a C++ "range" for iterating over the code points of this set.
|
||||
*
|
||||
* \code
|
||||
* UnicodeSet set(u"[abcçカ🚴]", errorCode);
|
||||
* for (UChar32 c : set.codePoints()) {
|
||||
* printf("set.codePoint U+%04lx\n", (long)c);
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* @return a "range" object for iterating over the code points of this set.
|
||||
* @draft ICU 76
|
||||
* @see ranges
|
||||
* @see strings
|
||||
* @see begin
|
||||
* @see end
|
||||
*/
|
||||
inline U_HEADER_NESTED_NAMESPACE::USetCodePoints codePoints() const {
    // Wrap the C-API view of this set in the header-only code point "range".
    const USet *self = toUSet();
    return U_HEADER_NESTED_NAMESPACE::USetCodePoints(self);
}
|
||||
|
||||
/**
|
||||
* Returns a C++ "range" for iterating over the code point ranges of this set.
|
||||
*
|
||||
* \code
|
||||
* UnicodeSet set(u"[abcçカ🚴]", errorCode);
|
||||
* for (auto [start, end] : set.ranges()) {
|
||||
* printf("set.range U+%04lx..U+%04lx\n", (long)start, (long)end);
|
||||
* }
|
||||
* for (auto range : set.ranges()) {
|
||||
* for (UChar32 c : range) {
|
||||
* printf("set.range.c U+%04lx\n", (long)c);
|
||||
* }
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* @return a "range" object for iterating over the code point ranges of this set.
|
||||
* @draft ICU 76
|
||||
* @see codePoints
|
||||
* @see strings
|
||||
* @see begin
|
||||
* @see end
|
||||
*/
|
||||
inline U_HEADER_NESTED_NAMESPACE::USetRanges ranges() const {
    // Wrap the C-API view of this set in the header-only "range" of code point ranges.
    const USet *self = toUSet();
    return U_HEADER_NESTED_NAMESPACE::USetRanges(self);
}
|
||||
|
||||
/**
|
||||
* Returns a C++ "range" for iterating over the empty and multi-character strings of this set.
|
||||
* Returns each string as a std::u16string_view without copying its contents.
|
||||
*
|
||||
* \code
|
||||
* UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode);
|
||||
* for (auto s : set.strings()) {
|
||||
* UnicodeString us(s);
|
||||
* std::string u8;
|
||||
* printf("set.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* @return a "range" object for iterating over the strings of this set.
|
||||
* @draft ICU 76
|
||||
* @see codePoints
|
||||
* @see ranges
|
||||
* @see begin
|
||||
* @see end
|
||||
*/
|
||||
inline U_HEADER_NESTED_NAMESPACE::USetStrings strings() const {
    // Wrap the C-API view of this set in the header-only "range" of strings.
    const USet *self = toUSet();
    return U_HEADER_NESTED_NAMESPACE::USetStrings(self);
}
|
||||
|
||||
/**
|
||||
* Returns a C++ iterator for iterating over all of the elements of this set.
|
||||
* Convenient all-in-one iteration, but creates a UnicodeString for each
|
||||
* code point or string.
|
||||
* (Similar to how Java UnicodeSet *is an* Iterable<String>.)
|
||||
*
|
||||
* Code points are returned first, then empty and multi-character strings.
|
||||
*
|
||||
* \code
|
||||
* UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode);
|
||||
* for (auto el : set) {
|
||||
* std::string u8;
|
||||
* printf("set.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* @return an all-elements iterator.
|
||||
* @draft ICU 76
|
||||
* @see end
|
||||
* @see codePoints
|
||||
* @see ranges
|
||||
* @see strings
|
||||
*/
|
||||
inline U_HEADER_NESTED_NAMESPACE::USetElementIterator begin() const {
    // Build a short-lived all-elements "range" and hand out its start iterator.
    const U_HEADER_NESTED_NAMESPACE::USetElements elements(toUSet());
    return elements.begin();
}
|
||||
|
||||
/**
|
||||
* @return an exclusive-end sentinel for iterating over all of the elements of this set.
|
||||
* @draft ICU 76
|
||||
* @see begin
|
||||
* @see codePoints
|
||||
* @see ranges
|
||||
* @see strings
|
||||
*/
|
||||
inline U_HEADER_NESTED_NAMESPACE::USetElementIterator end() const {
    // Build a short-lived all-elements "range" and hand out its end sentinel.
    const U_HEADER_NESTED_NAMESPACE::USetElements elements(toUSet());
    return elements.end();
}
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Adds the specified range to this set if it is not already
|
||||
* present. If this set already contains the specified range,
|
||||
|
|
|
@ -33,7 +33,10 @@
|
|||
#include "unicode/uchar.h"
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
#include <string_view>
|
||||
#include "unicode/char16ptr.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/unistr.h"
|
||||
#endif // U_SHOW_CPLUSPLUS_API
|
||||
|
||||
#ifndef USET_DEFINED
|
||||
|
@ -955,7 +958,7 @@ uset_charAt(const USet* set, int32_t charIndex);
|
|||
|
||||
/**
|
||||
* Returns the number of characters and strings contained in this set.
|
||||
* The last (uset_getItemCount() - uset_getRangeCount()) items are strings.
|
||||
* The last uset_getStringCount() == (uset_getItemCount() - uset_getRangeCount()) items are strings.
|
||||
*
|
||||
* This is slower than uset_getRangeCount() and uset_getItemCount() because
|
||||
* it counts the code points of all ranges.
|
||||
|
@ -965,6 +968,8 @@ uset_charAt(const USet* set, int32_t charIndex);
|
|||
* contained in set
|
||||
* @stable ICU 2.4
|
||||
* @see uset_getRangeCount
|
||||
* @see uset_getStringCount
|
||||
* @see uset_getItemCount
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uset_size(const USet* set);
|
||||
|
@ -975,11 +980,42 @@ uset_size(const USet* set);
|
|||
* @stable ICU 70
|
||||
* @see uset_getItemCount
|
||||
* @see uset_getItem
|
||||
* @see uset_getStringCount
|
||||
* @see uset_size
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uset_getRangeCount(const USet *set);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* @param set the set
|
||||
* @return the number of strings in this set.
|
||||
* @draft ICU 76
|
||||
* @see uset_getRangeCount
|
||||
* @see uset_getItemCount
|
||||
* @see uset_size
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uset_getStringCount(const USet *set);
|
||||
|
||||
/**
|
||||
* Returns the index-th string (empty or multi-character) in the set.
|
||||
* The string may not be NUL-terminated.
|
||||
* The output length must be used, and the caller must not read more than that many UChars.
|
||||
*
|
||||
* @param set the set
|
||||
* @param index the string index, 0 .. uset_getStringCount() - 1
|
||||
* @param pLength the output string length; must not be NULL
|
||||
* @return the pointer to the string; NULL if the index is out of range or pLength is NULL
|
||||
* @draft ICU 76
|
||||
* @see uset_getStringCount
|
||||
*/
|
||||
U_CAPI const UChar* U_EXPORT2
|
||||
uset_getString(const USet *set, int32_t index, int32_t *pLength);
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Returns the number of items in this set. An item is either a range
|
||||
* of characters or a single multicharacter string.
|
||||
|
@ -987,6 +1023,8 @@ uset_getRangeCount(const USet *set);
|
|||
* @return a non-negative integer counting the character ranges
|
||||
* and/or strings contained in set
|
||||
* @stable ICU 2.4
|
||||
* @see uset_getRangeCount
|
||||
* @see uset_getStringCount
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uset_getItemCount(const USet* set);
|
||||
|
@ -1001,6 +1039,7 @@ uset_getItemCount(const USet* set);
|
|||
* If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
|
||||
* this function copies the string into <code>str[strCapacity]</code> and
|
||||
* returns the length of the string (0 for the empty string).
|
||||
* See uset_getString() for a function that does not copy the string contents.
|
||||
*
|
||||
* If <code>itemIndex</code> is out of range, then this function returns -1.
|
||||
*
|
||||
|
@ -1018,6 +1057,7 @@ uset_getItemCount(const USet* set);
|
|||
* @return the length of the string (0 or >= 2), or 0 if the item is a range,
|
||||
* or -1 if the itemIndex is out of range
|
||||
* @stable ICU 2.4
|
||||
* @see uset_getString
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uset_getItem(const USet* set, int32_t itemIndex,
|
||||
|
@ -1285,4 +1325,574 @@ U_CAPI UBool U_EXPORT2
|
|||
uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
|
||||
UChar32* pStart, UChar32* pEnd);
|
||||
|
||||
#endif
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
namespace U_HEADER_ONLY_NAMESPACE {
|
||||
|
||||
// Note: Not U_COMMON_API, and not a subclass of UMemory, because this is a header-only class,
|
||||
// not intended to be used via export from the ICU DLL.
|
||||
|
||||
/**
|
||||
* Iterator returned by USetCodePoints.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
class USetCodePointIterator {
|
||||
public:
|
||||
/** @draft ICU 76 */
|
||||
USetCodePointIterator(const USetCodePointIterator &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator==(const USetCodePointIterator &other) const {
|
||||
// No need to compare rangeCount & end given private constructor
|
||||
// and assuming we don't compare iterators across the set being modified.
|
||||
// And comparing rangeIndex is redundant with comparing c.
|
||||
// We might even skip comparing uset.
|
||||
// Unless we want operator==() to be "correct" for more than iteration.
|
||||
return uset == other.uset && c == other.c;
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator!=(const USetCodePointIterator &other) const { return !operator==(other); }
|
||||
|
||||
/** @draft ICU 76 */
|
||||
UChar32 operator*() const { return c; }
|
||||
|
||||
/**
|
||||
* Pre-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetCodePointIterator &operator++() {
|
||||
if (c < end) {
|
||||
++c;
|
||||
} else if (rangeIndex < rangeCount) {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
int32_t result = uset_getItem(uset, rangeIndex, &c, &end, nullptr, 0, &errorCode);
|
||||
if (U_SUCCESS(errorCode) && result == 0) {
|
||||
++rangeIndex;
|
||||
} else {
|
||||
c = end = U_SENTINEL;
|
||||
}
|
||||
} else {
|
||||
c = end = U_SENTINEL;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Post-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetCodePointIterator operator++(int) {
|
||||
USetCodePointIterator result(*this);
|
||||
operator++();
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class USetCodePoints;
|
||||
|
||||
USetCodePointIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
|
||||
: uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount),
|
||||
c(U_SENTINEL), end(U_SENTINEL) {
|
||||
// Fetch the first range.
|
||||
operator++();
|
||||
}
|
||||
|
||||
const USet *uset;
|
||||
int32_t rangeIndex;
|
||||
int32_t rangeCount;
|
||||
UChar32 c, end;
|
||||
};
|
||||
|
||||
/**
|
||||
* C++ "range" for iterating over the code points of a USet.
|
||||
*
|
||||
* \code
|
||||
* using U_HEADER_NESTED_NAMESPACE::USetCodePoints;
|
||||
* LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
|
||||
* for (UChar32 c : USetCodePoints(uset.getAlias())) {
|
||||
* printf("uset.codePoint U+%04lx\n", (long)c);
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* C++ UnicodeSet has member functions for iteration, including codePoints().
|
||||
*
|
||||
* @draft ICU 76
|
||||
* @see USetRanges
|
||||
* @see USetStrings
|
||||
* @see USetElements
|
||||
*/
|
||||
class USetCodePoints {
|
||||
public:
|
||||
/**
|
||||
* Constructs a C++ "range" object over the code points of the USet.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetCodePoints(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetCodePoints(const USetCodePoints &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetCodePointIterator begin() const {
|
||||
return USetCodePointIterator(uset, 0, rangeCount);
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetCodePointIterator end() const {
|
||||
return USetCodePointIterator(uset, rangeCount, rangeCount);
|
||||
}
|
||||
|
||||
private:
|
||||
const USet *uset;
|
||||
int32_t rangeCount;
|
||||
};
|
||||
|
||||
/**
|
||||
* A contiguous range of code points in a USet/UnicodeSet.
|
||||
* Returned by USetRangeIterator which is returned by USetRanges.
|
||||
* Both the rangeStart and rangeEnd are in the range.
|
||||
* (end() returns an iterator corresponding to rangeEnd+1.)
|
||||
* @draft ICU 76
|
||||
*/
|
||||
struct CodePointRange {
|
||||
/** @draft ICU 76 */
|
||||
struct iterator {
|
||||
/** @draft ICU 76 */
|
||||
iterator(UChar32 c) : c(c) {}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator==(const iterator &other) const { return c == other.c; }
|
||||
/** @draft ICU 76 */
|
||||
bool operator!=(const iterator &other) const { return !operator==(other); }
|
||||
|
||||
/** @draft ICU 76 */
|
||||
UChar32 operator*() const { return c; }
|
||||
|
||||
/**
|
||||
* Pre-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
iterator &operator++() {
|
||||
++c;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Post-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
iterator operator++(int) {
|
||||
return c++;
|
||||
}
|
||||
|
||||
/**
|
||||
* The current code point in the range.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
UChar32 c;
|
||||
};
|
||||
|
||||
/** @draft ICU 76 */
|
||||
CodePointRange(UChar32 start, UChar32 end) : rangeStart(start), rangeEnd(end) {}
|
||||
/** @draft ICU 76 */
|
||||
CodePointRange(const CodePointRange &other) = default;
|
||||
/** @draft ICU 76 */
|
||||
size_t size() const { return (rangeEnd + 1) - rangeStart; }
|
||||
/** @draft ICU 76 */
|
||||
iterator begin() const { return rangeStart; }
|
||||
/** @draft ICU 76 */
|
||||
iterator end() const { return rangeEnd + 1; }
|
||||
|
||||
/**
|
||||
* Start of a USet/UnicodeSet range of code points.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
UChar32 rangeStart;
|
||||
/**
|
||||
* Inclusive end of a USet/UnicodeSet range of code points.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
UChar32 rangeEnd;
|
||||
};
|
||||
|
||||
/**
|
||||
* Iterator returned by USetRanges.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
class USetRangeIterator {
|
||||
public:
|
||||
/** @draft ICU 76 */
|
||||
USetRangeIterator(const USetRangeIterator &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator==(const USetRangeIterator &other) const {
|
||||
// No need to compare rangeCount given private constructor
|
||||
// and assuming we don't compare iterators across the set being modified.
|
||||
// We might even skip comparing uset.
|
||||
// Unless we want operator==() to be "correct" for more than iteration.
|
||||
return uset == other.uset && rangeIndex == other.rangeIndex;
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator!=(const USetRangeIterator &other) const { return !operator==(other); }
|
||||
|
||||
/** @draft ICU 76 */
|
||||
CodePointRange operator*() const {
|
||||
if (rangeIndex < rangeCount) {
|
||||
UChar32 start, end;
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
int32_t result = uset_getItem(uset, rangeIndex, &start, &end, nullptr, 0, &errorCode);
|
||||
if (U_SUCCESS(errorCode) && result == 0) {
|
||||
return CodePointRange(start, end);
|
||||
}
|
||||
}
|
||||
return CodePointRange(U_SENTINEL, U_SENTINEL);
|
||||
}
|
||||
|
||||
/**
|
||||
* Pre-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetRangeIterator &operator++() {
|
||||
++rangeIndex;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Post-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetRangeIterator operator++(int) {
|
||||
USetRangeIterator result(*this);
|
||||
++rangeIndex;
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class USetRanges;
|
||||
|
||||
USetRangeIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
|
||||
: uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount) {}
|
||||
|
||||
const USet *uset;
|
||||
int32_t rangeIndex;
|
||||
int32_t rangeCount;
|
||||
};
|
||||
|
||||
/**
|
||||
* C++ "range" for iterating over the code point ranges of a USet.
|
||||
*
|
||||
* \code
|
||||
* using U_HEADER_NESTED_NAMESPACE::USetRanges;
|
||||
* LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
|
||||
* for (auto [start, end] : USetRanges(uset.getAlias())) {
|
||||
* printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end);
|
||||
* }
|
||||
* for (auto range : USetRanges(uset.getAlias())) {
|
||||
* for (UChar32 c : range) {
|
||||
* printf("uset.range.c U+%04lx\n", (long)c);
|
||||
* }
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* C++ UnicodeSet has member functions for iteration, including ranges().
|
||||
*
|
||||
* @draft ICU 76
|
||||
* @see USetCodePoints
|
||||
* @see USetStrings
|
||||
* @see USetElements
|
||||
*/
|
||||
class USetRanges {
|
||||
public:
|
||||
/**
|
||||
* Constructs a C++ "range" object over the code point ranges of the USet.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetRanges(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetRanges(const USetRanges &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetRangeIterator begin() const {
|
||||
return USetRangeIterator(uset, 0, rangeCount);
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetRangeIterator end() const {
|
||||
return USetRangeIterator(uset, rangeCount, rangeCount);
|
||||
}
|
||||
|
||||
private:
|
||||
const USet *uset;
|
||||
int32_t rangeCount;
|
||||
};
|
||||
|
||||
/**
|
||||
* Iterator returned by USetStrings.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
class USetStringIterator {
|
||||
public:
|
||||
/** @draft ICU 76 */
|
||||
USetStringIterator(const USetStringIterator &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator==(const USetStringIterator &other) const {
|
||||
// No need to compare count given private constructor
|
||||
// and assuming we don't compare iterators across the set being modified.
|
||||
// We might even skip comparing uset.
|
||||
// Unless we want operator==() to be "correct" for more than iteration.
|
||||
return uset == other.uset && index == other.index;
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator!=(const USetStringIterator &other) const { return !operator==(other); }
|
||||
|
||||
/** @draft ICU 76 */
|
||||
std::u16string_view operator*() const {
|
||||
if (index < count) {
|
||||
int32_t length;
|
||||
const UChar *uchars = uset_getString(uset, index, &length);
|
||||
// assert uchars != nullptr;
|
||||
return { ConstChar16Ptr(uchars), (uint32_t)length };
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
/**
|
||||
* Pre-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetStringIterator &operator++() {
|
||||
++index;
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Post-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetStringIterator operator++(int) {
|
||||
USetStringIterator result(*this);
|
||||
++index;
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class USetStrings;
|
||||
|
||||
USetStringIterator(const USet *uset, int32_t index, int32_t count)
|
||||
: uset(uset), index(index), count(count) {}
|
||||
|
||||
const USet *uset;
|
||||
int32_t index;
|
||||
int32_t count;
|
||||
};
|
||||
|
||||
/**
|
||||
* C++ "range" for iterating over the empty and multi-character strings of a USet.
|
||||
*
|
||||
* \code
|
||||
* using U_HEADER_NESTED_NAMESPACE::USetStrings;
|
||||
* LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode));
|
||||
* for (auto s : USetStrings(uset.getAlias())) {
|
||||
* UnicodeString us(s);
|
||||
* std::string u8;
|
||||
* printf("uset.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* C++ UnicodeSet has member functions for iteration, including strings().
|
||||
*
|
||||
* @draft ICU 76
|
||||
* @see USetCodePoints
|
||||
* @see USetRanges
|
||||
* @see USetElements
|
||||
*/
|
||||
class USetStrings {
|
||||
public:
|
||||
/**
|
||||
* Constructs a C++ "range" object over the strings of the USet.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetStrings(const USet *uset) : uset(uset), count(uset_getStringCount(uset)) {}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetStrings(const USetStrings &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetStringIterator begin() const {
|
||||
return USetStringIterator(uset, 0, count);
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetStringIterator end() const {
|
||||
return USetStringIterator(uset, count, count);
|
||||
}
|
||||
|
||||
private:
|
||||
const USet *uset;
|
||||
int32_t count;
|
||||
};
|
||||
|
||||
/**
|
||||
* Iterator returned by USetElements.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
class USetElementIterator {
|
||||
public:
|
||||
/** @draft ICU 76 */
|
||||
USetElementIterator(const USetElementIterator &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator==(const USetElementIterator &other) const {
|
||||
// No need to compare rangeCount & end given private constructor
|
||||
// and assuming we don't compare iterators across the set being modified.
|
||||
// We might even skip comparing uset.
|
||||
// Unless we want operator==() to be "correct" for more than iteration.
|
||||
return uset == other.uset && c == other.c && index == other.index;
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
bool operator!=(const USetElementIterator &other) const { return !operator==(other); }
|
||||
|
||||
/** @draft ICU 76 */
|
||||
UnicodeString operator*() const {
|
||||
if (c >= 0) {
|
||||
return UnicodeString(c);
|
||||
} else if (index < totalCount) {
|
||||
int32_t length;
|
||||
const UChar *uchars = uset_getString(uset, index - rangeCount, &length);
|
||||
// assert uchars != nullptr;
|
||||
return UnicodeString(uchars, length);
|
||||
} else {
|
||||
return UnicodeString();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Pre-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetElementIterator &operator++() {
|
||||
if (c < end) {
|
||||
++c;
|
||||
} else if (index < rangeCount) {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
int32_t result = uset_getItem(uset, index, &c, &end, nullptr, 0, &errorCode);
|
||||
if (U_SUCCESS(errorCode) && result == 0) {
|
||||
++index;
|
||||
} else {
|
||||
c = end = U_SENTINEL;
|
||||
}
|
||||
} else if (c >= 0) {
|
||||
// assert index == rangeCount;
|
||||
// Switch from the last range to the first string.
|
||||
c = end = U_SENTINEL;
|
||||
} else {
|
||||
++index;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Post-increment.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetElementIterator operator++(int) {
|
||||
USetElementIterator result(*this);
|
||||
operator++();
|
||||
return result;
|
||||
}
|
||||
|
||||
private:
|
||||
friend class USetElements;
|
||||
|
||||
USetElementIterator(const USet *uset, int32_t index, int32_t rangeCount, int32_t totalCount)
|
||||
: uset(uset), index(index), rangeCount(rangeCount), totalCount(totalCount),
|
||||
c(U_SENTINEL), end(U_SENTINEL) {
|
||||
if (index < rangeCount) {
|
||||
// Fetch the first range.
|
||||
operator++();
|
||||
}
|
||||
// Otherwise don't move beyond the (index - rangeCount)-th string.
|
||||
}
|
||||
|
||||
const USet *uset;
|
||||
int32_t index;
|
||||
/** Number of UnicodeSet/USet code point ranges. */
|
||||
int32_t rangeCount;
|
||||
/**
|
||||
* Number of code point ranges plus number of strings.
|
||||
* index starts from 0, counts ranges while less than rangeCount,
|
||||
* then counts strings while at least rangeCount and less than totalCount.
|
||||
*
|
||||
* Note that totalCount is the same as uset_getItemCount(), but usually
|
||||
* smaller than the number of elements returned by this iterator
|
||||
* because we return each code point of each range.
|
||||
*/
|
||||
int32_t totalCount;
|
||||
UChar32 c, end;
|
||||
};
|
||||
|
||||
/**
|
||||
* A C++ "range" for iterating over all of the elements of a USet.
|
||||
* Convenient all-in-one iteration, but creates a UnicodeString for each
|
||||
* code point or string.
|
||||
*
|
||||
* Code points are returned first, then empty and multi-character strings.
|
||||
*
|
||||
* \code
|
||||
* using U_HEADER_NESTED_NAMESPACE::USetElements;
|
||||
* LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, &errorCode));
|
||||
* for (auto el : USetElements(uset.getAlias())) {
|
||||
* std::string u8;
|
||||
* printf("uset.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* C++ UnicodeSet has member functions for iteration, including begin() and end().
|
||||
*
|
||||
* @return an all-elements iterator.
|
||||
* @draft ICU 76
|
||||
* @see USetCodePoints
|
||||
* @see USetRanges
|
||||
* @see USetStrings
|
||||
*/
|
||||
class USetElements {
|
||||
public:
|
||||
/**
|
||||
* Constructs a C++ "range" object over all of the elements of the USet.
|
||||
* @draft ICU 76
|
||||
*/
|
||||
USetElements(const USet *uset)
|
||||
: uset(uset), rangeCount(uset_getRangeCount(uset)),
|
||||
stringCount(uset_getStringCount(uset)) {}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetElements(const USetElements &other) = default;
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetElementIterator begin() const {
|
||||
return USetElementIterator(uset, 0, rangeCount, rangeCount + stringCount);
|
||||
}
|
||||
|
||||
/** @draft ICU 76 */
|
||||
USetElementIterator end() const {
|
||||
return USetElementIterator(uset, rangeCount + stringCount, rangeCount, rangeCount + stringCount);
|
||||
}
|
||||
|
||||
private:
|
||||
const USet *uset;
|
||||
int32_t rangeCount, stringCount;
|
||||
};
|
||||
|
||||
} // namespace U_HEADER_ONLY_NAMESPACE
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
#endif // U_SHOW_CPLUSPLUS_API
|
||||
|
||||
#endif // __USET_H__
|
||||
|
|
|
@ -124,6 +124,49 @@ typedef uint8_t UVersionInfo[U_MAX_VERSION_LENGTH];
|
|||
# if U_USING_ICU_NAMESPACE
|
||||
U_NAMESPACE_USE
|
||||
# endif
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* \def U_HEADER_NESTED_NAMESPACE
|
||||
* Nested namespace used inside U_ICU_NAMESPACE for header-only APIs.
|
||||
* Different when used inside ICU to prevent public use of internal instantiations:
|
||||
* "header" when compiling calling code; "internal" when compiling ICU library code.
|
||||
*
|
||||
* When compiling for Windows, where DLL exports of APIs are explicit,
|
||||
* this is always "header". Header-only types are not marked for export,
|
||||
* which on Windows already avoids callers linking with library instantiations.
|
||||
*
|
||||
* @draft ICU 76
|
||||
* @see U_HEADER_ONLY_NAMESPACE
|
||||
*/
|
||||
|
||||
/**
|
||||
* \def U_HEADER_ONLY_NAMESPACE
|
||||
* Namespace used for header-only APIs.
|
||||
* Different when used inside ICU to prevent public use of internal instantiations.
|
||||
* "U_ICU_NAMESPACE::header" or "U_ICU_NAMESPACE::internal",
|
||||
* see U_HEADER_NESTED_NAMESPACE for details.
|
||||
*
|
||||
* @draft ICU 76
|
||||
*/
|
||||
|
||||
// "internal" is used only for ICU library code on platforms without explicit
// DLL exports. The Windows part of the test is the negation of the test used
// for defining U_EXPORT for Windows.
#if !(defined(_MSC_VER) || (UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllexport__) && \
                            UPRV_HAS_DECLSPEC_ATTRIBUTE(__dllimport__))) && \
    (defined(U_COMBINED_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) || \
     defined(U_I18N_IMPLEMENTATION) || defined(U_IO_IMPLEMENTATION) || \
     defined(U_LAYOUTEX_IMPLEMENTATION) || defined(U_TOOLUTIL_IMPLEMENTATION))
    // Compiling ICU library code without explicit DLL exports.
#   define U_HEADER_NESTED_NAMESPACE internal
#else
    // Compiling calling code, or compiling anything for Windows.
#   define U_HEADER_NESTED_NAMESPACE header
#endif
|
||||
|
||||
#define U_HEADER_ONLY_NAMESPACE U_ICU_NAMESPACE::U_HEADER_NESTED_NAMESPACE
|
||||
|
||||
namespace U_HEADER_ONLY_NAMESPACE {}
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
/*===========================================================================*/
|
||||
|
|
|
@ -118,15 +118,15 @@ static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
|
|||
}
|
||||
|
||||
/**
 * @return true if this set contains at least one string;
 *         a null strings_ vector counts as "no strings"
 */
UBool UnicodeSet::hasStrings() const {
    return strings_ != nullptr && !strings_->isEmpty();
}
|
||||
|
||||
int32_t UnicodeSet::stringsSize() const {
|
||||
return strings == nullptr ? 0 : strings->size();
|
||||
return strings_ == nullptr ? 0 : strings_->size();
|
||||
}
|
||||
|
||||
/**
 * @param s the string to look for
 * @return true if the strings_ vector exists and contains s
 */
UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
    return strings_ != nullptr && strings_->contains((void*) &s);
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
|
@ -171,7 +171,7 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilte
|
|||
if (o.hasStrings()) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
if (!allocateStrings(status) ||
|
||||
(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
|
||||
(strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
|
||||
setToBogus();
|
||||
return;
|
||||
}
|
||||
|
@ -195,7 +195,7 @@ UnicodeSet::~UnicodeSet() {
|
|||
if (buffer != stackList) {
|
||||
uprv_free(buffer);
|
||||
}
|
||||
delete strings;
|
||||
delete strings_;
|
||||
delete stringSpan;
|
||||
releasePattern();
|
||||
}
|
||||
|
@ -233,16 +233,16 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
|
|||
}
|
||||
if (o.hasStrings()) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
if ((strings == nullptr && !allocateStrings(status)) ||
|
||||
(strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
|
||||
if ((strings_ == nullptr && !allocateStrings(status)) ||
|
||||
(strings_->assign(*o.strings_, cloneUnicodeString, status), U_FAILURE(status))) {
|
||||
setToBogus();
|
||||
return *this;
|
||||
}
|
||||
} else if (hasStrings()) {
|
||||
strings->removeAllElements();
|
||||
strings_->removeAllElements();
|
||||
}
|
||||
if (o.stringSpan != nullptr && !asThawed) {
|
||||
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
|
||||
stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings_);
|
||||
if (stringSpan == nullptr) { // Check for memory allocation error.
|
||||
setToBogus();
|
||||
return *this;
|
||||
|
@ -284,7 +284,7 @@ bool UnicodeSet::operator==(const UnicodeSet& o) const {
|
|||
if (list[i] != o.list[i]) return false;
|
||||
}
|
||||
if (hasStrings() != o.hasStrings()) { return false; }
|
||||
if (hasStrings() && *strings != *o.strings) return false;
|
||||
if (hasStrings() && *strings_ != *o.strings_) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -450,7 +450,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
|
||||
return !c.hasStrings() || (strings_ != nullptr && strings_->containsAll(*c.strings_));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -495,7 +495,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
|
|||
return false;
|
||||
}
|
||||
}
|
||||
return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
|
||||
return strings_ == nullptr || !c.hasStrings() || strings_->containsNone(*c.strings_);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -536,8 +536,8 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
|
|||
}
|
||||
}
|
||||
if (hasStrings()) {
|
||||
for (i=0; i<strings->size(); ++i) {
|
||||
const UnicodeString& s = *static_cast<const UnicodeString*>(strings->elementAt(i));
|
||||
for (i=0; i<strings_->size(); ++i) {
|
||||
const UnicodeString& s = *static_cast<const UnicodeString*>(strings_->elementAt(i));
|
||||
if (s.isEmpty()) {
|
||||
continue; // skip the empty string
|
||||
}
|
||||
|
@ -586,8 +586,8 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text,
|
|||
// return the longest match.
|
||||
int32_t highWaterLength = 0;
|
||||
|
||||
for (i=0; i<strings->size(); ++i) {
|
||||
const UnicodeString& trial = *static_cast<const UnicodeString*>(strings->elementAt(i));
|
||||
for (i=0; i<strings_->size(); ++i) {
|
||||
const UnicodeString& trial = *static_cast<const UnicodeString*>(strings_->elementAt(i));
|
||||
if (trial.isEmpty()) {
|
||||
continue; // skip the empty string
|
||||
}
|
||||
|
@ -962,15 +962,15 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
|
|||
}
|
||||
|
||||
/**
|
||||
* Adds the given string, in order, to 'strings'. The given string
|
||||
* must have been checked by the caller to not already be in 'strings'.
|
||||
* Adds the given string, in order, to 'strings_'. The given string
|
||||
* must have been checked by the caller to not already be in 'strings_'.
|
||||
*/
|
||||
void UnicodeSet::_add(const UnicodeString& s) {
|
||||
if (isFrozen() || isBogus()) {
|
||||
return;
|
||||
}
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
if (strings == nullptr && !allocateStrings(ec)) {
|
||||
if (strings_ == nullptr && !allocateStrings(ec)) {
|
||||
setToBogus();
|
||||
return;
|
||||
}
|
||||
|
@ -979,7 +979,7 @@ void UnicodeSet::_add(const UnicodeString& s) {
|
|||
setToBogus();
|
||||
return;
|
||||
}
|
||||
strings->sortedInsert(t, compareUnicodeString, ec);
|
||||
strings_->sortedInsert(t, compareUnicodeString, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
setToBogus();
|
||||
}
|
||||
|
@ -1058,7 +1058,7 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
|
|||
|
||||
UnicodeSet& UnicodeSet::removeAllStrings() {
|
||||
if (!isFrozen() && hasStrings()) {
|
||||
strings->removeAllElements();
|
||||
strings_->removeAllElements();
|
||||
releasePattern();
|
||||
}
|
||||
return *this;
|
||||
|
@ -1176,7 +1176,7 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
|
|||
if (isFrozen() || isBogus()) return *this;
|
||||
int32_t cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
if (strings != nullptr && strings->removeElement((void*) &s)) {
|
||||
if (strings_ != nullptr && strings_->removeElement((void*) &s)) {
|
||||
releasePattern();
|
||||
}
|
||||
} else {
|
||||
|
@ -1248,7 +1248,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
|
|||
int32_t cp = getSingleCP(s);
|
||||
if (cp < 0) {
|
||||
if (stringsContains(s)) {
|
||||
strings->removeElement((void*) &s);
|
||||
strings_->removeElement((void*) &s);
|
||||
} else {
|
||||
_add(s);
|
||||
}
|
||||
|
@ -1275,9 +1275,9 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
|
|||
}
|
||||
|
||||
// Add strings in order
|
||||
if ( c.strings!=nullptr ) {
|
||||
for (int32_t i=0; i<c.strings->size(); ++i) {
|
||||
const UnicodeString* s = static_cast<const UnicodeString*>(c.strings->elementAt(i));
|
||||
if ( c.strings_!=nullptr ) {
|
||||
for (int32_t i=0; i<c.strings_->size(); ++i) {
|
||||
const UnicodeString* s = static_cast<const UnicodeString*>(c.strings_->elementAt(i));
|
||||
if (!stringsContains(*s)) {
|
||||
_add(*s);
|
||||
}
|
||||
|
@ -1302,9 +1302,9 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
|
|||
retain(c.list, c.len, 0);
|
||||
if (hasStrings()) {
|
||||
if (!c.hasStrings()) {
|
||||
strings->removeAllElements();
|
||||
strings_->removeAllElements();
|
||||
} else {
|
||||
strings->retainAll(*c.strings);
|
||||
strings_->retainAll(*c.strings_);
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
|
@ -1325,7 +1325,7 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
|
|||
}
|
||||
retain(c.list, c.len, 2);
|
||||
if (hasStrings() && c.hasStrings()) {
|
||||
strings->removeAll(*c.strings);
|
||||
strings_->removeAll(*c.strings_);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
@ -1344,10 +1344,10 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
|
|||
}
|
||||
exclusiveOr(c.list, c.len, 0);
|
||||
|
||||
if (c.strings != nullptr) {
|
||||
for (int32_t i=0; i<c.strings->size(); ++i) {
|
||||
void* e = c.strings->elementAt(i);
|
||||
if (strings == nullptr || !strings->removeElement(e)) {
|
||||
if (c.strings_ != nullptr) {
|
||||
for (int32_t i=0; i<c.strings_->size(); ++i) {
|
||||
void* e = c.strings_->elementAt(i);
|
||||
if (strings_ == nullptr || !strings_->removeElement(e)) {
|
||||
_add(*static_cast<const UnicodeString*>(e));
|
||||
}
|
||||
}
|
||||
|
@ -1366,8 +1366,8 @@ UnicodeSet& UnicodeSet::clear() {
|
|||
list[0] = UNICODESET_HIGH;
|
||||
len = 1;
|
||||
releasePattern();
|
||||
if (strings != nullptr) {
|
||||
strings->removeAllElements();
|
||||
if (strings_ != nullptr) {
|
||||
strings_->removeAllElements();
|
||||
}
|
||||
// Remove bogus
|
||||
fFlags = 0;
|
||||
|
@ -1405,7 +1405,7 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
|
|||
}
|
||||
|
||||
/**
 * Returns the string stored at the given position in this set's string list.
 *
 * This function performs no checks of its own: strings_ is dereferenced
 * unconditionally and index is handed straight to elementAt(), so the caller
 * must make sure the set has a string list and that index is valid for it.
 *
 * @param index position of the requested string within strings_
 * @return the element at that position, viewed as a const UnicodeString*
 */
const UnicodeString* UnicodeSet::getString(int32_t index) const {
    // Only the renamed member is consulted; a second return statement that
    // went through the old 'strings' name made this one unreachable.
    return static_cast<const UnicodeString*>(strings_->elementAt(index));
}
|
||||
|
||||
/**
|
||||
|
@ -1439,9 +1439,9 @@ UnicodeSet& UnicodeSet::compact() {
|
|||
// else what the heck happened?! We allocated less memory!
|
||||
// Oh well. We'll keep our original array.
|
||||
}
|
||||
if (strings != nullptr && strings->isEmpty()) {
|
||||
delete strings;
|
||||
strings = nullptr;
|
||||
if (strings_ != nullptr && strings_->isEmpty()) {
|
||||
delete strings_;
|
||||
strings_ = nullptr;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
@ -1607,15 +1607,15 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) {
|
|||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
}
|
||||
strings = new UVector(uprv_deleteUObject,
|
||||
strings_ = new UVector(uprv_deleteUObject,
|
||||
uhash_compareUnicodeString, 1, status);
|
||||
if (strings == nullptr) { // Check for memory allocation error.
|
||||
if (strings_ == nullptr) { // Check for memory allocation error.
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return false;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
delete strings;
|
||||
strings = nullptr;
|
||||
delete strings_;
|
||||
strings_ = nullptr;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -2131,11 +2131,11 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
|
|||
}
|
||||
}
|
||||
|
||||
if (strings != nullptr) {
|
||||
for (int32_t i = 0; i<strings->size(); ++i) {
|
||||
if (strings_ != nullptr) {
|
||||
for (int32_t i = 0; i<strings_->size(); ++i) {
|
||||
result.append(u'{');
|
||||
_appendToPat(result,
|
||||
*static_cast<const UnicodeString*>(strings->elementAt(i)),
|
||||
*static_cast<const UnicodeString*>(strings_->elementAt(i)),
|
||||
escapeUnprintable);
|
||||
result.append(u'}');
|
||||
}
|
||||
|
@ -2175,7 +2175,7 @@ UnicodeSet *UnicodeSet::freeze() {
|
|||
|
||||
// Optimize contains() and span() and similar functions.
|
||||
if (hasStrings()) {
|
||||
stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
|
||||
stringSpan = new UnicodeSetStringSpan(*this, *strings_, UnicodeSetStringSpan::ALL);
|
||||
if (stringSpan == nullptr) {
|
||||
setToBogus();
|
||||
return this;
|
||||
|
@ -2216,7 +2216,7 @@ int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition sp
|
|||
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
|
||||
UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
|
||||
UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
|
||||
UnicodeSetStringSpan strSpan(*this, *strings, which);
|
||||
UnicodeSetStringSpan strSpan(*this, *strings_, which);
|
||||
if(strSpan.needsStringSpanUTF16()) {
|
||||
return strSpan.span(s, length, spanCondition);
|
||||
}
|
||||
|
@ -2253,7 +2253,7 @@ int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanConditio
|
|||
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
|
||||
UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
|
||||
UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
|
||||
UnicodeSetStringSpan strSpan(*this, *strings, which);
|
||||
UnicodeSetStringSpan strSpan(*this, *strings_, which);
|
||||
if(strSpan.needsStringSpanUTF16()) {
|
||||
return strSpan.spanBack(s, length, spanCondition);
|
||||
}
|
||||
|
@ -2291,7 +2291,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp
|
|||
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
|
||||
UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
|
||||
UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
|
||||
UnicodeSetStringSpan strSpan(*this, *strings, which);
|
||||
UnicodeSetStringSpan strSpan(*this, *strings_, which);
|
||||
if(strSpan.needsStringSpanUTF8()) {
|
||||
return strSpan.spanUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
|
||||
}
|
||||
|
@ -2329,7 +2329,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio
|
|||
uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
|
||||
UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
|
||||
UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
|
||||
UnicodeSetStringSpan strSpan(*this, *strings, which);
|
||||
UnicodeSetStringSpan strSpan(*this, *strings_, which);
|
||||
if(strSpan.needsStringSpanUTF8()) {
|
||||
return strSpan.spanBackUTF8(reinterpret_cast<const uint8_t*>(s), length, spanCondition);
|
||||
}
|
||||
|
|
|
@ -242,7 +242,7 @@ void UnicodeSet::closeOverCaseInsensitive(bool simple) {
|
|||
// therefore, start with no strings and add only those needed.
|
||||
// Do this before processing code points, because they may add strings.
|
||||
if (!simple && foldSet.hasStrings()) {
|
||||
foldSet.strings->removeAllElements();
|
||||
foldSet.strings_->removeAllElements();
|
||||
}
|
||||
|
||||
USetAdder sa = {
|
||||
|
@ -276,8 +276,8 @@ void UnicodeSet::closeOverCaseInsensitive(bool simple) {
|
|||
}
|
||||
if (hasStrings()) {
|
||||
UnicodeString str;
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
const UnicodeString* pStr = static_cast<const UnicodeString*>(strings->elementAt(j));
|
||||
for (int32_t j=0; j<strings_->size(); ++j) {
|
||||
const UnicodeString* pStr = static_cast<const UnicodeString*>(strings_->elementAt(j));
|
||||
if (simple) {
|
||||
if (scfString(*pStr, str)) {
|
||||
foldSet.remove(*pStr).add(str);
|
||||
|
@ -334,8 +334,8 @@ void UnicodeSet::closeOverAddCaseMappings() {
|
|||
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
|
||||
if (U_SUCCESS(status)) {
|
||||
#endif
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
const UnicodeString* pStr = static_cast<const UnicodeString*>(strings->elementAt(j));
|
||||
for (int32_t j=0; j<strings_->size(); ++j) {
|
||||
const UnicodeString* pStr = static_cast<const UnicodeString*>(strings_->elementAt(j));
|
||||
(str = *pStr).toLower(root);
|
||||
foldSet.add(str);
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/char16ptr.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
@ -306,12 +307,32 @@ uset_getRangeCount(const USet *set) {
|
|||
return ((const UnicodeSet *)set)->UnicodeSet::getRangeCount();
|
||||
}
|
||||
|
||||
/**
 * C API: returns the number of strings in the set, as reported by
 * USetAccess::getStringCount() for the UnicodeSet behind the USet handle.
 */
U_CAPI int32_t U_EXPORT2
uset_getStringCount(const USet *uset) {
    // A USet handle is a UnicodeSet object viewed through an opaque pointer type.
    const UnicodeSet *unicodeSet = reinterpret_cast<const UnicodeSet *>(uset);
    return USetAccess::getStringCount(*unicodeSet);
}
|
||||
|
||||
/**
 * C API: returns the number of items in the set, computed as the number of
 * code point ranges plus the number of strings.
 */
U_CAPI int32_t U_EXPORT2
uset_getItemCount(const USet* uset) {
    const UnicodeSet* unicodeSet = reinterpret_cast<const UnicodeSet*>(uset);
    const int32_t rangeCount = unicodeSet->getRangeCount();
    const int32_t stringCount = USetAccess::getStringCount(*unicodeSet);
    return rangeCount + stringCount;
}
|
||||
|
||||
/**
 * C API: returns a pointer to the contents of the index-th string of the set
 * and stores that string's length in *pLength.
 *
 * @param uset    the set to query
 * @param index   string index; valid values are 0 .. string count - 1
 * @param pLength receives the string length, or 0 when index is out of range;
 *                if this pointer is null the function returns nullptr
 *                without doing anything else
 * @return the string's buffer, or nullptr if pLength is null or index is
 *         out of range
 */
U_CAPI const UChar* U_EXPORT2
uset_getString(const USet *uset, int32_t index, int32_t *pLength) {
    if (pLength == nullptr) {
        return nullptr;
    }
    const UnicodeSet *unicodeSet = reinterpret_cast<const UnicodeSet *>(uset);
    const int32_t stringCount = USetAccess::getStringCount(*unicodeSet);
    const bool inRange = 0 <= index && index < stringCount;
    if (inRange) {
        const UnicodeString *str = USetAccess::getString(*unicodeSet, index);
        *pLength = str->length();
        return toUCharPtr(str->getBuffer());
    }
    // Out-of-range index: report an empty result.
    *pLength = 0;
    return nullptr;
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uset_getItem(const USet* uset, int32_t itemIndex,
|
||||
UChar32* start, UChar32* end,
|
||||
|
|
|
@ -61,7 +61,7 @@ UBool UnicodeSetIterator::next() {
|
|||
|
||||
if (nextString >= stringCount) return false;
|
||||
codepoint = static_cast<UChar32>(IS_STRING); // signal that value is actually a string
|
||||
string = static_cast<const UnicodeString*>(set->strings->elementAt(nextString++));
|
||||
string = static_cast<const UnicodeString*>(set->strings_->elementAt(nextString++));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -94,7 +94,7 @@ UBool UnicodeSetIterator::nextRange() {
|
|||
|
||||
if (nextString >= stringCount) return false;
|
||||
codepoint = static_cast<UChar32>(IS_STRING); // signal that value is actually a string
|
||||
string = static_cast<const UnicodeString*>(set->strings->elementAt(nextString++));
|
||||
string = static_cast<const UnicodeString*>(set->strings_->elementAt(nextString++));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -12,9 +12,11 @@
|
|||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "usettest.h"
|
||||
#include "unicode/ucnv.h"
|
||||
|
@ -104,6 +106,14 @@ UnicodeSetTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE_AUTO(TestEmptyString);
|
||||
TESTCASE_AUTO(TestSkipToStrings);
|
||||
TESTCASE_AUTO(TestPatternCodePointComplement);
|
||||
TESTCASE_AUTO(TestCodePointIterator);
|
||||
TESTCASE_AUTO(TestUSetCodePointIterator);
|
||||
TESTCASE_AUTO(TestRangeIterator);
|
||||
TESTCASE_AUTO(TestUSetRangeIterator);
|
||||
TESTCASE_AUTO(TestStringIterator);
|
||||
TESTCASE_AUTO(TestUSetStringIterator);
|
||||
TESTCASE_AUTO(TestElementIterator);
|
||||
TESTCASE_AUTO(TestUSetElementIterator);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -4259,3 +4269,223 @@ void UnicodeSetTest::TestPatternCodePointComplement() {
|
|||
assertFalse("[:Basic_Emoji:].complement() --> no bicycle", notBasic.contains(U'🚲'));
|
||||
}
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestCodePointIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestCodePointIterator");
|
||||
UnicodeSet set(u"[abcçカ🚴]", errorCode);
|
||||
UnicodeString result;
|
||||
for (UChar32 c : set.codePoints()) {
|
||||
// printf("set.codePoint U+%04lx\n", (long)c);
|
||||
result.append(u' ').append(c);
|
||||
}
|
||||
assertEquals(WHERE, u" a b c ç カ 🚴", result);
|
||||
|
||||
// codePoints() returns USetCodePoints for which explicit APIs are tested via USet.
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestUSetCodePointIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestUSetCodePointIterator");
|
||||
using U_HEADER_NESTED_NAMESPACE::USetCodePoints;
|
||||
LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, errorCode));
|
||||
UnicodeString result;
|
||||
for (UChar32 c : USetCodePoints(uset.getAlias())) {
|
||||
// printf("uset.codePoint U+%04lx\n", (long)c);
|
||||
result.append(u' ').append(c);
|
||||
}
|
||||
assertEquals(WHERE, u" a b c ç カ 🚴", result);
|
||||
|
||||
USetCodePoints range1(uset.getAlias());
|
||||
auto range2(range1); // copy constructor
|
||||
auto iter = range1.begin();
|
||||
auto limit = range2.end();
|
||||
// operator* with pre- and post-increment
|
||||
assertEquals(WHERE, u'a', *iter);
|
||||
++iter;
|
||||
assertEquals(WHERE, u'b', *iter);
|
||||
assertEquals(WHERE, u'c', *++iter);
|
||||
auto iter2(iter); // copy constructor
|
||||
assertEquals(WHERE, u'c', *iter2++);
|
||||
assertEquals(WHERE, u'ç', *iter2++);
|
||||
assertEquals(WHERE, u'カ', *iter2);
|
||||
assertTrue(WHERE, ++iter2 != limit);
|
||||
auto iter3(iter2++);
|
||||
assertEquals(WHERE, U'🚴', *iter3);
|
||||
assertTrue(WHERE, iter2 == limit);
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestRangeIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestRangeIterator");
|
||||
UnicodeSet set(u"[abcçカ🚴]", errorCode);
|
||||
UnicodeString result;
|
||||
for (auto [start, end] : set.ranges()) {
|
||||
// printf("set.range U+%04lx..U+%04lx\n", (long)start, (long)end);
|
||||
result.append(u' ').append(start).append(u'-').append(end);
|
||||
}
|
||||
assertEquals(WHERE, u" a-c ç-ç カ-カ 🚴-🚴", result);
|
||||
result.remove();
|
||||
for (auto range : set.ranges()) {
|
||||
for (UChar32 c : range) {
|
||||
// printf("set.range.c U+%04lx\n", (long)c);
|
||||
result.append(u' ').append(c);
|
||||
}
|
||||
result.append(u" |");
|
||||
}
|
||||
assertEquals(WHERE, u" a b c | ç | カ | 🚴 |", result);
|
||||
|
||||
// ranges() returns USetRanges for which explicit APIs are tested via USet.
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestUSetRangeIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestUSetRangeIterator");
|
||||
using U_HEADER_NESTED_NAMESPACE::USetRanges;
|
||||
using U_HEADER_NESTED_NAMESPACE::CodePointRange;
|
||||
LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, errorCode));
|
||||
UnicodeString result;
|
||||
for (auto [start, end] : USetRanges(uset.getAlias())) {
|
||||
// printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end);
|
||||
result.append(u' ').append(start).append(u'-').append(end);
|
||||
}
|
||||
assertEquals(WHERE, u" a-c ç-ç カ-カ 🚴-🚴", result);
|
||||
result.remove();
|
||||
for (auto range : USetRanges(uset.getAlias())) {
|
||||
for (UChar32 c : range) {
|
||||
// printf("uset.range.c U+%04lx\n", (long)c);
|
||||
result.append(u' ').append(c);
|
||||
}
|
||||
result.append(u" |");
|
||||
}
|
||||
assertEquals(WHERE, u" a b c | ç | カ | 🚴 |", result);
|
||||
|
||||
USetRanges range1(uset.getAlias());
|
||||
auto range2(range1); // copy constructor
|
||||
auto iter = range1.begin();
|
||||
auto limit = range2.end();
|
||||
// operator* with pre- and post-increment
|
||||
{
|
||||
auto cpRange = *iter;
|
||||
assertEquals(WHERE, u'a', cpRange.rangeStart);
|
||||
assertEquals(WHERE, u'c', cpRange.rangeEnd);
|
||||
assertEquals(WHERE, 3, cpRange.size());
|
||||
auto cpRange2(cpRange);
|
||||
auto cpIter = cpRange.begin();
|
||||
auto cpLimit = cpRange2.end();
|
||||
assertEquals(WHERE, u'a', *cpIter++);
|
||||
assertEquals(WHERE, u'b', *cpIter);
|
||||
assertTrue(WHERE, cpIter != cpLimit);
|
||||
CodePointRange::iterator cpIter2(u'b'); // public constructor
|
||||
assertTrue(WHERE, cpIter == cpIter2);
|
||||
assertEquals(WHERE, u'c', *++cpIter);
|
||||
assertTrue(WHERE, cpIter != cpIter2);
|
||||
assertTrue(WHERE, ++cpIter == cpLimit);
|
||||
}
|
||||
++iter;
|
||||
auto iter2(iter); // copy constructor
|
||||
assertEquals(WHERE, u'ç', (*iter2).rangeStart);
|
||||
assertEquals(WHERE, u'ç', (*iter2).rangeEnd);
|
||||
assertEquals(WHERE, 1, (*iter2).size());
|
||||
assertEquals(WHERE, u'ç', (*iter2++).rangeStart);
|
||||
assertEquals(WHERE, u'カ', (*iter2).rangeStart);
|
||||
assertTrue(WHERE, ++iter2 != limit);
|
||||
auto iter3(iter2++);
|
||||
assertEquals(WHERE, U'🚴', (*iter3).rangeStart);
|
||||
assertTrue(WHERE, iter2 == limit);
|
||||
|
||||
{
|
||||
CodePointRange cpRange(u'h', u'k'); // public constructor
|
||||
// FYI: currently no operator==
|
||||
assertEquals(WHERE, u'h', cpRange.rangeStart);
|
||||
assertEquals(WHERE, u'k', cpRange.rangeEnd);
|
||||
assertEquals(WHERE, 4, cpRange.size());
|
||||
assertEquals(WHERE, u'i', *++(cpRange.begin()));
|
||||
}
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestStringIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestStringIterator");
|
||||
UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode);
|
||||
UnicodeString result;
|
||||
for (auto s : set.strings()) {
|
||||
// UnicodeString us(s);
|
||||
// std::string u8;
|
||||
// printf("set.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
|
||||
result.append(u" \"").append(s).append(u'"');
|
||||
}
|
||||
assertEquals(WHERE, uR"( "" "abc" "de")", result);
|
||||
|
||||
// strings() returns USetStrins for which explicit APIs are tested via USet.
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestUSetStringIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestUSetStringIterator");
|
||||
using U_HEADER_NESTED_NAMESPACE::USetStrings;
|
||||
LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, errorCode));
|
||||
UnicodeString result;
|
||||
for (auto s : USetStrings(uset.getAlias())) {
|
||||
// UnicodeString us(s);
|
||||
// std::string u8;
|
||||
// printf("uset.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
|
||||
result.append(u" \"").append(s).append(u'"');
|
||||
}
|
||||
assertEquals(WHERE, uR"( "" "abc" "de")", result);
|
||||
|
||||
USetStrings range1(uset.getAlias());
|
||||
auto range2(range1); // copy constructor
|
||||
auto iter = range1.begin();
|
||||
auto limit = range2.end();
|
||||
// operator* with pre- and post-increment
|
||||
assertEquals(WHERE, UnicodeString(), UnicodeString(*iter));
|
||||
assertEquals(WHERE, u"abc", UnicodeString(*++iter));
|
||||
auto iter2(iter); // copy constructor
|
||||
assertEquals(WHERE, u"abc", UnicodeString(*iter2++));
|
||||
assertTrue(WHERE, iter2 != limit);
|
||||
auto iter3(iter2++);
|
||||
assertEquals(WHERE, u"de", UnicodeString(*iter3));
|
||||
assertTrue(WHERE, iter2 == limit);
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestElementIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestElementIterator");
|
||||
UnicodeSet set(u"[abcçカ🚴{}{abc}{de}]", errorCode);
|
||||
UnicodeString result;
|
||||
for (auto el : set) {
|
||||
// std::string u8;
|
||||
// printf("set.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
|
||||
result.append(u" \"").append(el).append(u'"');
|
||||
}
|
||||
assertEquals(WHERE, uR"( "a" "b" "c" "ç" "カ" "🚴" "" "abc" "de")", result);
|
||||
|
||||
// begin() & end() return USetElementIterator for which explicit APIs are tested via USet.
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestUSetElementIterator() {
|
||||
IcuTestErrorCode errorCode(*this, "TestUSetElementIterator");
|
||||
using U_HEADER_NESTED_NAMESPACE::USetElements;
|
||||
LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴{}{abc}{de}]", -1, errorCode));
|
||||
UnicodeString result;
|
||||
for (auto el : USetElements(uset.getAlias())) {
|
||||
// std::string u8;
|
||||
// printf("uset.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
|
||||
result.append(u" \"").append(el).append(u'"');
|
||||
}
|
||||
assertEquals(WHERE, uR"( "a" "b" "c" "ç" "カ" "🚴" "" "abc" "de")", result);
|
||||
|
||||
USetElements range1(uset.getAlias());
|
||||
auto range2(range1); // copy constructor
|
||||
auto iter = range1.begin();
|
||||
auto limit = range2.end();
|
||||
// operator* with pre- and post-increment
|
||||
assertEquals(WHERE, u"a", *iter);
|
||||
++iter;
|
||||
assertEquals(WHERE, u"b", *iter);
|
||||
assertEquals(WHERE, u"c", *++iter);
|
||||
auto iter2(iter); // copy constructor
|
||||
assertEquals(WHERE, u"c", *iter2++);
|
||||
// skip çカ🚴
|
||||
++++++iter2;
|
||||
assertEquals(WHERE, UnicodeString(), *iter2++);
|
||||
assertEquals(WHERE, u"abc", *iter2);
|
||||
assertTrue(WHERE, ++iter2 != limit);
|
||||
auto iter3(iter2++);
|
||||
assertEquals(WHERE, u"de", *iter3);
|
||||
assertTrue(WHERE, iter2 == limit);
|
||||
}
|
||||
|
|
|
@ -105,6 +105,15 @@ private:
|
|||
void TestSkipToStrings();
|
||||
void TestPatternCodePointComplement();
|
||||
|
||||
void TestCodePointIterator();
|
||||
void TestUSetCodePointIterator();
|
||||
void TestRangeIterator();
|
||||
void TestUSetRangeIterator();
|
||||
void TestStringIterator();
|
||||
void TestUSetStringIterator();
|
||||
void TestElementIterator();
|
||||
void TestUSetElementIterator();
|
||||
|
||||
private:
|
||||
|
||||
UBool toPatternAux(UChar32 start, UChar32 end);
|
||||
|
|
Loading…
Add table
Reference in a new issue