From 7c2d19d82803b391801cea1587f4000749dc0244 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 12 Apr 2000 19:36:30 +0000 Subject: [PATCH] ICU-176 UTF-16 support with CharacterIterator; new functions for more efficient iteration X-SVN-Rev: 1117 --- icu4c/source/common/chariter.cpp | 2 - icu4c/source/common/schriter.cpp | 208 ++++----------- icu4c/source/common/uchriter.cpp | 352 +++++++++++++++++++------ icu4c/source/common/unicode/chariter.h | 102 +++++-- icu4c/source/common/unicode/schriter.h | 107 ++------ icu4c/source/common/unicode/uchriter.h | 154 ++++++++--- 6 files changed, 553 insertions(+), 372 deletions(-) diff --git a/icu4c/source/common/chariter.cpp b/icu4c/source/common/chariter.cpp index db86df954ae..bfaeed34159 100644 --- a/icu4c/source/common/chariter.cpp +++ b/icu4c/source/common/chariter.cpp @@ -8,7 +8,5 @@ #include "unicode/chariter.h" -const UChar CharacterIterator::DONE = 0xffff; - CharacterIterator::~CharacterIterator() {} diff --git a/icu4c/source/common/schriter.cpp b/icu4c/source/common/schriter.cpp index 437d0212f4f..a50c82856e9 100644 --- a/icu4c/source/common/schriter.cpp +++ b/icu4c/source/common/schriter.cpp @@ -20,188 +20,94 @@ UClassID StringCharacterIterator::fgClassID = 0; StringCharacterIterator::StringCharacterIterator() - : CharacterIterator(), - text(), - pos(0), - begin(0), - end(0) + : UCharCharacterIterator(), + text() { // NEVER DEFAULT CONSTRUCT! 
} StringCharacterIterator::StringCharacterIterator(const UnicodeString& text) - : CharacterIterator(), - text(text), - pos(0), - begin(0), - end(text.length()) -{} - -StringCharacterIterator::StringCharacterIterator(const UnicodeString& text, - UTextOffset pos) - : CharacterIterator(), - text(text), - pos(pos), - begin(0), - end(text.length()) + : UCharCharacterIterator(text.fArray, text.length()), + text(text) { - // the Java code checks the parameters and throws exceptions we've - // decided to punt on this for the time being because changing this - // constructor to accept an error code is an API change with - // significant impact + // we had set the input parameter's array, now we need to set our copy's array + UCharCharacterIterator::text = this->text.fArray; } -StringCharacterIterator::StringCharacterIterator(const UnicodeString& text, - UTextOffset begin, - UTextOffset end, - UTextOffset pos) - : CharacterIterator(), - text(text), - pos(pos), - begin(begin), - end(end) +StringCharacterIterator::StringCharacterIterator(const UnicodeString& text, + UTextOffset pos) + : UCharCharacterIterator(text.fArray, text.length(), pos), + text(text) { - // the Java code checks the parameters and throws exceptions we've - // decided to punt on this for the time being because changing this - // constructor to accept an error code is an API change with - // significant impact + // we had set the input parameter's array, now we need to set our copy's array + UCharCharacterIterator::text = this->text.fArray; +} + +StringCharacterIterator::StringCharacterIterator(const UnicodeString& text, + UTextOffset begin, + UTextOffset end, + UTextOffset pos) + : UCharCharacterIterator(text.fArray, text.length(), begin, end, pos), + text(text) +{ + // we had set the input parameter's array, now we need to set our copy's array + UCharCharacterIterator::text = this->text.fArray; } StringCharacterIterator::StringCharacterIterator(const StringCharacterIterator& that) - : 
CharacterIterator(that), - text(that.text), - pos(that.pos), - begin(that.begin), - end(that.end) + : UCharCharacterIterator(that), + text(that.text) { + // we had set the input parameter's array, now we need to set our copy's array + UCharCharacterIterator::text = this->text.fArray; } -StringCharacterIterator::~StringCharacterIterator() -{} +StringCharacterIterator::~StringCharacterIterator() { +} StringCharacterIterator& -StringCharacterIterator::operator=(const StringCharacterIterator& that) -{ - text = that.text; - pos = that.pos; - begin = that.begin; - end = that.end; - return *this; +StringCharacterIterator::operator=(const StringCharacterIterator& that) { + UCharCharacterIterator::operator=(that); + text = that.text; + // we had set the input parameter's array, now we need to set our copy's array + UCharCharacterIterator::text = this->text.fArray; + return *this; } bool_t -StringCharacterIterator::operator==(const CharacterIterator& that) const -{ - if (this == &that) - return TRUE; - - if (getDynamicClassID() != that.getDynamicClassID()) - return FALSE; +StringCharacterIterator::operator==(const CharacterIterator& that) const { + if (this == &that) { + return TRUE; + } - StringCharacterIterator& realThat = (StringCharacterIterator&)that; + // do not call UCharCharacterIterator::operator==() + // because that checks for array pointer equality + // while we compare UnicodeString objects - return text == realThat.text - && pos == realThat.pos - && begin == realThat.begin - && end == realThat.end; -} + if (getDynamicClassID() != that.getDynamicClassID()) { + return FALSE; + } -int32_t -StringCharacterIterator::hashCode() const -{ - return text.hashCode() ^ pos ^ begin ^ end; + StringCharacterIterator& realThat = (StringCharacterIterator&)that; + + return text == realThat.text + && pos == realThat.pos + && begin == realThat.begin + && end == realThat.end; } CharacterIterator* -StringCharacterIterator::clone() const -{ - return new 
StringCharacterIterator(*this); -} - -UChar -StringCharacterIterator::first() -{ - pos = begin; - return text.charAt(pos); -} - -UChar -StringCharacterIterator::last() -{ - pos = end - 1; - return text.charAt(pos); -} - -UChar -StringCharacterIterator::setIndex(UTextOffset pos) -{ - // should check "pos" here and return an error code, but changing - // this function would have significant impact across TIFC, so we - // decided to hold off - this->pos = pos; - return text.charAt(pos); -} - -UChar -StringCharacterIterator::current() const -{ - if (pos >= begin && pos < end) - return text.charAt(pos); - else - return CharacterIterator::DONE; -} - -UChar -StringCharacterIterator::next() -{ - if(pos < end - 1) { - return text.charAt(++pos); - } - else { - pos = end; - return CharacterIterator::DONE; - } -} - -UChar -StringCharacterIterator::previous() -{ - if (pos > begin) - return text.charAt(--pos); - else - return DONE; -} - -UTextOffset -StringCharacterIterator::startIndex() const -{ - return begin; -} - -UTextOffset -StringCharacterIterator::endIndex() const -{ - return end; -} - -UTextOffset -StringCharacterIterator::getIndex() const -{ - return pos; +StringCharacterIterator::clone() const { + return new StringCharacterIterator(*this); } void -StringCharacterIterator::setText(const UnicodeString& newText) -{ +StringCharacterIterator::setText(const UnicodeString& newText) { text = newText; - begin = 0; - end = newText.length(); - pos = begin; + UCharCharacterIterator::setText(text.fArray, text.length()); } void -StringCharacterIterator::getText(UnicodeString& result) -{ - result = text; +StringCharacterIterator::getText(UnicodeString& result) { + result = text; } - diff --git a/icu4c/source/common/uchriter.cpp b/icu4c/source/common/uchriter.cpp index 9df65b3c208..8e42c994e91 100644 --- a/icu4c/source/common/uchriter.cpp +++ b/icu4c/source/common/uchriter.cpp @@ -6,20 +6,89 @@ */ #include "unicode/uchriter.h" +#include "uhash.h" + 
+UCharCharacterIterator::UCharCharacterIterator()
+  : CharacterIterator(),
+  text(0),
+  textLength(0),
+  pos(0),
+  begin(0),
+  end(0)
+{
+    // never default construct!
+}
 
 UCharCharacterIterator::UCharCharacterIterator(const UChar* text,
-                               int32_t textLength)
+                                               int32_t textLength)
   : CharacterIterator(),
   text(text),
+  textLength(textLength),
   pos(0),
   begin(0),
   end(textLength)
 {
+    if(text == 0 || textLength < 0) {
+        this->textLength = end = 0; // bare "textLength" is the parameter, which shadows the member
+    }
+}
+
+UCharCharacterIterator::UCharCharacterIterator(const UChar* text,
+                                               int32_t textLength,
+                                               UTextOffset pos)
+  : CharacterIterator(),
+  text(text),
+  textLength(textLength),
+  pos(pos),
+  begin(0),
+  end(textLength)
+{
+    if(text == 0 || textLength < 0) {
+        this->textLength = end = 0; // write the member, not the shadowing parameter
+    }
+    if(pos < 0) {
+        this->pos = 0; // pin the member; bare "pos" is the parameter
+    } else if(pos > end) {
+        this->pos = end; // "end" is the member here and is already pinned
+    }
+}
+
+UCharCharacterIterator::UCharCharacterIterator(const UChar* text,
+                                               int32_t textLength,
+                                               UTextOffset begin,
+                                               UTextOffset end,
+                                               UTextOffset pos)
+  : CharacterIterator(),
+  text(text),
+  textLength(textLength),
+  pos(pos),
+  begin(begin),
+  end(end)
+{
+    if(text == 0 || textLength < 0) {
+        this->textLength = textLength = 0; // fix member and parameter: the checks below read the parameter
+    }
+    if(begin < 0) {
+        this->begin = begin = 0;
+    } else if(begin > textLength) {
+        this->begin = begin = textLength;
+    }
+    if(end < begin) {
+        this->end = end = begin;
+    } else if(end > textLength) {
+        this->end = end = textLength;
+    }
+    if(pos < begin) {
+        this->pos = begin; // "begin"/"end" parameters now hold the pinned range
+    } else if(pos > end) {
+        this->pos = end;
+    }
 }
 
 UCharCharacterIterator::UCharCharacterIterator(const UCharCharacterIterator& that)
   : CharacterIterator(that),
   text(that.text),
+  textLength(that.textLength),
   pos(that.pos),
   begin(that.begin),
   end(that.end)
@@ -27,134 +96,261 @@ UCharCharacterIterator::UCharCharacterIterator(const UCharCharacterIterator& tha
 }
 
 UCharCharacterIterator&
-UCharCharacterIterator::operator=(const UCharCharacterIterator& that)
-{
+UCharCharacterIterator::operator=(const UCharCharacterIterator& that) {
     text = that.text;
+    textLength = that.textLength;
     pos = that.pos;
     begin = that.begin;
     end = that.end;
     return *this;
 }
-UCharCharacterIterator::~UCharCharacterIterator() -{} +UCharCharacterIterator::~UCharCharacterIterator() { +} bool_t -UCharCharacterIterator::operator==(const CharacterIterator& that) const -{ - if (this == &that) +UCharCharacterIterator::operator==(const CharacterIterator& that) const { + if (this == &that) { return TRUE; + } - if (getDynamicClassID() != that.getDynamicClassID()) + if (getDynamicClassID() != that.getDynamicClassID()) { return FALSE; + } UCharCharacterIterator& realThat = (UCharCharacterIterator&)that; return text == realThat.text + && textLength == realThat.textLength && pos == realThat.pos && begin == realThat.begin && end == realThat.end; } int32_t -UCharCharacterIterator::hashCode() const -{ - return pos ^ begin ^ end; +UCharCharacterIterator::hashCode() const { + return uhash_hashUCharsN(text, textLength) ^ pos ^ begin ^ end; } CharacterIterator* -UCharCharacterIterator::clone() const -{ +UCharCharacterIterator::clone() const { return new UCharCharacterIterator(*this); } UChar -UCharCharacterIterator::first() -{ +UCharCharacterIterator::first() { pos = begin; - return text[pos]; -} - -UChar -UCharCharacterIterator::last() -{ - pos = end - 1; - return text[pos]; -} - -UChar -UCharCharacterIterator::setIndex(UTextOffset pos) -{ - // should check "pos" here and return an error code, but changing this - // function would have significant impact across TIFC, so we decided to hold off - this->pos = pos; - return text[pos]; -} - -UChar -UCharCharacterIterator::current() const -{ - if (pos >= begin && pos < end) + if(pos < end) { return text[pos]; - else - return CharacterIterator::DONE; -} - -UChar -UCharCharacterIterator::next() -{ - if (pos < end - 1) - { - pos += 1; - return text[pos]; - } - else - { - pos = end; - return CharacterIterator::DONE; - } -} - -UChar -UCharCharacterIterator::previous() -{ - if (pos > begin) - return text[--pos]; - else + } else { return DONE; + } } UTextOffset -UCharCharacterIterator::startIndex() const -{ 
+UCharCharacterIterator::setToStart() { + return pos = begin; +} + +UChar +UCharCharacterIterator::last() { + pos = end; + if(pos > begin) { + return text[--pos]; + } else { + return DONE; + } +} + +UTextOffset +UCharCharacterIterator::setToEnd() { + return pos = end; +} + +UChar +UCharCharacterIterator::setIndex(UTextOffset pos) { + if(pos < begin) { + pos = begin; + } else if(pos > end) { + pos = end; + } + this->pos = pos; + if(pos < end) { + return text[pos]; + } else { + return DONE; + } +} + +UChar +UCharCharacterIterator::current() const { + if (pos >= begin && pos < end) { + return text[pos]; + } else { + return DONE; + } +} + +UChar +UCharCharacterIterator::next() { + if (pos + 1 < end) { + return text[++pos]; + } else { + /* make current() return DONE */ + pos = end; + return DONE; + } +} + +UChar +UCharCharacterIterator::nextPostInc() { + if (pos < end) { + return text[pos++]; + } else { + return DONE; + } +} + +bool_t +UCharCharacterIterator::hasNext() { + return pos < end ? TRUE : FALSE; +} + +UChar +UCharCharacterIterator::previous() { + if (pos > begin) { + return text[--pos]; + } else { + return DONE; + } +} + +bool_t +UCharCharacterIterator::hasPrevious() { + return pos > begin ? 
TRUE : FALSE; +} + +UChar32 +UCharCharacterIterator::first32() { + pos = begin; + if(pos < end) { + UTextOffset i = pos; + UChar32 c; + UTF_NEXT_CHAR(text, i, end, c); + return c; + } else { + return DONE; + } +} + +UChar32 +UCharCharacterIterator::last32() { + pos = end; + if(pos > begin) { + UChar32 c; + UTF_PREV_CHAR(text, begin, pos, c); + return c; + } else { + return DONE; + } +} + +UChar32 +UCharCharacterIterator::setIndex32(UTextOffset pos) { + if(pos < begin) { + pos = begin; + } else if(pos > end) { + pos = end; + } + if(pos < end) { + UTF_SET_CHAR_START(text, begin, pos); + UTextOffset i = this->pos = pos; + UChar32 c; + UTF_NEXT_CHAR(text, i, end, c); + return c; + } else { + this->pos = pos; + return DONE; + } +} + +UChar32 +UCharCharacterIterator::current32() const { + if (pos >= begin && pos < end) { + UChar32 c; + UTF_GET_CHAR(text, begin, pos, end, c); + return c; + } else { + return DONE; + } +} + +UChar32 +UCharCharacterIterator::next32() { + if (pos < end) { + UTF_FWD_1(text, pos, end); + if(pos < end) { + UTextOffset i = pos; + UChar32 c; + UTF_NEXT_CHAR(text, i, end, c); + return c; + } + } + /* make current() return DONE */ + pos = end; + return DONE; +} + +UChar32 +UCharCharacterIterator::next32PostInc() { + if (pos < end) { + UChar32 c; + UTF_NEXT_CHAR(text, pos, end, c); + return c; + } else { + return DONE; + } +} + +UChar32 +UCharCharacterIterator::previous32() { + if (pos > begin) { + UChar32 c; + UTF_PREV_CHAR(text, begin, pos, c); + return c; + } else { + return DONE; + } +} + +UTextOffset +UCharCharacterIterator::startIndex() const { return begin; } UTextOffset -UCharCharacterIterator::endIndex() const -{ +UCharCharacterIterator::endIndex() const { return end; } UTextOffset -UCharCharacterIterator::getIndex() const -{ +UCharCharacterIterator::getIndex() const { return pos; } void UCharCharacterIterator::setText(const UChar* newText, - int32_t newTextLength) -{ + int32_t newTextLength) { text = newText; - begin = 0; - end = 
newTextLength; - pos = begin; + if(newText == 0 || newTextLength < 0) { + newTextLength = 0; + } + end = textLength = newTextLength; + pos = begin = 0; } void -UCharCharacterIterator::getText(UnicodeString& result) -{ - result = UnicodeString(text, end); +UCharCharacterIterator::getText(UnicodeString& result) { + result = UnicodeString(text, textLength); } char UCharCharacterIterator::fgClassID = 0; diff --git a/icu4c/source/common/unicode/chariter.h b/icu4c/source/common/unicode/chariter.h index 4a1bb9a0f21..e5f191055a0 100644 --- a/icu4c/source/common/unicode/chariter.h +++ b/icu4c/source/common/unicode/chariter.h @@ -88,7 +88,7 @@ public: /** * Value returned by most of CharacterIterator's functions * when the iterator has reached the limits of its iteration. */ - static const UChar DONE; + enum { DONE = 0xffff }; /** * Destructor. @@ -128,49 +128,120 @@ public: virtual int32_t hashCode(void) const = 0; /** - * Sets the iterator to refer to the first character in its - * iteration range, and returns that character, + * Sets the iterator to refer to the first code unit in its + * iteration range, and returns that code unit, * @draft */ virtual UChar first(void) = 0; /** - * Sets the iterator to refer to the last character in its - * iteration range, and returns that character. + * Sets the iterator to refer to the first code point in its + * iteration range, and returns that code point, + * @draft + */ + virtual UChar32 first32(void) = 0; + + virtual UTextOffset setToStart() = 0; + + /** + * Sets the iterator to refer to the last code unit in its + * iteration range, and returns that code unit. * @draft */ virtual UChar last(void) = 0; /** - * Sets the iterator to refer to the "position"-th character + * Sets the iterator to refer to the last code point in its + * iteration range, and returns that code point.
+ * @draft + */ + virtual UChar32 last32(void) = 0; + + virtual UTextOffset setToEnd() = 0; + + /** + * Sets the iterator to refer to the "position"-th code unit * in the text-storage object the iterator refers to, and - * returns that character. + * returns that code unit. * @draft */ virtual UChar setIndex(UTextOffset position) = 0; /** - * Returns the character the iterator currently refers to. + * Sets the iterator to refer to the beginning of the code point + * that contains the "position"-th code unit + * in the text-storage object the iterator refers to, and + * returns that code point. + * @draft + */ + virtual UChar32 setIndex32(UTextOffset position) = 0; + + /** + * Returns the code unit the iterator currently refers to. * @draft */ virtual UChar current(void) const = 0; /** - * Advances to the next character in the iteration range - * (toward last()), and returns that character. If there are - * no more characters to return, returns DONE. + * Returns the code point the iterator currently refers to. + * @draft + */ + virtual UChar32 current32(void) const = 0; + + /** + * Advances to the next code unit in the iteration range + * (toward last()), and returns that code unit. If there are + * no more code units to return, returns DONE. * @draft */ virtual UChar next(void) = 0; /** - * Advances to the previous character in the iteration rance - * (toward first()), and returns that character. If there are - * no more characters to return, returns DONE. + * Gets the current code unit for returning and advances to the next code unit + * in the iteration range + * (toward last()). If there are + * no more code units to return, returns DONE. + * @draft + */ + virtual UChar nextPostInc(void) = 0; + + /** + * Advances to the next code point in the iteration range + * (toward last()), and returns that code point. If there are + * no more code points to return, returns DONE. 
+ * @draft + */ + virtual UChar32 next32(void) = 0; + + /** + * Gets the current code point for returning and advances to the next code point + * in the iteration range + * (toward last()). If there are + * no more code points to return, returns DONE. + * @draft + */ + virtual UChar32 next32PostInc(void) = 0; + + virtual bool_t hasNext() = 0; + + /** + * Advances to the previous code unit in the iteration rance + * (toward first()), and returns that code unit. If there are + * no more code units to return, returns DONE. * @draft */ virtual UChar previous(void) = 0; + /** + * Advances to the previous code point in the iteration rance + * (toward first()), and returns that code point. If there are + * no more code points to return, returns DONE. + * @draft + */ + virtual UChar32 previous32(void) = 0; + + virtual bool_t hasPrevious() = 0; + /** * Returns the numeric index in the underlying text-storage * object of the character returned by first(). Since it's @@ -221,6 +292,3 @@ protected: }; #endif - - - diff --git a/icu4c/source/common/unicode/schriter.h b/icu4c/source/common/unicode/schriter.h index 80c68d36a68..e9dec4b744f 100644 --- a/icu4c/source/common/unicode/schriter.h +++ b/icu4c/source/common/unicode/schriter.h @@ -20,18 +20,22 @@ #include "unicode/utypes.h" #include "unicode/chariter.h" +#include "unicode/uchriter.h" /** * A concrete subclass of CharacterIterator that iterates over the - * characters in a UnicodeString. It's possible not only to create an + * characters (code units or code points) in a UnicodeString. + * It's possible not only to create an * iterator that iterates over an entire UnicodeString, but also to - * create only that iterates over only a subrange of a UnicodeString + * create one that iterates over only a subrange of a UnicodeString * (iterators over different subranges of the same UnicodeString don't - * compare equal). */ -class U_COMMON_API StringCharacterIterator : public CharacterIterator { + * compare equal). 
+ */ +class U_COMMON_API StringCharacterIterator : public UCharCharacterIterator { public: /** * Create an iterator over the UnicodeString referred to by "text". + * The UnicodeString object is copied. * The iteration range is the whole string, and the starting position is 0. * @stable */ @@ -49,8 +53,9 @@ public: /** * Create an iterator over the UnicodeString referred to by "text". - * The iteration range begins with the character specified by - * "begin" and ends with the character BEFORE the character specfied + * The UnicodeString object is copied. + * The iteration range begins with the code unit specified by + * "begin" and ends with the code unit BEFORE the code unit specfied * by "end". The starting position is specified by "pos". If * "begin" and "end" don't form a valid range on "text" (i.e., begin * >= end or either is negative or greater than text.size()), or @@ -67,6 +72,7 @@ public: * Copy constructor. The new iterator iterates over the same range * of the same string as "that", and its initial position is the * same as "that"'s current position. + * The UnicodeString object in "that" is copied. * @stable */ StringCharacterIterator(const StringCharacterIterator& that); @@ -78,7 +84,7 @@ public: virtual ~StringCharacterIterator(); /** - * Assignment operator. *this is altered to iterate over the sane + * Assignment operator. *this is altered to iterate over the same * range of the same string as "that", and refers to the same * character within that string as "that" does. * @stable @@ -93,12 +99,6 @@ public: */ virtual bool_t operator==(const CharacterIterator& that) const; - /** - * Generates a hash code for this iterator. - * @stable - */ - virtual int32_t hashCode(void) const; - /** * Returns a new StringCharacterIterator referring to the same * character in the same range of the same string as this one. 
The @@ -107,79 +107,12 @@ public: */ virtual CharacterIterator* clone(void) const; - /** - * Sets the iterator to refer to the first character in its - * iteration range, and returns that character, - * @draft - */ - virtual UChar first(void); - - /** - * Sets the iterator to refer to the last character in its iteration - * range, and returns that character. - * @draft - */ - virtual UChar last(void); - - /** - * Sets the iterator to refer to the "position"-th character in the - * UnicodeString the iterator refers to, and returns that character. - * If the index is outside the iterator's iteration range, the - * behavior of the iterator is undefined. - * @draft - */ - virtual UChar setIndex(UTextOffset pos); - - /** - * Returns the character the iterator currently refers to. - * @draft - */ - virtual UChar current(void) const; - - /** - * Advances to the next character in the iteration range (toward - * last()), and returns that character. If there are no more - * characters to return, returns DONE. - * @draft - */ - virtual UChar next(void); - - /** - * Advances to the previous character in the iteration rance (toward - * first()), and returns that character. If there are no more - * characters to return, returns DONE. - * @draft - */ - virtual UChar previous(void); - - /** - * Returns the numeric index of the first character in this - * iterator's iteration range. - * @stable - */ - virtual UTextOffset startIndex(void) const; - - /** - * Returns the numeric index of the character immediately BEYOND the - * last character in this iterator's iteration range. - * @stable - */ - virtual UTextOffset endIndex(void) const; - - /** - * Returns the numeric index in the underlying UnicodeString of the - * character the iterator currently refers to (i.e., the character - * returned by current()). - * @stable - */ - virtual UTextOffset getIndex(void) const; - /** * Sets the iterator to iterate over the provided string. 
* @draft */ - virtual void setText(const UnicodeString& newText); - + void setText(const UnicodeString& newText); + /** * Copies the UnicodeString under iteration into the UnicodeString * referred to by "result". Even if this iterator iterates across @@ -203,19 +136,13 @@ public: static UClassID getStaticClassID(void) { return (UClassID)(&fgClassID); } -private: +protected: StringCharacterIterator(); + void setText(const UChar* newText, int32_t newTextLength); UnicodeString text; - UTextOffset pos; - UTextOffset begin; - UTextOffset end; static UClassID fgClassID; }; #endif - - - - diff --git a/icu4c/source/common/unicode/uchriter.h b/icu4c/source/common/unicode/uchriter.h index 0dc61e9d2ad..0ca77f777c2 100644 --- a/icu4c/source/common/unicode/uchriter.h +++ b/icu4c/source/common/unicode/uchriter.h @@ -14,21 +14,53 @@ /** * A concrete subclass of CharacterIterator that iterates over the - * characters in a UnicodeString. It's possible not only to create an - * iterator that iterates over an entire UnicodeString, but also to - * create only that iterates over only a subrange of a UnicodeString - * (iterators over different subranges of the same UnicodeString don't - * compare equal). */ + * characters (code units or code points) in a UChar array. + * It's possible not only to create an + * iterator that iterates over an entire UChar array, but also to + * create one that iterates over only a subrange of a UChar array + * (iterators over different subranges of the same UChar array don't + * compare equal). + */ class U_COMMON_API UCharCharacterIterator : public CharacterIterator { public: /** - * Create an iterator over the UnicodeString referred to by "text". - * The iteration range is the whole string, and the starting - * position is 0. + * Create an iterator over the UChar array referred to by "text". + * The iteration range is 0 to len-1. + * text is only aliased, not adopted (the + * destructor will not delete it). 
* @stable */ UCharCharacterIterator(const UChar* text, int32_t len); + /** + * Create an iterator over the UChar array referred to by "text". + * The iteration range is 0 to len-1. + * text is only aliased, not adopted (the + * destructor will not delete it). + * The starting + * position is specified by "pos". If "pos" is outside the valid + * iteration range, the behavior of this object is undefined. + * @stable + */ + UCharCharacterIterator(const UChar* text, int32_t len, + UTextOffset pos); + + /** + * Create an iterator over the UChar array referred to by "text". + * The iteration range is begin to end-1. + * text is only aliased, not adopted (the + * destructor will not delete it). + * The starting + * position is specified by "pos". If begin and end do not + * form a valid iteration range or "pos" is outside the valid + * iteration range, the behavior of this object is undefined. + * @stable + */ + UCharCharacterIterator(const UChar* text, int32_t len, + UTextOffset begin, + UTextOffset end, + UTextOffset pos); + /** * Copy constructor. The new iterator iterates over the same range * of the same string as "that", and its initial position is the @@ -66,7 +98,7 @@ public: virtual int32_t hashCode(void) const; /** - * Returns a new StringCharacterIterator referring to the same + * Returns a new UCharCharacterIterator referring to the same * character in the same range of the same string as this one. The * caller must delete the new iterator. * @stable @@ -74,22 +106,40 @@ public: virtual CharacterIterator* clone(void) const; /** - * Sets the iterator to refer to the first character in its - * iteration range, and returns that character, + * Sets the iterator to refer to the first code unit in its + * iteration range, and returns that code unit, * @draft */ virtual UChar first(void); /** - * Sets the iterator to refer to the last character in its iteration - * range, and returns that character.
+ * Sets the iterator to refer to the first code point in its + * iteration range, and returns that code point, + * @draft + */ + virtual UChar32 first32(void); + + virtual UTextOffset setToStart(); + + /** + * Sets the iterator to refer to the last code unit in its iteration + * range, and returns that code unit. * @draft */ virtual UChar last(void); /** - * Sets the iterator to refer to the "position"-th character in the - * UnicodeString the iterator refers to, and returns that character. + * Sets the iterator to refer to the last code point in its iteration + * range, and returns that code point. + * @draft + */ + virtual UChar32 last32(void); + + virtual UTextOffset setToEnd(); + + /** + * Sets the iterator to refer to the "position"-th code unit in the + * UChar array the iterator refers to, and returns that code unit. * If the index is outside the iterator's iteration range, the * behavior of the iterator is undefined. * @draft @@ -97,44 +147,83 @@ public: virtual UChar setIndex(UTextOffset pos); /** - * Returns the character the iterator currently refers to. + * Sets the iterator to refer to the beginning of the code point that + * contains the "position"-th code unit, and returns that code point. + * If the index is outside the iterator's iteration range, the + * behavior of the iterator is undefined. + * @draft + */ + virtual UChar32 setIndex32(UTextOffset pos); + + /** + * Returns the code unit the iterator currently refers to. * @draft */ virtual UChar current(void) const; /** - * Advances to the next character in the iteration range (toward - * last()), and returns that character. If there are no more - * characters to return, returns DONE. + * Returns the code point the iterator currently refers to. + * @draft + */ + virtual UChar32 current32(void) const; + + /** + * Advances to the next code unit in the iteration range (toward + * last()), and returns that code unit. If there are no more + * code units to return, returns DONE.
* @draft */ virtual UChar next(void); + virtual UChar nextPostInc(void); + /** - * Advances to the previous character in the iteration rance (toward - * first()), and returns that character. If there are no more - * characters to return, returns DONE. + * Advances to the next code point in the iteration range (toward + * last()), and returns that code point. If there are no more + * code points to return, returns DONE. + * @draft + */ + virtual UChar32 next32(void); + + virtual UChar32 next32PostInc(void); + + virtual bool_t hasNext(); + + /** + * Advances to the previous code unit in the iteration rance (toward + * first()), and returns that code unit. If there are no more + * code units to return, returns DONE. * @draft */ virtual UChar previous(void); /** - * Returns the numeric index of the first character in this + * Advances to the previous code point in the iteration rance (toward + * first()), and returns that code point. If there are no more + * code points to return, returns DONE. + * @draft + */ + virtual UChar32 previous32(void); + + virtual bool_t hasPrevious(); + + /** + * Returns the numeric index of the first code unit in this * iterator's iteration range. * @stable */ virtual UTextOffset startIndex(void) const; /** - * Returns the numeric index of the character immediately BEYOND the - * last character in this iterator's iteration range. + * Returns the numeric index of the code unit immediately BEYOND the + * last code unit in this iterator's iteration range. * @stable */ virtual UTextOffset endIndex(void) const; /** - * Returns the numeric index in the underlying UnicodeString of the - * character the iterator currently refers to (i.e., the character + * Returns the numeric index in the underlying UChar array of the + * code unit the iterator currently refers to (i.e., the code unit * returned by current()). 
* @stable */ @@ -144,11 +233,10 @@ public: * Sets the iterator to iterate over a new range of text * @draft */ - virtual void setText(const UChar* newText, - int32_t newTextLength); + void setText(const UChar* newText, int32_t newTextLength); /** - * Copies the UnicodeString under iteration into the UnicodeString + * Copies the UChar array under iteration into the UnicodeString * referred to by "result". Even if this iterator iterates across * only a part of this string, the whole string is copied. @param * result Receives a copy of the text under iteration. @@ -170,10 +258,11 @@ public: static UClassID getStaticClassID(void) { return (UClassID)(&fgClassID); } -private: +protected: UCharCharacterIterator(); const UChar* text; + int32_t textLength; // need this for correct getText() and hashCode() UTextOffset pos; UTextOffset begin; UTextOffset end; @@ -182,6 +271,3 @@ private: }; #endif - - -