From 2cfa2ab7bbfadf171aead7159df147d3604b79be Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 14 Jan 2003 21:18:52 +0000 Subject: [PATCH] ICU-2397 move u_strCompareIter to ustring.c X-SVN-Rev: 10856 --- icu4c/source/common/unicode/ustring.h | 25 ++++++ icu4c/source/common/ustring.c | 105 +++++++++++++++++++++++++- icu4c/source/test/cintltst/custrtst.c | 104 +------------------------ 3 files changed, 131 insertions(+), 103 deletions(-) diff --git a/icu4c/source/common/unicode/ustring.h b/icu4c/source/common/unicode/ustring.h index 19e7a90991b..c3d9930a5f6 100644 --- a/icu4c/source/common/unicode/ustring.h +++ b/icu4c/source/common/unicode/ustring.h @@ -15,7 +15,9 @@ #ifndef USTRING_H #define USTRING_H + #include "unicode/utypes.h" +#include "unicode/uiter.h" /** Simple declaration for u_strToTitle() to avoid including unicode/ubrk.h. @draft ICU 2.1*/ #ifndef UBRK_TYPEDEF_UBREAK_ITERATOR @@ -458,6 +460,29 @@ u_strCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, UBool codePointOrder); +/** + * Compare two Unicode strings (binary order) + * as presented by UCharIterator objects. + * Works otherwise just like u_strCompare(). + * + * Both iterators are reset to their start positions. + * When the function returns, it is undefined where the iterators + * have stopped. + * + * @param iter1 First source string iterator. + * @param iter2 Second source string iterator. + * @param codePointOrder Choose between code unit order (FALSE) + * and code point order (TRUE). 
+ * + * @return <0 or 0 or >0 as usual for string comparisons + * + * @see u_strCompare + * + * @draft ICU 2.6 + */ +U_CAPI int32_t U_EXPORT2 +u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder); + #ifndef U_COMPARE_CODE_POINT_ORDER /* see also unistr.h and unorm.h */ /** diff --git a/icu4c/source/common/ustring.c b/icu4c/source/common/ustring.c index a54e83cdeb0..d569a896df9 100644 --- a/icu4c/source/common/ustring.c +++ b/icu4c/source/common/ustring.c @@ -17,6 +17,7 @@ #include "unicode/utypes.h" #include "unicode/uchar.h" +#include "unicode/uiter.h" #include "unicode/ustring.h" #include "unicode/putil.h" #include "unicode/ucnv.h" @@ -822,10 +823,112 @@ uprv_strCompare(const UChar *s1, int32_t length1, } } - /* now c1 and c2 are in UTF-32-compatible order */ + /* now c1 and c2 are in the requested (code unit or code point) order */ return (int32_t)c1-(int32_t)c2; } +/* + * Compare two strings as presented by UCharIterators. + * Use code unit or code point order. + * When the function returns, it is undefined where the iterators + * have stopped. + */ +U_CAPI int32_t U_EXPORT2 +u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { + UChar32 c1, c2; + + /* argument checking */ + if(iter1==NULL || iter2==NULL) { + return 0; /* bad arguments */ + } + if(iter1==iter2) { + return 0; /* identical iterators */ + } + + /* reset iterators to start? 
*/ + iter1->move(iter1, 0, UITER_START); + iter2->move(iter2, 0, UITER_START); + + /* compare identical prefixes - they do not need to be fixed up */ + for(;;) { + c1=iter1->next(iter1); + c2=iter2->next(iter2); + if(c1!=c2) { + break; + } + if(c1==-1) { + return 0; + } + } + + /* if both values are in or above the surrogate range, fix them up */ + if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { + /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ + if( + (c1<=0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) || + (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1)))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make current(iter2))) || + (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2)))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make =0. + * + * Consistently leaving them _behind_ the different units is not an option + * because the current "unit" is the end of the string if that is reached, + * and in such a case the iterator does not move. + * For example, when comparing "ab" with "abc", both iterators rest _on_ the end + * of their strings. Calling previous() on each does not move them to where + * the comparison fails. + * + * So the simplest semantics is to not define where the iterators end up. + * + * The following fragment is part of what would need to be done for backing up. 
+ */ +void fragment { + /* iff a surrogate is part of a surrogate pair, leave >=d800 */ + if(c1<=0xdbff) { + if(!UTF_IS_TRAIL(iter1->current(iter1))) { + /* lead surrogate code point - make getIndex(iter1, UITER_CURRENT); + iter1->previous(iter1); /* ==c1 */ + if(!UTF_IS_LEAD(iter1->previous(iter1))) { + /* trail surrogate code point - make move(iter1, index, UITER_ZERO); + } else /* 0xe000<=c1<=0xffff */ { + /* BMP code point - make #include -/* ### TODO prototype ------------------------------------------------------- */ - -#include "unicode/uiter.h" - -/* - * Compare two strings as presented by UCharIterators. - * Use code unit or code point order. - * When the function returns, it is undefined where the iterators - * have stopped. - */ -U_CAPI int32_t U_EXPORT2 -u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { - UChar32 c1, c2; - - /* ### TODO: iterate from current positions or reset to start? reset for now */ - iter1->move(iter1, 0, UITER_START); - iter2->move(iter2, 0, UITER_START); - - /* compare identical prefixes - they do not need to be fixed up */ - for(;;) { - c1=iter1->next(iter1); - c2=iter2->next(iter2); - if(c1!=c2) { - break; - } - if(c1==-1) { - return 0; - } - } - - /* if both values are in or above the surrogate range, fix them up */ - if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { - /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ - if( - (c1<=0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) || - (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1)))) - ) { - /* part of a surrogate pair, leave >=d800 */ - } else { - /* BMP code point - may be surrogate code point - make current(iter2))) || - (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2)))) - ) { - /* part of a surrogate pair, leave >=d800 */ - } else { - /* BMP code point - may be surrogate code point - make =0. 
- * - * Consistently leaving them _behind_ the different units is not an option - * because the current "unit" is the end of the string if that is reached, - * and in such a case the iterator does not move. - * For example, when comparing "ab" with "abc", both iterators rest _on_ the end - * of their strings. Calling previous() on each does not move them to where - * the comparison fails. - * - * So the simplest semantics for now is to not define where the iterators end up. - * - * The following fragment is part of what needs to be done for backing up. - */ -void fragment { - /* iff a surrogate is part of a surrogate pair, leave >=d800 */ - if(c1<=0xdbff) { - if(!UTF_IS_TRAIL(iter1->current(iter1))) { - /* lead surrogate code point - make getIndex(iter1, UITER_CURRENT); - iter1->previous(iter1); /* ==c1 */ - if(!UTF_IS_LEAD(iter1->previous(iter1))) { - /* trail surrogate code point - make move(iter1, index, UITER_ZERO); - } else /* 0xe000<=c1<=0xffff */ { - /* BMP code point - make =0) {