From b92c4979a1b81f429ed58d9073f12de8109a4c9b Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Sat, 11 Sep 2004 14:24:31 +0000 Subject: [PATCH] ICU-4078 make case-insensitive string comparisons independent of normalization code by duplicating parts of the unorm_compare() implementation X-SVN-Rev: 16300 --- icu4c/source/common/unistr_case.cpp | 4 +- icu4c/source/common/unorm.cpp | 263 +++++++++++---------- icu4c/source/common/unormimp.h | 13 -- icu4c/source/common/ustr_imp.h | 12 + icu4c/source/common/ustrcase.c | 341 ++++++++++++++++++++++++++-- 5 files changed, 482 insertions(+), 151 deletions(-) diff --git a/icu4c/source/common/unistr_case.cpp b/icu4c/source/common/unistr_case.cpp index be4eb820fed..f6f11f200f0 100644 --- a/icu4c/source/common/unistr_case.cpp +++ b/icu4c/source/common/unistr_case.cpp @@ -63,8 +63,8 @@ UnicodeString::doCaseCompare(int32_t start, if(chars != srcChars) { UErrorCode errorCode=U_ZERO_ERROR; - int32_t result=unorm_cmpEquivFold(chars, length, srcChars, srcLength, - options|U_COMPARE_IGNORE_CASE, &errorCode); + int32_t result=u_strcmpFold(chars, length, srcChars, srcLength, + options|U_COMPARE_IGNORE_CASE, &errorCode); if(result!=0) { return (int8_t)(result >> 24 | 1); } diff --git a/icu4c/source/common/unorm.cpp b/icu4c/source/common/unorm.cpp index 10820023fd9..44e2845ee4a 100644 --- a/icu4c/source/common/unorm.cpp +++ b/icu4c/source/common/unorm.cpp @@ -24,19 +24,17 @@ #include "unicode/utypes.h" -// moved up to make unorm_cmpEquivFold work without normalization -#include "unicode/ustring.h" -#include "unormimp.h" -#include "ucase.h" - #if !UCONFIG_NO_NORMALIZATION #include "unicode/udata.h" #include "unicode/uchar.h" +#include "unicode/ustring.h" #include "unicode/uiter.h" #include "unicode/uniset.h" #include "unicode/usetiter.h" #include "unicode/unorm.h" +#include "unormimp.h" +#include "ucase.h" #include "cmemory.h" #include "umutex.h" #include "utrie.h" @@ -4032,54 +4030,57 @@ _decompose(UChar32 /*c*/, UChar /*buffer*/[4], 
int32_t &/*length*/) { * would be a partial NFD before the case folding, which does not work. * Note that all of this is only a problem when case-folding _and_ * canonical equivalence come together. + * (Comments in unorm_compare() are more up to date than this TODO.) * * This function could be moved to a different source file, at increased cost * for calling the decomposition access function. */ -// stack element for previous-level source/decomposition pointers +/* stack element for previous-level source/decomposition pointers */ struct CmpEquivLevel { const UChar *start, *s, *limit; }; typedef struct CmpEquivLevel CmpEquivLevel; -// internal function -U_CAPI int32_t U_EXPORT2 +/* internal function */ +static int32_t unorm_cmpEquivFold(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, uint32_t options, UErrorCode *pErrorCode) { UCaseProps *csp; - // current-level start/limit - s1/s2 as current + /* current-level start/limit - s1/s2 as current */ const UChar *start1, *start2, *limit1, *limit2; - // decomposition variables + /* decomposition and case folding variables */ const UChar *p; int32_t length; - // stacks of previous-level start/current/limit + /* stacks of previous-level start/current/limit */ CmpEquivLevel stack1[2], stack2[2]; - // decomposition buffers for Hangul + /* decomposition buffers for Hangul */ UChar decomp1[4], decomp2[4]; - // case folding buffers, only use current-level start/limit + /* case folding buffers, only use current-level start/limit */ UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; - // track which is the current level per string + /* track which is the current level per string */ int32_t level1, level2; - // current code units, and code points for lookups - int32_t c1, c2, cp1, cp2; + /* current code units, and code points for lookups */ + UChar32 c1, c2, cp1, cp2; - // no argument error checking because this itself is not an API + /* no argument error checking because this itself is 
not an API */ - // assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set - // otherwise this function must behave exactly as uprv_strCompare() - // not checking for that here makes testing this function easier + /* + * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set + * otherwise this function must behave exactly as uprv_strCompare() + * not checking for that here makes testing this function easier + */ - // normalization/properties data loaded? + /* normalization/properties data loaded? */ if( ((options&_COMPARE_EQUIV)!=0 && !_haveData(*pErrorCode)) || U_FAILURE(*pErrorCode) ) { @@ -4094,7 +4095,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, csp=NULL; } - // initialize + /* initialize */ start1=s1; if(length1==-1) { limit1=NULL; @@ -4112,13 +4113,15 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, level1=level2=0; c1=c2=-1; - // comparison loop + /* comparison loop */ for(;;) { - // here a code unit value of -1 means "get another code unit" - // below it will mean "this source is finished" + /* + * here a code unit value of -1 means "get another code unit" + * below it will mean "this source is finished" + */ if(c1<0) { - // get next code unit from string 1, post-increment + /* get next code unit from string 1, post-increment */ for(;;) { if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { if(level1==0) { @@ -4130,7 +4133,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, break; } - // reached end of level buffer, pop one level + /* reached end of level buffer, pop one level */ do { --level1; start1=stack1[level1].start; @@ -4141,7 +4144,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, } if(c2<0) { - // get next code unit from string 2, post-increment + /* get next code unit from string 2, post-increment */ for(;;) { if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { if(level2==0) { @@ -4153,7 +4156,7 
@@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, break; } - // reached end of level buffer, pop one level + /* reached end of level buffer, pop one level */ do { --level2; start2=stack2[level2].start; @@ -4163,83 +4166,89 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, } } - // compare c1 and c2 - // either variable c1, c2 is -1 only if the corresponding string is finished + /* + * compare c1 and c2 + * either variable c1, c2 is -1 only if the corresponding string is finished + */ if(c1==c2) { if(c1<0) { - return 0; // c1==c2==-1 indicating end of strings + return 0; /* c1==c2==-1 indicating end of strings */ } - c1=c2=-1; // make us fetch new code units + c1=c2=-1; /* make us fetch new code units */ continue; } else if(c1<0) { - return -1; // string 1 ends before string 2 + return -1; /* string 1 ends before string 2 */ } else if(c2<0) { - return 1; // string 2 ends before string 1 + return 1; /* string 2 ends before string 1 */ } - // c1!=c2 && c1>=0 && c2>=0 + /* c1!=c2 && c1>=0 && c2>=0 */ - // get complete code points for c1, c2 for lookups if either is a surrogate + /* get complete code points for c1, c2 for lookups if either is a surrogate */ cp1=c1; - if(UTF_IS_SURROGATE(c1)) { + if(U_IS_SURROGATE(c1)) { UChar c; - if(UTF_IS_SURROGATE_FIRST(c1)) { - if(s1!=limit1 && UTF_IS_TRAIL(c=*s1)) { - // advance ++s1; only below if cp1 decomposes/case-folds - cp1=UTF16_GET_PAIR_VALUE(c1, c); + if(U_IS_SURROGATE_LEAD(c1)) { + if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { + /* advance ++s1; only below if cp1 decomposes/case-folds */ + cp1=U16_GET_SUPPLEMENTARY(c1, c); } } else /* isTrail(c1) */ { - if(start1<=(s1-2) && UTF_IS_LEAD(c=*(s1-2))) { - cp1=UTF16_GET_PAIR_VALUE(c, c1); + if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { + cp1=U16_GET_SUPPLEMENTARY(c, c1); } } } cp2=c2; - if(UTF_IS_SURROGATE(c2)) { + if(U_IS_SURROGATE(c2)) { UChar c; - if(UTF_IS_SURROGATE_FIRST(c2)) { - if(s2!=limit2 && UTF_IS_TRAIL(c=*s2)) { - // advance ++s2; only below if cp2 
decomposes/case-folds - cp2=UTF16_GET_PAIR_VALUE(c2, c); + if(U_IS_SURROGATE_LEAD(c2)) { + if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { + /* advance ++s2; only below if cp2 decomposes/case-folds */ + cp2=U16_GET_SUPPLEMENTARY(c2, c); } } else /* isTrail(c2) */ { - if(start2<=(s2-2) && UTF_IS_LEAD(c=*(s2-2))) { - cp2=UTF16_GET_PAIR_VALUE(c, c2); + if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { + cp2=U16_GET_SUPPLEMENTARY(c, c2); } } } - // go down one level for each string - // continue with the main loop as soon as there is a real change + /* + * go down one level for each string + * continue with the main loop as soon as there is a real change + */ if( level1==0 && (options&U_COMPARE_IGNORE_CASE) && (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 ) { - // cp1 case-folds to the code point "length" or to p[length] - if(UTF_IS_SURROGATE(c1)) { - if(UTF_IS_SURROGATE_FIRST(c1)) { - // advance beyond source surrogate pair if it case-folds + /* cp1 case-folds to the code point "length" or to p[length] */ + if(U_IS_SURROGATE(c1)) { + if(U_IS_SURROGATE_LEAD(c1)) { + /* advance beyond source surrogate pair if it case-folds */ ++s1; } else /* isTrail(c1) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point + /* + * we got a supplementary code point when hitting its trail surrogate, + * therefore the lead surrogate must have been the same as in the other string; + * compare this decomposition with the lead surrogate in the other string + * remember that this simulates bulk text replacement: + * the decomposition would replace the entire code point + */ --s2; c2=*(s2-1); } } - // push current level pointers + /* push current level pointers */ stack1[0].start=start1; 
stack1[0].s=s1; stack1[0].limit=limit1; ++level1; - // copy the folding result to fold1[] + /* copy the folding result to fold1[] */ if(length<=UCASE_MAX_STRING_LENGTH) { u_memcpy(fold1, p, length); } else { @@ -4248,11 +4257,11 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, length=i; } - // set next level pointers to case folding + /* set next level pointers to case folding */ start1=s1=fold1; limit1=fold1+length; - // get ready to read from decomposition, continue with loop + /* get ready to read from decomposition, continue with loop */ c1=-1; continue; } @@ -4260,29 +4269,31 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, if( level2==0 && (options&U_COMPARE_IGNORE_CASE) && (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 ) { - // cp2 case-folds to the code point "length" or to p[length] - if(UTF_IS_SURROGATE(c2)) { - if(UTF_IS_SURROGATE_FIRST(c2)) { - // advance beyond source surrogate pair if it case-folds + /* cp2 case-folds to the code point "length" or to p[length] */ + if(U_IS_SURROGATE(c2)) { + if(U_IS_SURROGATE_LEAD(c2)) { + /* advance beyond source surrogate pair if it case-folds */ ++s2; } else /* isTrail(c2) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point + /* + * we got a supplementary code point when hitting its trail surrogate, + * therefore the lead surrogate must have been the same as in the other string; + * compare this decomposition with the lead surrogate in the other string + * remember that this simulates bulk text replacement: + * the decomposition would replace the entire code point + */ --s1; c1=*(s1-1); } } - // push current level pointers + /* push current level pointers */ stack2[0].start=start2; 
stack2[0].s=s2; stack2[0].limit=limit2; ++level2; - // copy the folding result to fold2[] + /* copy the folding result to fold2[] */ if(length<=UCASE_MAX_STRING_LENGTH) { u_memcpy(fold2, p, length); } else { @@ -4291,11 +4302,11 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, length=i; } - // set next level pointers to case folding + /* set next level pointers to case folding */ start2=s2=fold2; limit2=fold2+length; - // get ready to read from decomposition, continue with loop + /* get ready to read from decomposition, continue with loop */ c2=-1; continue; } @@ -4303,38 +4314,40 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, if( level1<2 && (options&_COMPARE_EQUIV) && 0!=(p=_decompose((UChar32)cp1, decomp1, length)) ) { - // cp1 decomposes into p[length] - if(UTF_IS_SURROGATE(c1)) { - if(UTF_IS_SURROGATE_FIRST(c1)) { - // advance beyond source surrogate pair if it decomposes + /* cp1 decomposes into p[length] */ + if(U_IS_SURROGATE(c1)) { + if(U_IS_SURROGATE_LEAD(c1)) { + /* advance beyond source surrogate pair if it decomposes */ ++s1; } else /* isTrail(c1) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point + /* + * we got a supplementary code point when hitting its trail surrogate, + * therefore the lead surrogate must have been the same as in the other string; + * compare this decomposition with the lead surrogate in the other string + * remember that this simulates bulk text replacement: + * the decomposition would replace the entire code point + */ --s2; c2=*(s2-1); } } - // push current level pointers + /* push current level pointers */ stack1[level1].start=start1; stack1[level1].s=s1; stack1[level1].limit=limit1; ++level1; - // set empty 
intermediate level if skipped + /* set empty intermediate level if skipped */ if(level1<2) { stack1[level1++].start=NULL; } - // set next level pointers to decomposition + /* set next level pointers to decomposition */ start1=s1=p; limit1=p+length; - // get ready to read from decomposition, continue with loop + /* get ready to read from decomposition, continue with loop */ c1=-1; continue; } @@ -4342,62 +4355,66 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, if( level2<2 && (options&_COMPARE_EQUIV) && 0!=(p=_decompose((UChar32)cp2, decomp2, length)) ) { - // cp2 decomposes into p[length] - if(UTF_IS_SURROGATE(c2)) { - if(UTF_IS_SURROGATE_FIRST(c2)) { - // advance beyond source surrogate pair if it decomposes + /* cp2 decomposes into p[length] */ + if(U_IS_SURROGATE(c2)) { + if(U_IS_SURROGATE_LEAD(c2)) { + /* advance beyond source surrogate pair if it decomposes */ ++s2; } else /* isTrail(c2) */ { - // we got a supplementary code point when hitting its trail surrogate, - // therefore the lead surrogate must have been the same as in the other string; - // compare this decomposition with the lead surrogate in the other string - // remember that this simulates bulk text replacement: - // the decomposition would replace the entire code point + /* + * we got a supplementary code point when hitting its trail surrogate, + * therefore the lead surrogate must have been the same as in the other string; + * compare this decomposition with the lead surrogate in the other string + * remember that this simulates bulk text replacement: + * the decomposition would replace the entire code point + */ --s1; c1=*(s1-1); } } - // push current level pointers + /* push current level pointers */ stack2[level2].start=start2; stack2[level2].s=s2; stack2[level2].limit=limit2; ++level2; - // set empty intermediate level if skipped + /* set empty intermediate level if skipped */ if(level2<2) { stack2[level2++].start=NULL; } - // set next level pointers to decomposition + /* set next 
level pointers to decomposition */ start2=s2=p; limit2=p+length; - // get ready to read from decomposition, continue with loop + /* get ready to read from decomposition, continue with loop */ c2=-1; continue; } - // no decomposition/case folding, max level for both sides: - // return difference result - - // code point order comparison must not just return cp1-cp2 - // because when single surrogates are present then the surrogate pairs - // that formed cp1 and cp2 may be from different string indexes - - // example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units - // c1=d800 cp1=10001 c2=dc00 cp2=10000 - // cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } - - // therefore, use same fix-up as in ustring.c/uprv_strCompare() - // except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ - // so we have slightly different pointer/start/limit comparisons here + /* + * no decomposition/case folding, max level for both sides: + * return difference result + * + * code point order comparison must not just return cp1-cp2 + * because when single surrogates are present then the surrogate pairs + * that formed cp1 and cp2 may be from different string indexes + * + * example: { d800 d800 dc01 } vs. 
{ d800 dc00 }, compare at second code units + * c1=d800 cp1=10001 c2=dc00 cp2=10000 + * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } + * + * therefore, use same fix-up as in ustring.c/uprv_strCompare() + * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ + * so we have slightly different pointer/start/limit comparisons here + */ if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ if( - (c1<=0xdbff && s1!=limit1 && UTF_IS_TRAIL(*s1)) || - (UTF_IS_TRAIL(c1) && start1!=(s1-1) && UTF_IS_LEAD(*(s1-2))) + (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || + (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) ) { /* part of a surrogate pair, leave >=d800 */ } else { @@ -4406,8 +4423,8 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1, } if( - (c2<=0xdbff && s2!=limit2 && UTF_IS_TRAIL(*s2)) || - (UTF_IS_TRAIL(c2) && start2!=(s2-1) && UTF_IS_LEAD(*(s2-2))) + (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || + (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) ) { /* part of a surrogate pair, leave >=d800 */ } else { diff --git a/icu4c/source/common/unormimp.h b/icu4c/source/common/unormimp.h index 180fb1ffbe4..acc58d8618b 100644 --- a/icu4c/source/common/unormimp.h +++ b/icu4c/source/common/unormimp.h @@ -259,19 +259,6 @@ unorm_compose(UChar *dest, int32_t destCapacity, */ #define _STRNCMP_STYLE 0x1000 -/** - * Internal API, used by u_strcasecmp() etc. - * Compare strings for canonical equivalence (optional), - * case-insensitively (optional), - * in code point order or code unit order. 
- * @internal - */ -U_CAPI int32_t U_EXPORT2 -unorm_cmpEquivFold(const UChar *s1, int32_t length1, - const UChar *s2, int32_t length2, - uint32_t options, - UErrorCode *pErrorCode); - #if !UCONFIG_NO_NORMALIZATION /** diff --git a/icu4c/source/common/ustr_imp.h b/icu4c/source/common/ustr_imp.h index c3d87b70655..3b162e8ab27 100644 --- a/icu4c/source/common/ustr_imp.h +++ b/icu4c/source/common/ustr_imp.h @@ -37,6 +37,18 @@ uprv_strCompare(const UChar *s1, int32_t length1, const UChar *s2, int32_t length2, UBool strncmpStyle, UBool codePointOrder); +/** + * Internal API, used by u_strcasecmp() etc. + * Compare strings case-insensitively, + * in code point order or code unit order. + * @internal + */ +U_CFUNC int32_t +u_strcmpFold(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + uint32_t options, + UErrorCode *pErrorCode); + /** * Are the Unicode properties loaded? * This must be used before internal functions are called that do diff --git a/icu4c/source/common/ustrcase.c b/icu4c/source/common/ustrcase.c index 0c04ff97470..d7531f2d3c4 100644 --- a/icu4c/source/common/ustrcase.c +++ b/icu4c/source/common/ustrcase.c @@ -504,7 +504,322 @@ u_strFoldCase(UChar *dest, int32_t destCapacity, FOLD_CASE, pErrorCode); } -/* case-insensitive string comparisons */ +/* case-insensitive string comparisons -------------------------------------- */ + +/* + * This function is a copy of unorm_cmpEquivFold() minus the parts for + * canonical equivalence. + * Keep the functions in sync, and see there for how this works. + * The duplication is for modularization: + * It makes caseless (but not canonical caseless) matches independent of + * the normalization code. 
+ */ + +/* stack element for previous-level source/decomposition pointers */ +struct CmpEquivLevel { + const UChar *start, *s, *limit; +}; +typedef struct CmpEquivLevel CmpEquivLevel; + +/* internal function */ +U_CFUNC int32_t +u_strcmpFold(const UChar *s1, int32_t length1, + const UChar *s2, int32_t length2, + uint32_t options, + UErrorCode *pErrorCode) { + UCaseProps *csp; + + /* current-level start/limit - s1/s2 as current */ + const UChar *start1, *start2, *limit1, *limit2; + + /* case folding variables */ + const UChar *p; + int32_t length; + + /* stacks of previous-level start/current/limit */ + CmpEquivLevel stack1[2], stack2[2]; + + /* case folding buffers, only use current-level start/limit */ + UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; + + /* track which is the current level per string */ + int32_t level1, level2; + + /* current code units, and code points for lookups */ + UChar32 c1, c2, cp1, cp2; + + /* no argument error checking because this itself is not an API */ + + /* + * assume that at least the option U_COMPARE_IGNORE_CASE is set + * otherwise this function would have to behave exactly as uprv_strCompare() + */ + csp=ucase_getSingleton(pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return 0; + } + + /* initialize */ + start1=s1; + if(length1==-1) { + limit1=NULL; + } else { + limit1=s1+length1; + } + + start2=s2; + if(length2==-1) { + limit2=NULL; + } else { + limit2=s2+length2; + } + + level1=level2=0; + c1=c2=-1; + + /* comparison loop */ + for(;;) { + /* + * here a code unit value of -1 means "get another code unit" + * below it will mean "this source is finished" + */ + + if(c1<0) { + /* get next code unit from string 1, post-increment */ + for(;;) { + if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { + if(level1==0) { + c1=-1; + break; + } + } else { + ++s1; + break; + } + + /* reached end of level buffer, pop one level */ + do { + --level1; + start1=stack1[level1].start; + } 
while(start1==NULL); + s1=stack1[level1].s; + limit1=stack1[level1].limit; + } + } + + if(c2<0) { + /* get next code unit from string 2, post-increment */ + for(;;) { + if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { + if(level2==0) { + c2=-1; + break; + } + } else { + ++s2; + break; + } + + /* reached end of level buffer, pop one level */ + do { + --level2; + start2=stack2[level2].start; + } while(start2==NULL); + s2=stack2[level2].s; + limit2=stack2[level2].limit; + } + } + + /* + * compare c1 and c2 + * either variable c1, c2 is -1 only if the corresponding string is finished + */ + if(c1==c2) { + if(c1<0) { + return 0; /* c1==c2==-1 indicating end of strings */ + } + c1=c2=-1; /* make us fetch new code units */ + continue; + } else if(c1<0) { + return -1; /* string 1 ends before string 2 */ + } else if(c2<0) { + return 1; /* string 2 ends before string 1 */ + } + /* c1!=c2 && c1>=0 && c2>=0 */ + + /* get complete code points for c1, c2 for lookups if either is a surrogate */ + cp1=c1; + if(U_IS_SURROGATE(c1)) { + UChar c; + + if(U_IS_SURROGATE_LEAD(c1)) { + if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { + /* advance ++s1; only below if cp1 decomposes/case-folds */ + cp1=U16_GET_SUPPLEMENTARY(c1, c); + } + } else /* isTrail(c1) */ { + if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { + cp1=U16_GET_SUPPLEMENTARY(c, c1); + } + } + } + + cp2=c2; + if(U_IS_SURROGATE(c2)) { + UChar c; + + if(U_IS_SURROGATE_LEAD(c2)) { + if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { + /* advance ++s2; only below if cp2 decomposes/case-folds */ + cp2=U16_GET_SUPPLEMENTARY(c2, c); + } + } else /* isTrail(c2) */ { + if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { + cp2=U16_GET_SUPPLEMENTARY(c, c2); + } + } + } + + /* + * go down one level for each string + * continue with the main loop as soon as there is a real change + */ + + if( level1==0 && + (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 + ) { + /* cp1 case-folds to the code point "length" or to 
p[length] */ + if(U_IS_SURROGATE(c1)) { + if(U_IS_SURROGATE_LEAD(c1)) { + /* advance beyond source surrogate pair if it case-folds */ + ++s1; + } else /* isTrail(c1) */ { + /* + * we got a supplementary code point when hitting its trail surrogate, + * therefore the lead surrogate must have been the same as in the other string; + * compare this decomposition with the lead surrogate in the other string + * remember that this simulates bulk text replacement: + * the decomposition would replace the entire code point + */ + --s2; + c2=*(s2-1); + } + } + + /* push current level pointers */ + stack1[0].start=start1; + stack1[0].s=s1; + stack1[0].limit=limit1; + ++level1; + + /* copy the folding result to fold1[] */ + if(length<=UCASE_MAX_STRING_LENGTH) { + u_memcpy(fold1, p, length); + } else { + int32_t i=0; + U16_APPEND_UNSAFE(fold1, i, length); + length=i; + } + + /* set next level pointers to case folding */ + start1=s1=fold1; + limit1=fold1+length; + + /* get ready to read from decomposition, continue with loop */ + c1=-1; + continue; + } + + if( level2==0 && + (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 + ) { + /* cp2 case-folds to the code point "length" or to p[length] */ + if(U_IS_SURROGATE(c2)) { + if(U_IS_SURROGATE_LEAD(c2)) { + /* advance beyond source surrogate pair if it case-folds */ + ++s2; + } else /* isTrail(c2) */ { + /* + * we got a supplementary code point when hitting its trail surrogate, + * therefore the lead surrogate must have been the same as in the other string; + * compare this decomposition with the lead surrogate in the other string + * remember that this simulates bulk text replacement: + * the decomposition would replace the entire code point + */ + --s1; + c1=*(s1-1); + } + } + + /* push current level pointers */ + stack2[0].start=start2; + stack2[0].s=s2; + stack2[0].limit=limit2; + ++level2; + + /* copy the folding result to fold2[] */ + if(length<=UCASE_MAX_STRING_LENGTH) { + u_memcpy(fold2, p, length); + } else { + 
int32_t i=0; + U16_APPEND_UNSAFE(fold2, i, length); + length=i; + } + + /* set next level pointers to case folding */ + start2=s2=fold2; + limit2=fold2+length; + + /* get ready to read from decomposition, continue with loop */ + c2=-1; + continue; + } + + /* + * no decomposition/case folding, max level for both sides: + * return difference result + * + * code point order comparison must not just return cp1-cp2 + * because when single surrogates are present then the surrogate pairs + * that formed cp1 and cp2 may be from different string indexes + * + * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units + * c1=d800 cp1=10001 c2=dc00 cp2=10000 + * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } + * + * therefore, use same fix-up as in ustring.c/uprv_strCompare() + * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ + * so we have slightly different pointer/start/limit comparisons here + */ + + if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { + /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ + if( + (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || + (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make <d800 */ + c1-=0x2800; + } + + if( + (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || + (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) + ) { + /* part of a surrogate pair, leave >=d800 */ + } else { + /* BMP code point - may be surrogate code point - make