ICU-1728 re-implement CodePointOrder compare functions to make them work with surrogate code points

X-SVN-Rev: 8612
This commit is contained in:
Markus Scherer 2002-05-13 23:03:09 +00:00
parent 63f6acf6d7
commit 65eb5971c0
2 changed files with 136 additions and 117 deletions

View file

@ -633,41 +633,13 @@ UnicodeString::doCompareCodePointOrder(int32_t start,
// pin indices to legal values
pinIndices(start, length);
// get the correct pointer
const UChar *chars = getArrayStart();
chars += start;
srcChars += srcStart;
int32_t minLength;
int8_t lengthResult;
// get the srcLength if necessary
if(srcLength < 0) {
srcLength = u_strlen(srcChars + srcStart);
}
// are we comparing different lengths?
if(length != srcLength) {
if(length < srcLength) {
minLength = length;
lengthResult = -1;
} else {
minLength = srcLength;
lengthResult = 1;
}
int32_t diff = u_strCompareCodePointOrder(fArray + start, length, srcChars + srcStart, srcLength, FALSE);
/* translate the 32-bit result into an 8-bit one */
if(diff!=0) {
return (int8_t)(diff >> 15 | 1);
} else {
minLength = length;
lengthResult = 0;
return 0;
}
if(minLength > 0 && chars != srcChars) {
int32_t diff = u_memcmpCodePointOrder(chars, srcChars, minLength);
if(diff!=0) {
return (int8_t)(diff >> 15 | 1);
}
}
return lengthResult;
}
int8_t

View file

@ -402,43 +402,148 @@ u_strcmp(const UChar *s1,
return (int32_t)c1 - (int32_t)c2;
}
/*
 * Rotate a UTF-16 code unit so that surrogates (0xd800..0xdfff) move to the
 * top of the 16-bit range: values >=0xe000 are shifted down by 0x800 and
 * values below that are shifted up by 0x2000.
 *
 * Precondition: c>=0xd800 (callers check this before using the macro).
 * c must be a modifiable lvalue without side effects; it is evaluated twice.
 *
 * Wrapped in do { } while (0) so that "UTF16FIXUP(c);" is exactly one
 * statement and can be used safely as the body of an if/else.
 */
#define UTF16FIXUP(c) do { \
    if ((c) >= 0xe000) { \
        (c) -= 0x800; \
    } else { \
        (c) += 0x2000; \
    } \
} while (0)
/* String compare in code point order - u_strcmp() compares in code unit order. */
U_CAPI int32_t U_EXPORT2
u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
U_CFUNC int32_t
u_strCompareCodePointOrder(const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
UBool strncmpStyle) {
const UChar *start1, *start2, *limit1, *limit2;
UChar c1, c2;
/* setup for fix-up */
start1=s1;
start2=s2;
/* compare identical prefixes - they do not need to be fixed up */
for(;;) {
c1=*s1++;
c2=*s2++;
if (c1 != c2) {
break;
}
if (c1 == 0) {
if(length1<0 && length2<0) {
/* strcmp style, both NUL-terminated */
if(s1==s2) {
return 0;
}
for(;;) {
c1=*s1;
c2=*s2;
if(c1!=c2) {
break;
}
if(c1==0) {
return 0;
}
++s1;
++s2;
}
/* setup for fix-up */
limit1=limit2=NULL;
} else if(strncmpStyle) {
/* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
if(s1==s2) {
return 0;
}
limit1=start1+length1;
for(;;) {
/* both lengths are same, check only one limit */
if(s1==limit1) {
return 0;
}
c1=*s1;
c2=*s2;
if(c1!=c2) {
break;
}
if(c1==0) {
return 0;
}
++s1;
++s2;
}
/* setup for fix-up */
limit2=start2+length1; /* use length1 here, too, to enforce assumption */
} else {
/* memcmp/UnicodeString style, both length-specified */
int32_t lengthResult;
if(length1<0) {
length1=u_strlen(s1);
}
if(length2<0) {
length2=u_strlen(s2);
}
/* limit1=start1+min(lenght1, length2) */
if(length1<length2) {
lengthResult=-1;
limit1=start1+length1;
} else if(length1==length2) {
lengthResult=0;
limit1=start1+length1;
} else /* length1>length2 */ {
lengthResult=1;
limit1=start1+length2;
}
if(s1==s2) {
return lengthResult;
}
for(;;) {
/* check pseudo-limit */
if(s1==limit1) {
return lengthResult;
}
c1=*s1;
c2=*s2;
if(c1!=c2) {
break;
}
++s1;
++s2;
}
/* setup for fix-up */
limit1=start1+length1;
limit2=start2+length2;
}
/* if both values are in or above the surrogate range, Fix them up. */
if (c1 >= 0xD800 && c2 >= 0xD800) {
UTF16FIXUP(c1);
UTF16FIXUP(c2);
/* if both values are in or above the surrogate range, fix them up */
if(c1>=0xd800 && c2>=0xd800) {
/* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
if(
(c1<=0xdbff && (++s1)!=limit1 && UTF_IS_TRAIL(*s1)) ||
(UTF_IS_TRAIL(c1) && start1!=s1 && UTF_IS_LEAD(*(s1-1)))
) {
/* part of a surrogate pair, leave >=d800 */
} else {
/* BMP code point - may be surrogate code point - make <d800 */
c1-=0x2800;
}
if(
(c2<=0xdbff && (++s2)!=limit2 && UTF_IS_TRAIL(*s2)) ||
(UTF_IS_TRAIL(c2) && start2!=s2 && UTF_IS_LEAD(*(s2-1)))
) {
/* part of a surrogate pair, leave >=d800 */
} else {
/* BMP code point - may be surrogate code point - make <d800 */
c2-=0x2800;
}
}
/* now c1 and c2 are in UTF-32-compatible order */
return (int32_t)c1-(int32_t)c2;
}
/*
 * Compare two NUL-terminated strings in code point order.
 * (u_strcmp() compares in code unit order instead.)
 *
 * Thin wrapper: passing -1 for both lengths makes
 * u_strCompareCodePointOrder() treat both strings as NUL-terminated.
 */
U_CAPI int32_t U_EXPORT2
u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
    int32_t result;

    result=u_strCompareCodePointOrder(s1, -1, s2, -1, FALSE);
    return result;
}
U_CAPI int32_t U_EXPORT2
u_strncmp(const UChar *s1,
const UChar *s2,
@ -461,35 +566,7 @@ u_strncmp(const UChar *s1,
/*
 * Compare up to n code units of two strings in code point order, stopping
 * early at a NUL (strncmp style).
 *
 * The comparison itself is delegated to u_strCompareCodePointOrder() so that
 * surrogate code points are ordered the same way as in the other
 * CodePointOrder functions.
 *
 * @param s1 first string
 * @param s2 second string
 * @param n  maximum number of code units to compare
 * @return 0 if n<=0 or the compared ranges are equal; otherwise a negative or
 *         positive value as returned by u_strCompareCodePointOrder()
 */
U_CAPI int32_t U_EXPORT2
u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
    /*
     * Nothing to compare. This also keeps a negative n from being passed on,
     * where two negative lengths would select the NUL-terminated mode.
     */
    if(n<=0) {
        return 0;
    }

    return u_strCompareCodePointOrder(s1, n, s2, n, TRUE);
}
U_CAPI UChar* U_EXPORT2
@ -617,37 +694,7 @@ u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
/*
 * Compare exactly count code units of two buffers in code point order
 * (memcmp style; NULs are not treated as terminators).
 *
 * The comparison itself is delegated to u_strCompareCodePointOrder() so that
 * surrogate code points are ordered the same way as in the other
 * CodePointOrder functions.
 *
 * @param s1    first buffer
 * @param s2    second buffer
 * @param count number of code units to compare
 * @return 0 if count<=0 or the buffers are equal; otherwise a negative or
 *         positive value as returned by u_strCompareCodePointOrder()
 */
U_CAPI int32_t U_EXPORT2
u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
    /*
     * Nothing to compare. This also keeps a negative count from being passed
     * on, where two negative lengths would select the NUL-terminated mode.
     */
    if(count<=0) {
        return 0;
    }

    return u_strCompareCodePointOrder(s1, count, s2, count, FALSE);
}
U_CAPI UChar * U_EXPORT2