mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-1728 re-implement CodePointOrder compare functions to make them work with surrogate code points
X-SVN-Rev: 8612
This commit is contained in:
parent
63f6acf6d7
commit
65eb5971c0
2 changed files with 136 additions and 117 deletions
|
@ -633,41 +633,13 @@ UnicodeString::doCompareCodePointOrder(int32_t start,
|
|||
// pin indices to legal values
|
||||
pinIndices(start, length);
|
||||
|
||||
// get the correct pointer
|
||||
const UChar *chars = getArrayStart();
|
||||
|
||||
chars += start;
|
||||
srcChars += srcStart;
|
||||
|
||||
int32_t minLength;
|
||||
int8_t lengthResult;
|
||||
|
||||
// get the srcLength if necessary
|
||||
if(srcLength < 0) {
|
||||
srcLength = u_strlen(srcChars + srcStart);
|
||||
}
|
||||
|
||||
// are we comparing different lengths?
|
||||
if(length != srcLength) {
|
||||
if(length < srcLength) {
|
||||
minLength = length;
|
||||
lengthResult = -1;
|
||||
} else {
|
||||
minLength = srcLength;
|
||||
lengthResult = 1;
|
||||
}
|
||||
int32_t diff = u_strCompareCodePointOrder(fArray + start, length, srcChars + srcStart, srcLength, FALSE);
|
||||
/* translate the 32-bit result into an 8-bit one */
|
||||
if(diff!=0) {
|
||||
return (int8_t)(diff >> 15 | 1);
|
||||
} else {
|
||||
minLength = length;
|
||||
lengthResult = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(minLength > 0 && chars != srcChars) {
|
||||
int32_t diff = u_memcmpCodePointOrder(chars, srcChars, minLength);
|
||||
if(diff!=0) {
|
||||
return (int8_t)(diff >> 15 | 1);
|
||||
}
|
||||
}
|
||||
return lengthResult;
|
||||
}
|
||||
|
||||
int8_t
|
||||
|
|
|
@ -402,43 +402,148 @@ u_strcmp(const UChar *s1,
|
|||
return (int32_t)c1 - (int32_t)c2;
|
||||
}
|
||||
|
||||
/* rotate surrogates to the top to get code point order; assume c>=0xd800 */
|
||||
#define UTF16FIXUP(c) { \
|
||||
if ((c) >= 0xe000) { \
|
||||
(c) -= 0x800; \
|
||||
} else { \
|
||||
(c) += 0x2000; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
/* String compare in code point order - u_strcmp() compares in code unit order. */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
|
||||
U_CFUNC int32_t
|
||||
u_strCompareCodePointOrder(const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
UBool strncmpStyle) {
|
||||
const UChar *start1, *start2, *limit1, *limit2;
|
||||
UChar c1, c2;
|
||||
|
||||
/* setup for fix-up */
|
||||
start1=s1;
|
||||
start2=s2;
|
||||
|
||||
/* compare identical prefixes - they do not need to be fixed up */
|
||||
for(;;) {
|
||||
c1=*s1++;
|
||||
c2=*s2++;
|
||||
if (c1 != c2) {
|
||||
break;
|
||||
}
|
||||
if (c1 == 0) {
|
||||
if(length1<0 && length2<0) {
|
||||
/* strcmp style, both NUL-terminated */
|
||||
if(s1==s2) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
c1=*s1;
|
||||
c2=*s2;
|
||||
if(c1!=c2) {
|
||||
break;
|
||||
}
|
||||
if(c1==0) {
|
||||
return 0;
|
||||
}
|
||||
++s1;
|
||||
++s2;
|
||||
}
|
||||
|
||||
/* setup for fix-up */
|
||||
limit1=limit2=NULL;
|
||||
} else if(strncmpStyle) {
|
||||
/* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
|
||||
if(s1==s2) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
limit1=start1+length1;
|
||||
|
||||
for(;;) {
|
||||
/* both lengths are same, check only one limit */
|
||||
if(s1==limit1) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
c1=*s1;
|
||||
c2=*s2;
|
||||
if(c1!=c2) {
|
||||
break;
|
||||
}
|
||||
if(c1==0) {
|
||||
return 0;
|
||||
}
|
||||
++s1;
|
||||
++s2;
|
||||
}
|
||||
|
||||
/* setup for fix-up */
|
||||
limit2=start2+length1; /* use length1 here, too, to enforce assumption */
|
||||
} else {
|
||||
/* memcmp/UnicodeString style, both length-specified */
|
||||
int32_t lengthResult;
|
||||
|
||||
if(length1<0) {
|
||||
length1=u_strlen(s1);
|
||||
}
|
||||
if(length2<0) {
|
||||
length2=u_strlen(s2);
|
||||
}
|
||||
|
||||
/* limit1=start1+min(lenght1, length2) */
|
||||
if(length1<length2) {
|
||||
lengthResult=-1;
|
||||
limit1=start1+length1;
|
||||
} else if(length1==length2) {
|
||||
lengthResult=0;
|
||||
limit1=start1+length1;
|
||||
} else /* length1>length2 */ {
|
||||
lengthResult=1;
|
||||
limit1=start1+length2;
|
||||
}
|
||||
|
||||
if(s1==s2) {
|
||||
return lengthResult;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
/* check pseudo-limit */
|
||||
if(s1==limit1) {
|
||||
return lengthResult;
|
||||
}
|
||||
|
||||
c1=*s1;
|
||||
c2=*s2;
|
||||
if(c1!=c2) {
|
||||
break;
|
||||
}
|
||||
++s1;
|
||||
++s2;
|
||||
}
|
||||
|
||||
/* setup for fix-up */
|
||||
limit1=start1+length1;
|
||||
limit2=start2+length2;
|
||||
}
|
||||
|
||||
/* if both values are in or above the surrogate range, Fix them up. */
|
||||
if (c1 >= 0xD800 && c2 >= 0xD800) {
|
||||
UTF16FIXUP(c1);
|
||||
UTF16FIXUP(c2);
|
||||
/* if both values are in or above the surrogate range, fix them up */
|
||||
if(c1>=0xd800 && c2>=0xd800) {
|
||||
/* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
|
||||
if(
|
||||
(c1<=0xdbff && (++s1)!=limit1 && UTF_IS_TRAIL(*s1)) ||
|
||||
(UTF_IS_TRAIL(c1) && start1!=s1 && UTF_IS_LEAD(*(s1-1)))
|
||||
) {
|
||||
/* part of a surrogate pair, leave >=d800 */
|
||||
} else {
|
||||
/* BMP code point - may be surrogate code point - make <d800 */
|
||||
c1-=0x2800;
|
||||
}
|
||||
|
||||
if(
|
||||
(c2<=0xdbff && (++s2)!=limit2 && UTF_IS_TRAIL(*s2)) ||
|
||||
(UTF_IS_TRAIL(c2) && start2!=s2 && UTF_IS_LEAD(*(s2-1)))
|
||||
) {
|
||||
/* part of a surrogate pair, leave >=d800 */
|
||||
} else {
|
||||
/* BMP code point - may be surrogate code point - make <d800 */
|
||||
c2-=0x2800;
|
||||
}
|
||||
}
|
||||
|
||||
/* now c1 and c2 are in UTF-32-compatible order */
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
}
|
||||
|
||||
/* String compare in code point order - u_strcmp() compares in code unit order. */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) {
|
||||
return u_strCompareCodePointOrder(s1, -1, s2, -1, FALSE);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_strncmp(const UChar *s1,
|
||||
const UChar *s2,
|
||||
|
@ -461,35 +566,7 @@ u_strncmp(const UChar *s1,
|
|||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) {
|
||||
UChar c1, c2;
|
||||
|
||||
if(n<=0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* compare identical prefixes - they do not need to be fixed up */
|
||||
for(;;) {
|
||||
c1=*s1;
|
||||
c2=*s2;
|
||||
if(c1==c2) {
|
||||
if(c1==0 || --n==0) {
|
||||
return 0;
|
||||
}
|
||||
++s1;
|
||||
++s2;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* c1!=c2, fix up each one if they're both in or above the surrogate range, then compare them */
|
||||
if (c1 >= 0xD800 && c2 >= 0xD800) {
|
||||
UTF16FIXUP(c1);
|
||||
UTF16FIXUP(c2);
|
||||
}
|
||||
|
||||
/* now c1 and c2 are in UTF-32-compatible order */
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
return u_strCompareCodePointOrder(s1, n, s2, n, TRUE);
|
||||
}
|
||||
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
|
@ -617,37 +694,7 @@ u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) {
|
|||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) {
|
||||
const UChar *limit;
|
||||
UChar c1, c2;
|
||||
|
||||
if(count<=0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
limit=s1+count;
|
||||
|
||||
/* compare identical prefixes - they do not need to be fixed up */
|
||||
for(;;) {
|
||||
c1=*s1;
|
||||
c2=*s2;
|
||||
if(c1!=c2) {
|
||||
break;
|
||||
}
|
||||
++s1;
|
||||
++s2;
|
||||
if(s1==limit) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* c1!=c2, fix up each one if they're both in or above the surrogate range, then compare them */
|
||||
if (c1 >= 0xD800 && c2 >= 0xD800) {
|
||||
UTF16FIXUP(c1);
|
||||
UTF16FIXUP(c2);
|
||||
}
|
||||
|
||||
/* now c1 and c2 are in UTF-32-compatible order */
|
||||
return (int32_t)c1-(int32_t)c2;
|
||||
return u_strCompareCodePointOrder(s1, count, s2, count, FALSE);
|
||||
}
|
||||
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
|
|
Loading…
Add table
Reference in a new issue