mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 15:05:53 +00:00
ICU-9567 Merged ucol_strcollUTF8 to trunk.
X-SVN-Rev: 32534
This commit is contained in:
parent
f7cefb19f1
commit
f82276faf1
5 changed files with 927 additions and 66 deletions
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2006, International Business Machines
|
||||
* Copyright (C) 2003-2012, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -86,7 +86,8 @@ typedef enum UTraceFunctionNumber {
|
|||
UTRACE_UCOL_NEXTSORTKEYPART,
|
||||
UTRACE_UCOL_STRCOLLITER,
|
||||
UTRACE_UCOL_OPEN_FROM_SHORT_STRING,
|
||||
UTRACE_COLLATION_LIMIT
|
||||
UTRACE_COLLATION_LIMIT,
|
||||
UTRACE_UCOL_STRCOLLUTF8
|
||||
} UTraceFunctionNumber;
|
||||
|
||||
/**
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "unicode/unorm.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf8.h"
|
||||
|
||||
#include "ucol_imp.h"
|
||||
#include "bocsu.h"
|
||||
|
@ -53,10 +54,11 @@ U_NAMESPACE_USE
|
|||
|
||||
#define ZERO_CC_LIMIT_ 0xC0
|
||||
|
||||
// This is static pointer to the NFC implementation instance.
|
||||
// it is always the same between calls to u_cleanup
|
||||
// These are static pointers to the NFC/NFD implementation instance.
|
||||
// Each of them is always the same between calls to u_cleanup
|
||||
// and therefore writing to it is not synchronized.
|
||||
// It is cleaned in ucol_cleanup
|
||||
// They are cleaned in ucol_cleanup
|
||||
static const Normalizer2 *g_nfd = NULL;
|
||||
static const Normalizer2Impl *g_nfcImpl = NULL;
|
||||
|
||||
// These are values from UCA required for
|
||||
|
@ -71,6 +73,7 @@ U_CDECL_BEGIN
|
|||
/*
 * Cleanup hook for this file (it is the callback handed to
 * ucln_i18n_registerCleanup() by the lazy initializers).
 * Drops the cached normalizer pointers so they are fetched again on next use.
 * Always reports success.
 */
static UBool U_CALLCONV
ucol_cleanup(void)
{
    g_nfcImpl = NULL;
    g_nfd = NULL;
    return TRUE;
}
|
||||
|
@ -82,6 +85,18 @@ _getFoldingOffset(uint32_t data) {
|
|||
|
||||
U_CDECL_END
|
||||
|
||||
static inline
|
||||
UBool initializeNFD(UErrorCode *status) {
|
||||
if (g_nfd != NULL) {
|
||||
return TRUE;
|
||||
} else {
|
||||
// The result is constant, until the library is reloaded.
|
||||
g_nfd = Normalizer2Factory::getNFDInstance(*status);
|
||||
ucln_i18n_registerCleanup(UCLN_I18N_UCOL, ucol_cleanup);
|
||||
return U_SUCCESS(*status);
|
||||
}
|
||||
}
|
||||
|
||||
// init FCD data
|
||||
static inline
|
||||
UBool initializeFCD(UErrorCode *status) {
|
||||
|
@ -121,7 +136,11 @@ inline void IInit_collIterate(const UCollator *collator, const UChar *sourceStri
|
|||
(s)->offsetReturn = (s)->offsetStore = NULL;
|
||||
(s)->offsetRepeatCount = (s)->offsetRepeatValue = 0;
|
||||
(s)->coll = (collator);
|
||||
(s)->nfd = Normalizer2Factory::getNFDInstance(*status);
|
||||
if (initializeNFD(status)) {
|
||||
(s)->nfd = g_nfd;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
(s)->fcdPosition = 0;
|
||||
if(collator->normalizationMode == UCOL_ON) {
|
||||
(s)->flags |= UCOL_ITER_NORM;
|
||||
|
@ -8035,6 +8054,573 @@ endOfSecLoop:
|
|||
return UCOL_EQUAL;
|
||||
}
|
||||
|
||||
/*
|
||||
Slightly modified version of U8_NEXT macro defined in utf8.h. U8_NEXT requires
|
||||
the length of UTF-8 string. This version assumes that the UTF-8 string is null
|
||||
terminated and does not require the length as input.
|
||||
|
||||
Note: ucol_strcollUTF8 supports null terminated input. Calculating length of
|
||||
null terminated input string takes extra amount of CPU cycles.
|
||||
*/
|
||||
/* Smallest code point that is legal for a sequence with N trail bytes (N = index). */
static const UChar32
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };

#define UTF8_ERROR_VALUE_1 0x15
#define UTF8_ERROR_VALUE_2 0x9f
#define UTF_ERROR_VALUE 0xffff

/* Error results for strict>=0, indexed by the number of trail bytes skipped. */
static const UChar32
utf8_errorValue[6]={
    UTF8_ERROR_VALUE_1, UTF8_ERROR_VALUE_2, UTF_ERROR_VALUE, 0x10ffff,
    0x3ffffff, 0x7fffffff
};

/*
 * Decodes the remainder of a multi-byte UTF-8 sequence from a NUL-terminated
 * byte string.
 *
 * s       the byte string
 * pi      in: index of the first byte after the lead byte;
 *         out: index after the bytes that were consumed
 * c       the lead byte that the caller has already read
 * strict  error policy: with strict>=0 errors yield a utf8_errorValue[] entry,
 *         with a negative value they yield U_SENTINEL; -2 additionally lets
 *         surrogate code points through; a positive value also rejects
 *         Unicode noncharacters.
 *
 * A NUL byte where a trail byte is expected makes the sequence illegal and is
 * never consumed, so the scan cannot run past the terminator.
 */
static
UChar32 utf8_nextCharSafeBodyNullTerm(const uint8_t *s, int32_t *pi, UChar32 c, UBool strict) {
    int32_t pos=*pi;
    uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    U_ASSERT(count <= 5); /* U8_COUNT_TRAIL_BYTES returns value 0...5 */

    if (c == 0) {
        /* nothing to decode; the index is handed back as it came in */
        *pi=pos;
        return c;
    }

    uint8_t trail;
    uint8_t illegal=0;

    U8_MASK_LEAD_BYTE((c), count);
    /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    switch(count) {
    case 5:
    case 4:
        /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
        illegal=1;
        break;
    case 3:
        trail=s[pos];
        if (trail==0) {
            illegal=1;
            break;
        }
        c=(c<<6)|(trail&0x3f);
        if(c>=0x110) {
            /* code point>0x10ffff, outside Unicode */
            illegal=1;
            break;
        }
        illegal|=(trail&0xc0)^0x80;
        ++pos;
        /* fall through */
    case 2:
        trail=s[pos];
        if (trail==0) {
            illegal=1;
            break;
        }
        c=(c<<6)|(trail&0x3f);
        illegal|=(trail&0xc0)^0x80;
        ++pos;
        /* fall through */
    case 1:
        trail=s[pos];
        if (trail==0) {
            illegal=1;
            break;
        }
        c=(c<<6)|(trail&0x3f);
        illegal|=(trail&0xc0)^0x80;
        ++pos;
        break;
    case 0:
        /* the index is left untouched on this path */
        return (strict>=0) ? UTF8_ERROR_VALUE_1 : U_SENTINEL;
    /* no default branch to optimize switch() - all values are covered */
    }

    /*
     * Starting with Unicode 3.0.1, non-shortest forms are illegal.
     * Starting with Unicode 3.2, surrogate code points must not be
     * encoded in UTF-8, and there are no irregular sequences any more.
     *
     * "illegal" is nonzero when a trail byte did not have (b7..b6)==(10),
     * when a NUL was hit, or when count>=4; it is tested first so that
     * utf8_minLegal[] is only indexed with count<=3.
     */
    if(illegal || c<utf8_minLegal[count] || (U_IS_SURROGATE(c) && strict!=-2)) {
        /* error handling: skip at most "count" trail bytes of this sequence */
        uint8_t errorCount=count;
        pos=*pi;
        while(count>0 && U8_IS_TRAIL(s[pos])) {
            ++pos;
            --count;
        }
        c=(strict>=0) ? utf8_errorValue[errorCount-count] : U_SENTINEL;
    } else if(strict>0 && U_IS_UNICODE_NONCHAR(c)) {
        /* strict: forbid non-characters like U+fffe */
        c=utf8_errorValue[count];
    }

    *pi=pos;
    return c;
}
|
||||
|
||||
/*
 * U8_NEXT_NULLTERM(s, i, c): reads the code point that starts at s[i] in a
 * NUL-terminated UTF-8 string, stores it in c and advances i past it.
 *  - At the terminating NUL, c is set to 0 and i is NOT advanced.
 *  - For an ill-formed sequence c is set to U_SENTINEL (negative).
 *
 * The inline fast paths need no explicit NUL test: a NUL byte minus 0x80 is
 * 0x80 after the cast to uint8_t, which already fails the "<=0x3f" trail-byte
 * check, and the && chain stops before the following byte is read.
 * The temporaries avoid the reserved "__" identifier prefix.
 */
#define U8_NEXT_NULLTERM(s, i, c) { \
    (c)=(uint8_t)(s)[(i)]; \
    if((c)>=0x80) { \
        uint8_t u8nt_t1, u8nt_t2; \
        if( /* handle U+1000..U+CFFF inline */ \
            (0xe0<(c) && (c)<=0xec) && \
            (u8nt_t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f && \
            (u8nt_t2=(uint8_t)((s)[(i)+2]-0x80))<=0x3f \
        ) { \
            /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \
            (c)=(UChar)(((c)<<12)|(u8nt_t1<<6)|u8nt_t2); \
            (i)+=3; \
        } else if( /* handle U+0080..U+07FF inline */ \
            ((c)<0xe0 && (c)>=0xc2) && \
            (u8nt_t1=(uint8_t)((s)[(i)+1]-0x80))<=0x3f \
        ) { \
            (c)=(UChar)((((c)&0x1f)<<6)|u8nt_t1); \
            (i)+=2; \
        } else if(U8_IS_LEAD(c)) { \
            /* function call for "complicated" and error cases */ \
            ++(i); \
            (c)=utf8_nextCharSafeBodyNullTerm((const uint8_t *)(s), &(i), (c), -1); \
        } else { \
            (c)=U_SENTINEL; \
            ++(i); \
        } \
    } else { \
        if ((c)) { \
            ++(i); \
        } \
    } \
}

/*
 * U8_GET_NULLTERM(s, start, i, c): random-access read of the code point that
 * contains s[i]. A private copy of i is moved back to the code point start
 * (not before "start") and decoded with U8_NEXT_NULLTERM; i itself is not
 * modified.
 */
#define U8_GET_NULLTERM(s, start, i, c) { \
    int32_t _u8_get_index=(int32_t)(i); \
    U8_SET_CP_START(s, start, _u8_get_index); \
    U8_NEXT_NULLTERM(s, _u8_get_index, c); \
}
|
||||
|
||||
|
||||
/*
 * General (non-Latin-1-fast-path) comparison of two UTF-8 strings.
 *
 * Both strings are wrapped in UTF-8 UCharIterators and compared with
 * ucol_strcollRegular(). When the collator has normalization turned on, each
 * iterator is additionally wrapped in an FCD-normalizing iterator and the
 * collIterate's own normalization flag is cleared.
 *
 * @param coll          collator supplying the rules and attributes
 * @param source        source UTF-8 bytes
 * @param sourceLength  passed through to uiter_setUTF8()
 * @param target        target UTF-8 bytes
 * @param targetLength  passed through to uiter_setUTF8()
 * @param status        in/out error code
 * @return result of ucol_strcollRegular(), or UCOL_EQUAL if setting up the
 *         collation iterators failed
 */
static UCollationResult
ucol_strcollRegularUTF8(
                    const UCollator *coll,
                    const char      *source,
                    int32_t         sourceLength,
                    const char      *target,
                    int32_t         targetLength,
                    UErrorCode      *status)
{
    UCharIterator src;
    UCharIterator tgt;

    uiter_setUTF8(&src, source, sourceLength);
    uiter_setUTF8(&tgt, target, targetLength);

    // Preparing the context objects for iterating over strings
    collIterate sColl, tColl;
    IInit_collIterate(coll, NULL, -1, &sColl, status);
    IInit_collIterate(coll, NULL, -1, &tColl, status);
    if(U_FAILURE(*status)) {
        // No trace-exit here: this helper has no UTRACE_ENTRY of its own;
        // the public entry point reports the exit value.
        return UCOL_EQUAL;
    }
    // The division for the array length may truncate the array size to
    // a little less than UNORM_ITER_SIZE, but that size is dimensioned too high
    // for all platforms anyway.
    UAlignedMemory stackNormIter1[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
    UAlignedMemory stackNormIter2[UNORM_ITER_SIZE/sizeof(UAlignedMemory)];
    UNormIterator *sNormIter = NULL, *tNormIter = NULL;

    sColl.iterator = &src;
    sColl.flags |= UCOL_USE_ITERATOR;
    tColl.flags |= UCOL_USE_ITERATOR;
    tColl.iterator = &tgt;

    if(ucol_getAttribute(coll, UCOL_NORMALIZATION_MODE, status) == UCOL_ON) {
        sNormIter = unorm_openIter(stackNormIter1, sizeof(stackNormIter1), status);
        sColl.iterator = unorm_setIter(sNormIter, &src, UNORM_FCD, status);
        sColl.flags &= ~UCOL_ITER_NORM;

        tNormIter = unorm_openIter(stackNormIter2, sizeof(stackNormIter2), status);
        tColl.iterator = unorm_setIter(tNormIter, &tgt, UNORM_FCD, status);
        tColl.flags &= ~UCOL_ITER_NORM;
    }

    UCollationResult result = ucol_strcollRegular(&sColl, &tColl, status);

    // Every iterator opened above must be closed again, otherwise anything
    // it allocated beyond the stack buffer is leaked.
    if(sNormIter != NULL) {
        unorm_closeIter(sNormIter);
    }
    if(tNormIter != NULL) {
        unorm_closeIter(tNormIter);
    }

    return result;
}
|
||||
|
||||
/*
 * Looks up the Latin-1 fast-path CE for a contraction that starts at the
 * character just consumed by the caller.
 *
 * @param coll      collator owning the contraction image and latinOneCEs table
 * @param strength  level whose slice of latinOneCEs is consulted
 * @param CE        the contraction special CE found for the first character
 * @param s         UTF-8 text
 * @param index     in/out byte index of the next unread character in s; it is
 *                  advanced past a character that completes the contraction
 *                  and past skipped completely-ignorable characters
 * @param len       byte length of s, or -1 when s is NUL-terminated
 * @return the CE of the contraction or of its first character alone, or
 *         UCOL_BAIL_OUT_CE when a character with bits set in 0xFF00 follows
 */
static inline uint32_t
ucol_getLatinOneContractionUTF8(const UCollator *coll, int32_t strength,
                          uint32_t CE, const char *s, int32_t *index, int32_t len)
{
    const UChar *contraction = (UChar *)coll->image+getContractOffset(CE&0xFFF);
    int32_t latinOneOffset = (CE & 0x00FFF000) >> 12;
    // index of the CE for "first character only" at the requested level
    const int32_t baseIndex = strength*coll->latinOneTableLen+latinOneOffset;
    int32_t next = 1;   // deliberately not reset between loop rounds
    UChar32 cp = 0, candidate = 0;

    for(;;) {
        // Peek at the next character; *index is not moved here.
        if (len == -1) {
            U8_GET_NULLTERM((const uint8_t*)s, 0, *index, cp);
            if (cp == 0) {
                return(coll->latinOneCEs[baseIndex]);
            }
        } else {
            if (*index == len) {
                return(coll->latinOneCEs[baseIndex]);
            }
            U8_GET((const uint8_t*)s, 0, *index, len, cp);
        }
        if (cp == -1) {
            cp = 0xfffd;
        }

        /* since the contraction codepoints should be ordered, we skip all that are smaller */
        while(cp > (candidate = contraction[next])) {
            next++;
        }

        if (cp == candidate) {
            // the character completes a contraction: consume it
            U8_FWD_1(s, *index, len);
            return(coll->latinOneCEs[baseIndex+next]);
        }

        if(cp & 0xFF00 /*> UCOL_ENDOFLATIN1RANGE*/) {
            return UCOL_BAIL_OUT_CE;
        }

        if(UTRIE_GET32_FROM_LEAD(&coll->mapping, cp) != 0) {
            // not completely ignorable: the contraction ends before it
            return(coll->latinOneCEs[baseIndex]);
        }

        // we have to ignore completely ignorables: step over and look again
        U8_FWD_1(s, *index, len);
    }
}
|
||||
|
||||
/*
 * Fast-path comparison of two UTF-8 strings using the collator's latinOneCEs
 * table. Levels are compared one after another (primary, then secondary,
 * then tertiary, as far as coll->strength asks for). As soon as a character
 * outside U+0000..U+00FF, an ill-formed sequence, or a special CE that is not
 * a usable contraction shows up during the primary pass, the work is handed
 * to ucol_strcollRegularUTF8().
 *
 * @param coll    collator with a latinOneCEs table
 * @param source  source UTF-8 bytes
 * @param sLen    byte length of source, or -1 if NUL-terminated
 * @param target  target UTF-8 bytes
 * @param tLen    byte length of target, or -1 if NUL-terminated
 * @param status  in/out error code, only used when bailing out
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER
 */
static inline UCollationResult
ucol_strcollUseLatin1UTF8(
                const UCollator *coll,
                const char      *source,
                int32_t         sLen,
                const char      *target,
                int32_t         tLen,
                UErrorCode      *status)
{
    U_ALIGN_CODE(16);
    int32_t strength = coll->strength;

    int32_t sIndex = 0, tIndex = 0;
    UChar32 sChar = 0, tChar = 0;
    uint32_t sOrder=0, tOrder=0;

    UBool endOfSource = FALSE;

    uint32_t *elements = coll->latinOneCEs;

    // Set once a contraction was seen; French secondary cannot be done then.
    UBool haveContractions = FALSE;

    // ---------------------------------------------------------------- primary
    for(;;) {
        while(sOrder==0) { // this loop skips primary ignorables
            if (sLen==-1) {
                U8_NEXT_NULLTERM(source, sIndex, sChar);
                if (sChar == 0) {
                    endOfSource = TRUE;
                    sLen = sIndex;   // the length is known from here on
                    break;
                }
            } else {
                if (sIndex == sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U8_NEXT(source, sIndex, sLen ,sChar);
            }
            // Ill-formed input (-1, treated like U+FFFD) and anything above
            // Latin-1 both leave the fast path.
            if(sChar < 0 || sChar > 0xFF) {
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            sOrder = elements[sChar];
            if(sOrder >= UCOL_NOT_FOUND) { // if we got a special
                // specials can basically be either contractions or bail-out signs. If we get anything
                // else, we'll bail out anyway
                if(getCETag(sOrder) == CONTRACTION_TAG) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, sOrder, source, &sIndex, sLen);
                    // if there are contractions, we cannot do French secondary.
                    // However, if there are contractions in the table, but we always use just one char,
                    // we might be able to do French. This should be checked out.
                    haveContractions = TRUE;
                }
                if(sOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }

        while(tOrder==0) {  // this loop skips primary ignorables
            if (tLen == -1) {
                U8_NEXT_NULLTERM(target, tIndex, tChar);
                if (tChar == 0) {
                    if(!endOfSource) {
                        return UCOL_GREATER;
                    }
                    tLen = tIndex;
                    goto endOfPrimLoopU8;
                }
            } else {
                if (tIndex == tLen) {
                    if(!endOfSource) {
                        return UCOL_GREATER;
                    }
                    goto endOfPrimLoopU8;
                }
                U8_NEXT(target, tIndex, tLen, tChar);
            }
            // same bail-out conditions as for the source string
            if(tChar < 0 || tChar > 0xFF) {
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            tOrder = elements[tChar];
            if(tOrder >= UCOL_NOT_FOUND) {
                // Handling specials, see the comments for source
                if(getCETag(tOrder) == CONTRACTION_TAG) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_PRIMARY, tOrder, target, &tIndex, tLen);
                    haveContractions = TRUE;
                }
                if(tOrder >= UCOL_NOT_FOUND /*== UCOL_BAIL_OUT_CE*/) {
                    return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
                }
            }
        }
        if(endOfSource) { // source is finished, but target is not, say the result.
            return UCOL_LESS;
        }

        if(sOrder == tOrder) { // same CEs: fetch the next pair
            sOrder = tOrder = 0;
            continue;
        }
        // compare current top bytes; when they differ the whole values differ,
        // so one of the two orderings must hold (we must return an enum value)
        if(((sOrder^tOrder)&0xFF000000)!=0) {
            return (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
        }
        // top bytes match, continue with following bytes
        sOrder<<=8;
        tOrder<<=8;
    }

endOfPrimLoopU8:
    // after primary loop, we definitely know the sizes of strings,
    // so we set it and use simpler loop for secondaries and tertiaries
    sLen = sIndex; tLen = tIndex;
    if(strength >= UCOL_SECONDARY) {
        // adjust the table beginning
        elements += coll->latinOneTableLen;
        endOfSource = FALSE;

        if(coll->frenchCollation == UCOL_OFF) { // non French
            // This loop is a simplified copy of primary loop
            // at this point we know that whole strings are latin-1, so we don't
            // check for that. We also know that we only have contractions as
            // specials.
            sIndex = 0; tIndex = 0;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==sLen) {
                        endOfSource = TRUE;
                        break;
                    }
                    U_ASSERT(sLen >= 0);
                    U8_NEXT(source, sIndex, sLen, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    if(sOrder > UCOL_NOT_FOUND) {
                        sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, sOrder, source, &sIndex, sLen);
                    }
                }

                while(tOrder==0) {
                    if(tIndex==tLen) {
                        if(!endOfSource) {
                            return UCOL_GREATER;
                        }
                        goto endOfSecLoopU8;
                    }
                    U_ASSERT(tLen >= 0);
                    U8_NEXT(target, tIndex, tLen, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    if(tOrder > UCOL_NOT_FOUND) {
                        tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_SECONDARY, tOrder, target, &tIndex, tLen);
                    }
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = tOrder = 0;
                    continue;
                }
                // see primary loop for comments on this
                if(((sOrder^tOrder)&0xFF000000)!=0) {
                    return (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        } else { // French
            if(haveContractions) { // if we have contractions, we have to bail out
                // since we don't really know how to handle them here
                return ucol_strcollRegularUTF8(coll, source, sLen, target, tLen, status);
            }
            // For French, we go backwards
            sIndex = sLen; tIndex = tLen;
            for(;;) {
                while(sOrder==0) {
                    if(sIndex==0) {
                        endOfSource = TRUE;
                        break;
                    }
                    U8_PREV(source, 0, sIndex, sChar);
                    U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                    sOrder = elements[sChar];
                    // don't even look for contractions
                }

                while(tOrder==0) {
                    if(tIndex==0) {
                        if(!endOfSource) {
                            return UCOL_GREATER;
                        }
                        goto endOfSecLoopU8;
                    }
                    U8_PREV(target, 0, tIndex, tChar);
                    U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                    tOrder = elements[tChar];
                    // don't even look for contractions
                }
                if(endOfSource) {
                    return UCOL_LESS;
                }

                if(sOrder == tOrder) {
                    sOrder = tOrder = 0;
                    continue;
                }
                // see the primary loop for comments
                if(((sOrder^tOrder)&0xFF000000)!=0) {
                    return (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
                }
                sOrder<<=8;
                tOrder<<=8;
            }
        }
    }

endOfSecLoopU8:
    if(strength >= UCOL_TERTIARY) {
        // tertiary loop is the same as secondary (except no French)
        elements += coll->latinOneTableLen;
        sIndex = 0; tIndex = 0;
        endOfSource = FALSE;
        for(;;) {
            while(sOrder==0) {
                if(sIndex==sLen) {
                    endOfSource = TRUE;
                    break;
                }
                U_ASSERT(sLen >= 0);
                U8_NEXT(source, sIndex, sLen, sChar);
                U_ASSERT(sChar >= 0 && sChar <= 0xFF);
                sOrder = elements[sChar];
                if(sOrder > UCOL_NOT_FOUND) {
                    sOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, sOrder, source, &sIndex, sLen);
                }
            }
            while(tOrder==0) {
                if(tIndex==tLen) {
                    // if both strings are at the end, they are equal
                    return endOfSource ? UCOL_EQUAL : UCOL_GREATER;
                }
                U_ASSERT(tLen >= 0);
                U8_NEXT(target, tIndex, tLen, tChar);
                U_ASSERT(tChar >= 0 && tChar <= 0xFF);
                tOrder = elements[tChar];
                if(tOrder > UCOL_NOT_FOUND) {
                    tOrder = ucol_getLatinOneContractionUTF8(coll, UCOL_TERTIARY, tOrder, target, &tIndex, tLen);
                }
            }
            if(endOfSource) {
                return UCOL_LESS;
            }
            if(sOrder == tOrder) {
                sOrder = tOrder = 0;
                continue;
            }
            if(((sOrder^tOrder)&0xff000000)!=0) {
                return (sOrder < tOrder) ? UCOL_LESS : UCOL_GREATER;
            }
            sOrder<<=8;
            tOrder<<=8;
        }
    }
    return UCOL_EQUAL;
}
|
||||
|
||||
U_CAPI UCollationResult U_EXPORT2
|
||||
ucol_strcollIter( const UCollator *coll,
|
||||
|
@ -8272,6 +8858,194 @@ ucol_strcoll( const UCollator *coll,
|
|||
return returnVal;
|
||||
}
|
||||
|
||||
/*
 * Compares two UTF-8 strings with the given collator.
 *
 * The common leading bytes of both strings are skipped first (backing up to
 * a safe code point boundary), then the remainder is compared either with
 * the Latin-1 fast path or with the general UTF-8 comparison.
 *
 * @param coll          collator to use
 * @param source        source UTF-8 string
 * @param sourceLength  byte length of source, or -1 if NUL-terminated
 * @param target        target UTF-8 string
 * @param targetLength  byte length of target, or -1 if NUL-terminated
 * @param status        in/out error code. If it already indicates a failure
 *                      nothing is done. It is set to U_ILLEGAL_ARGUMENT_ERROR
 *                      when source or target is NULL.
 * @return UCOL_LESS, UCOL_EQUAL or UCOL_GREATER; UCOL_EQUAL on error
 */
U_DRAFT UCollationResult U_EXPORT2
ucol_strcollUTF8(
        const UCollator *coll,
        const char      *source,
        int32_t         sourceLength,
        const char      *target,
        int32_t         targetLength,
        UErrorCode      *status)
{
    U_ALIGN_CODE(16);

    UTRACE_ENTRY(UTRACE_UCOL_STRCOLLUTF8);
    if (UTRACE_LEVEL(UTRACE_VERBOSE)) {
        UTRACE_DATA3(UTRACE_VERBOSE, "coll=%p, source=%p, target=%p", coll, source, target);
        UTRACE_DATA2(UTRACE_VERBOSE, "source string = %vb ", source, sourceLength);
        UTRACE_DATA2(UTRACE_VERBOSE, "target string = %vb ", target, targetLength);
    }

    if (U_FAILURE(*status)) {
        /* a previous failure is pending: do nothing */
        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
        return UCOL_EQUAL;
    }

    if(source == NULL || target == NULL) {
        // do not crash, but report the bad argument to the caller
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        UTRACE_EXIT_VALUE_STATUS(UCOL_EQUAL, *status);
        return UCOL_EQUAL;
    }

    /* Quick check if source and target are same strings. */
    /* They should either both be NULL terminated or the explicit length should be set on both. */
    if (source==target && sourceLength==targetLength) {
        UTRACE_EXIT_VALUE(UCOL_EQUAL);
        return UCOL_EQUAL;
    }

    // TODO - provider support
    /*
    if(coll->delegate != NULL) {
        UErrorCode status = U_ZERO_ERROR;
        return ((const Collator*)coll->delegate)->compare(source,sourceLength,target,targetLength, status);
    }
    */

    /* Scan the strings.  Find:                                                             */
    /*    The length of any leading portion that is equal                                   */
    /*    Whether they are exactly equal.  (in which case we just return)                   */
    const char  *pSrc = source;
    const char  *pTarg = target;
    UBool       bSrcLimit = FALSE;
    UBool       bTargLimit = FALSE;

    if (sourceLength == -1 && targetLength == -1) {
        // Both strings are null terminated.
        //    Scan through any leading equal portion.
        while (*pSrc == *pTarg && *pSrc != 0) {
            pSrc++;
            pTarg++;
        }
        if (*pSrc == 0 && *pTarg == 0) {
            UTRACE_EXIT_VALUE(UCOL_EQUAL);
            return UCOL_EQUAL;
        }
        bSrcLimit = (*pSrc == 0);
        bTargLimit = (*pTarg == 0);
    }
    else
    {
        // One or both strings has an explicit length.
        const char *pSrcEnd = source + sourceLength;
        const char *pTargEnd = target + targetLength;

        // Scan while the strings are bitwise ==, or until one is exhausted.
        for (;;) {
            if (pSrc == pSrcEnd || pTarg == pTargEnd) {
                break;
            }
            if ((*pSrc == 0 && sourceLength == -1) || (*pTarg == 0 && targetLength == -1)) {
                break;
            }
            if (*pSrc != *pTarg) {
                break;
            }
            pSrc++;
            pTarg++;
        }
        bSrcLimit = (pSrc ==pSrcEnd  || (pSrcEnd <pSrc  && *pSrc==0));
        bTargLimit = (pTarg==pTargEnd || (pTargEnd<pTarg && *pTarg==0));

        // If we made it all the way through both strings, we are done.  They are ==
        if (bSrcLimit &&    /* At end of src string, however it was specified. */
            bTargLimit)     /* and also at end of dest string                  */
        {
            UTRACE_EXIT_VALUE(UCOL_EQUAL);
            return UCOL_EQUAL;
        }
    }

    U_ASSERT(!(bSrcLimit && bTargLimit));

    int32_t    equalLength = (int32_t)(pSrc - source);
    UBool       bSawNonLatin1 = FALSE;

    if (equalLength > 0) {
        // Align position to the start of UTF-8 code point.
        if (bTargLimit) {
            U8_SET_CP_START((const uint8_t*)source, 0, equalLength);
        } else {
            U8_SET_CP_START((const uint8_t*)target, 0, equalLength);
        }
        pSrc = source + equalLength;
        pTarg = target + equalLength;
    }

    if (equalLength > 0) {
        /* There is an identical portion at the beginning of the two strings.        */
        /*   If the identical portion ends within a contraction or a combining       */
        /*   character sequence, back up to the start of that sequence.              */
        UBool bUnsafeCP = FALSE;
        UChar32 uc32 = -1;

        if (!bSrcLimit) {
            if (sourceLength >= 0) {
                U8_GET((uint8_t*)source, 0, equalLength, sourceLength, uc32);
            } else {
                U8_GET_NULLTERM((uint8_t*)source, 0, equalLength, uc32);
            }
            if (uc32 == -1) {
                uc32 = 0xfffd;
                bSawNonLatin1 |= TRUE;
            } else {
                if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
                    bUnsafeCP = TRUE;
                }
                bSawNonLatin1 |= (uc32 > 0xff);
            }
        }
        if (!bTargLimit) {
            if (targetLength >= 0) {
                U8_GET((uint8_t*)target, 0, equalLength, targetLength, uc32);
            } else {
                U8_GET_NULLTERM((uint8_t*)target, 0, equalLength, uc32);
            }
            if (uc32 == -1) {
                uc32 = 0xfffd;
                bSawNonLatin1 |= TRUE;
            } else {
                if (uc32 >= 0x10000 || ucol_unsafeCP((UChar)uc32, coll)) {
                    bUnsafeCP = TRUE;
                }
                bSawNonLatin1 |= (uc32 > 0xff);
            }
        }

        if (bUnsafeCP) {
            while (equalLength > 0) {
                // We are stopped in the middle of a contraction.
                // Scan backwards through the == part of the string looking for the start of the contraction.
                //   It doesn't matter which string we scan, since they are the same in this region.
                U8_PREV((uint8_t*)source, 0, equalLength, uc32);
                bSawNonLatin1 |= (uc32 > 0xff);
                if (uc32 < 0x10000 && !ucol_unsafeCP((UChar)uc32, coll)) {
                    break;
                }
            }
        }
        source += equalLength;
        target += equalLength;
        if (sourceLength > 0) {
            sourceLength -= equalLength;
        }
        if (targetLength > 0) {
            targetLength -= equalLength;
        }
    } else {
        // Lead byte of Latin 1 character is 0x00 - 0xC3.
        // Compare as unsigned bytes so that the test does not depend on
        // whether plain char is signed on this platform.
        bSawNonLatin1 = (source && (sourceLength != 0) && ((uint8_t)*source > 0xc3));
        bSawNonLatin1 |= (target && (targetLength != 0) && ((uint8_t)*target > 0xc3));
    }

    UCollationResult returnVal;

    if(!coll->latinOneUse || bSawNonLatin1) {
        returnVal = ucol_strcollRegularUTF8(coll, source, sourceLength, target, targetLength, status);
    } else {
        returnVal = ucol_strcollUseLatin1UTF8(coll, source, sourceLength, target, targetLength, status);
    }
    UTRACE_EXIT_VALUE(returnVal);
    return returnVal;
}
|
||||
|
||||
|
||||
/* convenience function for comparing strings */
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucol_greater( const UCollator *coll,
|
||||
|
|
|
@ -533,6 +533,33 @@ ucol_strcoll( const UCollator *coll,
|
|||
const UChar *target,
|
||||
int32_t targetLength);
|
||||
|
||||
/**
|
||||
* Compare two strings in UTF-8.
|
||||
* The strings will be compared using the options already specified.
|
||||
* Note: When input string contains a malformed UTF-8 byte sequence,
|
||||
* this function treats these bytes as REPLACEMENT CHARACTER (U+FFFD).
|
||||
* @param coll The UCollator containing the comparison rules.
|
||||
* @param source The source UTF-8 string.
|
||||
* @param sourceLength The length of source, or -1 if null-terminated.
|
||||
* @param target The target UTF-8 string.
|
||||
* @param targetLength The length of target, or -1 if null-terminated.
|
||||
* @param status A pointer to an UErrorCode to receive any errors
|
||||
* @return The result of comparing the strings; one of UCOL_EQUAL,
|
||||
* UCOL_GREATER, UCOL_LESS
|
||||
* @see ucol_greater
|
||||
* @see ucol_greaterOrEqual
|
||||
* @see ucol_equal
|
||||
* @draft ICU 50
|
||||
*/
|
||||
U_DRAFT UCollationResult U_EXPORT2
|
||||
ucol_strcollUTF8(
|
||||
const UCollator *coll,
|
||||
const char *source,
|
||||
int32_t sourceLength,
|
||||
const char *target,
|
||||
int32_t targetLength,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Determine if one string is greater than another.
|
||||
* This function is equivalent to {@link #ucol_strcoll } == UCOL_GREATER
|
||||
|
|
|
@ -279,6 +279,18 @@ static void doTestVariant(UCollator* myCollation, const UChar source[], const UC
|
|||
UColAttributeValue norm = ucol_getAttribute(myCollation, UCOL_NORMALIZATION_MODE, &status);
|
||||
|
||||
UCharIterator sIter, tIter;
|
||||
|
||||
compareResult = ucol_strcoll(myCollation, source, sLen, target, tLen);
|
||||
if (compareResult != result) {
|
||||
log_err("ucol_strcoll with explicit length returned wrong result (%i exp. %i): %s, %s\n",
|
||||
compareResult, result, aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
compareResulta = ucol_strcoll(myCollation, source, -1, target, -1);
|
||||
if (compareResulta != result) {
|
||||
log_err("ucol_strcoll with null terminated strings returned wrong result (%i exp. %i): %s, %s\n",
|
||||
compareResult, result, aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
|
||||
uiter_setString(&sIter, source, sLen);
|
||||
uiter_setString(&tIter, target, tLen);
|
||||
compareResultIter = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
|
||||
|
@ -286,42 +298,65 @@ static void doTestVariant(UCollator* myCollation, const UChar source[], const UC
|
|||
log_err("different results in iterative comparison for UTF-16 encoded strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
|
||||
/* convert the strings to UTF-8 and do try comparing with char iterator */
|
||||
if(getTestOption(QUICK_OPTION) <= 0) { /*!QUICK*/
|
||||
char utf8Source[256], utf8Target[256];
|
||||
int32_t utf8SourceLen = 0, utf8TargetLen = 0;
|
||||
u_strToUTF8(utf8Source, 256, &utf8SourceLen, source, sLen, &status);
|
||||
if(U_FAILURE(status)) { /* probably buffer is not big enough */
|
||||
log_verbose("Src UTF-8 buffer too small! Will not compare!\n");
|
||||
} else {
|
||||
u_strToUTF8(utf8Target, 256, &utf8TargetLen, target, tLen, &status);
|
||||
if(U_SUCCESS(status)) { /* probably buffer is not big enough */
|
||||
UCollationResult compareResultUTF8 = result, compareResultUTF8Norm = result;
|
||||
/*UCharIterator sIter, tIter;*/
|
||||
/*log_verbose("Strings converted to UTF-8:%s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));*/
|
||||
uiter_setUTF8(&sIter, utf8Source, utf8SourceLen);
|
||||
uiter_setUTF8(&tIter, utf8Target, utf8TargetLen);
|
||||
/*uiter_setString(&sIter, source, sLen);
|
||||
uiter_setString(&tIter, target, tLen);*/
|
||||
compareResultUTF8 = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
|
||||
ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
|
||||
sIter.move(&sIter, 0, UITER_START);
|
||||
tIter.move(&tIter, 0, UITER_START);
|
||||
compareResultUTF8Norm = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
|
||||
ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, norm, &status);
|
||||
if(compareResultUTF8 != compareResultIter) {
|
||||
log_err("different results in iterative comparison for UTF-16 and UTF-8 encoded strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
if(compareResultUTF8 != compareResultUTF8Norm) {
|
||||
log_err("different results in iterative when normalization is turned on with UTF-8 strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
/* convert the strings to UTF-8 and do try comparing with char iterator and ucol_strcollUTF8 */
|
||||
{
|
||||
char utf8Source[256], utf8Target[256];
|
||||
int32_t utf8SourceLen = 0, utf8TargetLen = 0;
|
||||
|
||||
u_strToUTF8(utf8Source, 256, &utf8SourceLen, source, sLen, &status);
|
||||
if(U_FAILURE(status)) { /* probably buffer is not big enough */
|
||||
log_verbose("Src UTF-8 buffer too small! Will not compare!\n");
|
||||
} else {
|
||||
log_verbose("Target UTF-8 buffer too small! Did not compare!\n");
|
||||
u_strToUTF8(utf8Target, 256, &utf8TargetLen, target, tLen, &status);
|
||||
if(U_SUCCESS(status)) {
|
||||
{
|
||||
/* ucol_strcollUTF8 */
|
||||
compareResulta = ucol_strcollUTF8(myCollation, utf8Source, utf8SourceLen, utf8Target, utf8TargetLen, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("Error in ucol_strcollUTF8 with explicit length\n");
|
||||
status = U_ZERO_ERROR;
|
||||
} else if (compareResulta != result) {
|
||||
log_err("ucol_strcollUTF8 with explicit length returned wrong result (%i exp. %i): %s, %s\n",
|
||||
compareResulta, result, aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
compareResulta = ucol_strcollUTF8(myCollation, utf8Source, -1, utf8Target, -1, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("Error in ucol_strcollUTF8 with null terminated strings\n");
|
||||
status = U_ZERO_ERROR;
|
||||
} else if (compareResulta != result) {
|
||||
log_err("ucol_strcollUTF8 with null terminated strings returned wrong result (%i exp. %i): %s, %s\n",
|
||||
compareResulta, result, aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
/* char iterator over UTF8 */
|
||||
UCollationResult compareResultUTF8Iter = result, compareResultUTF8IterNorm = result;
|
||||
|
||||
uiter_setUTF8(&sIter, utf8Source, utf8SourceLen);
|
||||
uiter_setUTF8(&tIter, utf8Target, utf8TargetLen);
|
||||
compareResultUTF8Iter = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
|
||||
|
||||
ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
|
||||
sIter.move(&sIter, 0, UITER_START);
|
||||
tIter.move(&tIter, 0, UITER_START);
|
||||
compareResultUTF8IterNorm = ucol_strcollIter(myCollation, &sIter, &tIter, &status);
|
||||
|
||||
ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, norm, &status);
|
||||
if(compareResultUTF8Iter != compareResultIter) {
|
||||
log_err("different results in iterative comparison for UTF-16 and UTF-8 encoded strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
if(compareResultUTF8Iter != compareResultUTF8IterNorm) {
|
||||
log_err("different results in iterative when normalization is turned on with UTF-8 strings. %s, %s\n", aescstrdup(source,-1), aescstrdup(target,-1));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log_verbose("Target UTF-8 buffer too small! Did not compare!\n");
|
||||
}
|
||||
if(U_FAILURE(status)) {
|
||||
log_verbose("UTF-8 strcoll failed! Ignoring result\n");
|
||||
}
|
||||
}
|
||||
if(U_FAILURE(status)) {
|
||||
log_verbose("UTF-8 strcoll failed! Ignoring result\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* testing the partial sortkeys */
|
||||
|
@ -358,13 +393,6 @@ static void doTestVariant(UCollator* myCollation, const UChar source[], const UC
|
|||
/*log_verbose("\n");*/
|
||||
}
|
||||
|
||||
|
||||
compareResult = ucol_strcoll(myCollation, source, sLen, target, tLen);
|
||||
compareResulta = ucol_strcoll(myCollation, source, -1, target, -1);
|
||||
if (compareResult != compareResulta) {
|
||||
log_err("ucol_strcoll result from null terminated and explicit length strings differs.\n");
|
||||
}
|
||||
|
||||
sortklen1=ucol_getSortKey(myCollation, source, sLen, NULL, 0);
|
||||
sortklen2=ucol_getSortKey(myCollation, target, tLen, NULL, 0);
|
||||
|
||||
|
|
|
@ -169,6 +169,42 @@ static void doAssert(int condition, const char *message)
|
|||
}
|
||||
}
|
||||
|
||||
#define UTF8_BUF_SIZE 128
|
||||
|
||||
static void doStrcoll(const UCollator* coll, const UChar* src, int32_t srcLen, const UChar* tgt, int32_t tgtLen,
|
||||
UCollationResult expected, const char *message) {
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
char srcU8[UTF8_BUF_SIZE], tgtU8[UTF8_BUF_SIZE];
|
||||
int32_t srcU8Len = -1, tgtU8Len = -1;
|
||||
int32_t len = 0;
|
||||
|
||||
if (ucol_strcoll(coll, src, srcLen, tgt, tgtLen) != expected) {
|
||||
log_err("ERROR : %s\n", message);
|
||||
}
|
||||
|
||||
u_strToUTF8(srcU8, UTF8_BUF_SIZE, &len, src, srcLen, &err);
|
||||
if (U_FAILURE(err) || len >= UTF8_BUF_SIZE) {
|
||||
log_err("ERROR : UTF-8 conversion error\n");
|
||||
return;
|
||||
}
|
||||
if (srcLen >= 0) {
|
||||
srcU8Len = len;
|
||||
}
|
||||
u_strToUTF8(tgtU8, UTF8_BUF_SIZE, &len, tgt, tgtLen, &err);
|
||||
if (U_FAILURE(err) || len >= UTF8_BUF_SIZE) {
|
||||
log_err("ERROR : UTF-8 conversion error\n");
|
||||
return;
|
||||
}
|
||||
if (tgtLen >= 0) {
|
||||
tgtU8Len = len;
|
||||
}
|
||||
|
||||
if (ucol_strcollUTF8(coll, srcU8, srcU8Len, tgtU8, tgtU8Len, &err) != expected
|
||||
|| U_FAILURE(err)) {
|
||||
log_err("ERROR: %s (strcollUTF8)\n", message);
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* We don't have default rules, at least not in the previous sense */
|
||||
void TestGetDefaultRules(){
|
||||
|
@ -233,7 +269,8 @@ void TestProperty()
|
|||
UCollator *col, *ruled;
|
||||
UChar *disName;
|
||||
int32_t len = 0;
|
||||
UChar *source, *target;
|
||||
UChar source[12], target[12];
|
||||
char sourceU8[36], targetU8[36];
|
||||
int32_t tempLength;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
/*
|
||||
|
@ -279,37 +316,31 @@ void TestProperty()
|
|||
versionUCAArray[0], versionUCAArray[1], versionUCAArray[2], versionUCAArray[3]);
|
||||
}
|
||||
|
||||
source=(UChar*)malloc(sizeof(UChar) * 12);
|
||||
target=(UChar*)malloc(sizeof(UChar) * 12);
|
||||
|
||||
|
||||
u_uastrcpy(source, "ab");
|
||||
u_uastrcpy(target, "abc");
|
||||
|
||||
doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_LESS), "ab < abc comparison failed");
|
||||
doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_LESS, "ab < abc comparison failed");
|
||||
|
||||
u_uastrcpy(source, "ab");
|
||||
u_uastrcpy(target, "AB");
|
||||
|
||||
doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_LESS), "ab < AB comparison failed");
|
||||
/* u_uastrcpy(source, "black-bird");
|
||||
u_uastrcpy(target, "blackbird"); */
|
||||
u_uastrcpy(target, "black-bird");
|
||||
u_uastrcpy(source, "blackbird");
|
||||
doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_LESS, "ab < AB comparison failed");
|
||||
|
||||
u_uastrcpy(source, "blackbird");
|
||||
u_uastrcpy(target, "black-bird");
|
||||
|
||||
doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_GREATER, "black-bird > blackbird comparison failed");
|
||||
|
||||
doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_GREATER),
|
||||
"black-bird > blackbird comparison failed");
|
||||
u_uastrcpy(source, "black bird");
|
||||
u_uastrcpy(target, "black-bird");
|
||||
doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_LESS),
|
||||
"black bird < black-bird comparison failed");
|
||||
|
||||
doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_LESS, "black bird < black-bird comparison failed");
|
||||
|
||||
u_uastrcpy(source, "Hello");
|
||||
u_uastrcpy(target, "hello");
|
||||
|
||||
doAssert((ucol_strcoll(col, source, u_strlen(source), target, u_strlen(target)) == UCOL_GREATER),
|
||||
"Hello > hello comparison failed");
|
||||
free(source);
|
||||
free(target);
|
||||
doStrcoll(col, source, u_strlen(source), target, u_strlen(target), UCOL_GREATER, "Hello > hello comparison failed");
|
||||
|
||||
log_verbose("Test ucol_strcoll ends.\n");
|
||||
|
||||
log_verbose("testing ucol_getStrength() method ...\n");
|
||||
|
|
Loading…
Add table
Reference in a new issue