mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 23:10:40 +00:00
ICU-7264 UCA 6.0 data, test data, and bug fixes; from branches/markus/uca60 -r 28826:28857
X-SVN-Rev: 28875
This commit is contained in:
parent
1c7566e3db
commit
d29bfdf854
21 changed files with 77360 additions and 44460 deletions
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
@ -204,7 +204,8 @@ Unicode 6.0 update
|
|||
- update source/data/unidata/UCARules.txt with UCA_Rules_SHORT.txt
|
||||
- update Han-implicit ranges for new CJK extensions:
|
||||
swapCJK() in ucol.cpp & ImplicitCEGenerator.java
|
||||
- genuca: allow bytes 02 for U+FFFE, new merge-sort character
|
||||
- genuca: allow bytes 02 for U+FFFE, new merge-sort character;
|
||||
do not add it into invuca so that tailoring primary-after an ignorable works
|
||||
- genuca: permit space between [variable top] bytes
|
||||
- ucol.cpp: treat noncharacters like unassigned rather than ignorable
|
||||
- run makeuca.sh:
|
||||
|
|
|
@ -63,7 +63,7 @@ static UChar32 fcdHighStart = 0;
|
|||
// implicit generation and supressing sort key compression
|
||||
// they should regularly be in the UCA, but if one
|
||||
// is running without UCA, it could be a problem
|
||||
static const int32_t maxRegularPrimary = 0xA0;
|
||||
static const int32_t maxRegularPrimary = 0x7A;
|
||||
static const int32_t minImplicitPrimary = 0xE0;
|
||||
static const int32_t maxImplicitPrimary = 0xE4;
|
||||
|
||||
|
@ -295,6 +295,29 @@ ensureCEsCapacity(collIterate *data, int32_t minCapacity) {
|
|||
return reallocCEs(data, minCapacity > oldCapacity ? minCapacity : oldCapacity);
|
||||
}
|
||||
|
||||
void collIterate::appendOffset(int32_t offset, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
int32_t length = offsetStore == NULL ? 0 : (int32_t)(offsetStore - offsetBuffer);
|
||||
if(length >= offsetBufferSize) {
|
||||
int32_t newCapacity = 2 * offsetBufferSize + UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
int32_t *newBuffer = reinterpret_cast<int32_t *>(uprv_malloc(newCapacity * 4));
|
||||
if(newBuffer == NULL) {
|
||||
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
if(length > 0) {
|
||||
uprv_memcpy(newBuffer, offsetBuffer, length * 4);
|
||||
}
|
||||
uprv_free(offsetBuffer);
|
||||
offsetBuffer = newBuffer;
|
||||
offsetStore = offsetBuffer + length;
|
||||
offsetBufferSize = newCapacity;
|
||||
}
|
||||
*offsetStore++ = offset;
|
||||
}
|
||||
|
||||
/*
|
||||
* collIter_eos()
|
||||
* Checks for a collIterate being positioned at the end of
|
||||
|
@ -965,35 +988,63 @@ static int32_t
|
|||
min4Boundary = 0;
|
||||
|
||||
static const UChar32
|
||||
// 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
|
||||
CJK_BASE = 0x4E00,
|
||||
CJK_LIMIT = 0x9FFF+1,
|
||||
CJK_LIMIT = 0x9FCB+1,
|
||||
// Unified CJK ideographs in the compatibility ideographs block.
|
||||
CJK_COMPAT_USED_BASE = 0xFA0E,
|
||||
CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
|
||||
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
|
||||
CJK_A_BASE = 0x3400,
|
||||
CJK_A_LIMIT = 0x4DBF+1,
|
||||
CJK_A_LIMIT = 0x4DB5+1,
|
||||
// 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
|
||||
CJK_B_BASE = 0x20000,
|
||||
CJK_B_LIMIT = 0x2A6DF+1;
|
||||
CJK_B_LIMIT = 0x2A6D6+1,
|
||||
// 2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;
|
||||
CJK_C_BASE = 0x2A700,
|
||||
CJK_C_LIMIT = 0x2B734+1,
|
||||
// 2B740;<CJK Ideograph Extension D, First>;Lo;0;L;;;;;N;;;;;
|
||||
// 2B81D;<CJK Ideograph Extension D, Last>;Lo;0;L;;;;;N;;;;;
|
||||
CJK_D_BASE = 0x2B740,
|
||||
CJK_D_LIMIT = 0x2B81D+1;
|
||||
// when adding to this list, look for all occurrences (in project)
|
||||
// of CJK_C_BASE and CJK_C_LIMIT, etc. to check for code that needs changing!!!!
|
||||
|
||||
static UChar32 swapCJK(UChar32 i) {
|
||||
|
||||
if (i >= CJK_BASE) {
|
||||
if (i < CJK_LIMIT) return i - CJK_BASE;
|
||||
|
||||
if (i < CJK_COMPAT_USED_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < CJK_COMPAT_USED_LIMIT) return i - CJK_COMPAT_USED_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE);
|
||||
if (i < CJK_B_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < CJK_B_LIMIT) return i; // non-BMP-CJK
|
||||
|
||||
return i + NON_CJK_OFFSET; // non-CJK
|
||||
if (i < CJK_A_BASE) {
|
||||
// non-CJK
|
||||
} else if (i < CJK_A_LIMIT) {
|
||||
// Extension A has lower code points than the original Unihan+compat
|
||||
// but sorts higher.
|
||||
return i - CJK_A_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE)
|
||||
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
|
||||
} else if (i < CJK_BASE) {
|
||||
// non-CJK
|
||||
} else if (i < CJK_LIMIT) {
|
||||
return i - CJK_BASE;
|
||||
} else if (i < CJK_COMPAT_USED_BASE) {
|
||||
// non-CJK
|
||||
} else if (i < CJK_COMPAT_USED_LIMIT) {
|
||||
return i - CJK_COMPAT_USED_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE);
|
||||
} else if (i < CJK_B_BASE) {
|
||||
// non-CJK
|
||||
} else if (i < CJK_B_LIMIT) {
|
||||
return i; // non-BMP-CJK
|
||||
} else if (i < CJK_C_BASE) {
|
||||
// non-CJK
|
||||
} else if (i < CJK_C_LIMIT) {
|
||||
return i; // non-BMP-CJK
|
||||
} else if (i < CJK_D_BASE) {
|
||||
// non-CJK
|
||||
} else if (i < CJK_D_LIMIT) {
|
||||
return i; // non-BMP-CJK
|
||||
}
|
||||
if (i < CJK_A_BASE) return i + NON_CJK_OFFSET;
|
||||
|
||||
if (i < CJK_A_LIMIT) return i - CJK_A_BASE
|
||||
+ (CJK_LIMIT - CJK_BASE)
|
||||
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
|
||||
return i + NON_CJK_OFFSET; // non-CJK
|
||||
}
|
||||
|
||||
|
@ -1599,23 +1650,6 @@ void collPrevIterNormalize(collIterate *data)
|
|||
*/
|
||||
data->writableBuffer.insert(0, (UChar)0);
|
||||
|
||||
if (data->offsetBuffer == NULL) {
|
||||
int32_t len = normLen >= UCOL_EXPAND_CE_BUFFER_SIZE ? normLen + 1 : UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
|
||||
data->offsetBufferSize = len;
|
||||
data->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * len);
|
||||
data->offsetStore = data->offsetBuffer;
|
||||
} else if(data->offsetBufferSize < normLen) {
|
||||
int32_t storeIX = (int32_t)(data->offsetStore - data->offsetBuffer);
|
||||
int32_t *tob = (int32_t *) uprv_realloc(data->offsetBuffer, sizeof(int32_t) * (normLen + 1));
|
||||
|
||||
if (tob != NULL) {
|
||||
data->offsetBuffer = tob;
|
||||
data->offsetStore = &data->offsetBuffer[storeIX];
|
||||
data->offsetBufferSize = normLen + 1;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* The usual case at this point is that we've got a base
|
||||
* character followed by marks that were normalized. If
|
||||
|
@ -1660,13 +1694,13 @@ void collPrevIterNormalize(collIterate *data)
|
|||
}
|
||||
}
|
||||
|
||||
*(data->offsetStore++) = baseOffset;
|
||||
data->appendOffset(baseOffset, status);
|
||||
}
|
||||
|
||||
*(data->offsetStore++) = firstMarkOffset;
|
||||
data->appendOffset(firstMarkOffset, status);
|
||||
|
||||
for (int32_t i = 0; i < trailCount; i += 1) {
|
||||
*(data->offsetStore++) = trailOffset;
|
||||
data->appendOffset(trailOffset, status);
|
||||
}
|
||||
|
||||
data->offsetRepeatValue = trailOffset;
|
||||
|
@ -1748,26 +1782,92 @@ inline UBool collPrevIterFCD(collIterate *data)
|
|||
return result;
|
||||
}
|
||||
|
||||
/** gets a character from the string at a given offset
|
||||
/** gets a code unit from the string at a given offset
|
||||
* Handles both normal and iterative cases.
|
||||
* No error checking - caller beware!
|
||||
*/
|
||||
inline static
|
||||
UChar peekCharacter(collIterate *source, int32_t offset) {
|
||||
static inline
|
||||
UChar peekCodeUnit(collIterate *source, int32_t offset) {
|
||||
if(source->pos != NULL) {
|
||||
return *(source->pos + offset);
|
||||
} else if(source->iterator != NULL) {
|
||||
UChar32 c;
|
||||
if(offset != 0) {
|
||||
source->iterator->move(source->iterator, offset, UITER_CURRENT);
|
||||
UChar toReturn = (UChar)source->iterator->next(source->iterator);
|
||||
c = source->iterator->next(source->iterator);
|
||||
source->iterator->move(source->iterator, -offset-1, UITER_CURRENT);
|
||||
return toReturn;
|
||||
} else {
|
||||
return (UChar)source->iterator->current(source->iterator);
|
||||
c = source->iterator->current(source->iterator);
|
||||
}
|
||||
return c >= 0 ? (UChar)c : 0xfffd; // If the caller works properly, we should never see c<0.
|
||||
} else {
|
||||
return 0xfffd;
|
||||
}
|
||||
}
|
||||
|
||||
// Code point version. Treats the offset as a _code point_ delta.
|
||||
// We cannot use U16_FWD_1_UNSAFE and similar because we might not have well-formed UTF-16.
|
||||
// We cannot use U16_FWD_1 and similar because we do not know the start and limit of the buffer.
|
||||
static inline
|
||||
UChar32 peekCodePoint(collIterate *source, int32_t offset) {
|
||||
UChar32 c;
|
||||
if(source->pos != NULL) {
|
||||
const UChar *p = source->pos;
|
||||
if(offset >= 0) {
|
||||
// Skip forward over (offset-1) code points.
|
||||
while(--offset >= 0) {
|
||||
if(U16_IS_LEAD(*p++) && U16_IS_TRAIL(*p)) {
|
||||
++p;
|
||||
}
|
||||
}
|
||||
// Read the code point there.
|
||||
c = *p++;
|
||||
UChar trail;
|
||||
if(U16_IS_LEAD(c) && U16_IS_TRAIL(trail = *p)) {
|
||||
c = U16_GET_SUPPLEMENTARY(c, trail);
|
||||
}
|
||||
} else /* offset<0 */ {
|
||||
// Skip backward over (offset-1) code points.
|
||||
while(++offset < 0) {
|
||||
if(U16_IS_TRAIL(*--p) && U16_IS_LEAD(*(p - 1))) {
|
||||
--p;
|
||||
}
|
||||
}
|
||||
// Read the code point before that.
|
||||
c = *--p;
|
||||
UChar lead;
|
||||
if(U16_IS_TRAIL(c) && U16_IS_LEAD(lead = *(p - 1))) {
|
||||
c = U16_GET_SUPPLEMENTARY(lead, c);
|
||||
}
|
||||
}
|
||||
} else if(source->iterator != NULL) {
|
||||
if(offset >= 0) {
|
||||
// Skip forward over (offset-1) code points.
|
||||
int32_t fwd = offset;
|
||||
while(fwd-- > 0) {
|
||||
uiter_next32(source->iterator);
|
||||
}
|
||||
// Read the code point there.
|
||||
c = uiter_current32(source->iterator);
|
||||
// Return to the starting point, skipping backward over (offset-1) code points.
|
||||
while(offset-- > 0) {
|
||||
uiter_previous32(source->iterator);
|
||||
}
|
||||
} else /* offset<0 */ {
|
||||
// Read backward, reading offset code points, remember only the last-read one.
|
||||
int32_t back = offset;
|
||||
do {
|
||||
c = uiter_previous32(source->iterator);
|
||||
} while(++back < 0);
|
||||
// Return to the starting position, skipping forward over offset code points.
|
||||
do {
|
||||
uiter_next32(source->iterator);
|
||||
} while(++offset < 0);
|
||||
}
|
||||
} else {
|
||||
return (UChar)U_SENTINEL;
|
||||
c = U_SENTINEL;
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1830,7 +1930,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
|||
} else {
|
||||
if (data->offsetReturn == data->offsetBuffer) {
|
||||
data->offsetReturn = NULL;
|
||||
data->offsetStore = data->offsetBuffer;
|
||||
data->offsetStore = data->offsetBuffer;
|
||||
} else {
|
||||
data->offsetReturn -= 1;
|
||||
}
|
||||
|
@ -2304,7 +2404,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
|
|||
|
||||
backupState(source, &discState);
|
||||
|
||||
buffer.setTo(peekCharacter(source, -1));
|
||||
buffer.setTo(peekCodePoint(source, -1));
|
||||
for (;;) {
|
||||
UChar *UCharOffset;
|
||||
UChar schar,
|
||||
|
@ -2312,7 +2412,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
|
|||
uint32_t result;
|
||||
|
||||
if (((source->flags & UCOL_ITER_HASLEN) && source->pos >= source->endp)
|
||||
|| (peekCharacter(source, 0) == 0 &&
|
||||
|| (peekCodeUnit(source, 0) == 0 &&
|
||||
//|| (*source->pos == 0 &&
|
||||
((source->flags & UCOL_ITER_INNORMBUF) == 0 ||
|
||||
source->fcdPosition == NULL ||
|
||||
|
@ -2322,7 +2422,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
|
|||
/* end of string in null terminated string or stopped by a
|
||||
null character, note fcd does not always point to a base
|
||||
character after the discontiguos change */
|
||||
u_getCombiningClass(peekCharacter(source, 0)) == 0) {
|
||||
u_getCombiningClass(peekCodePoint(source, 0)) == 0) {
|
||||
//u_getCombiningClass(*(source->pos)) == 0) {
|
||||
//constart = (UChar *)coll->image + getContractOffset(CE);
|
||||
if (multicontraction) {
|
||||
|
@ -2350,8 +2450,7 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
|
|||
}
|
||||
else {
|
||||
if (u_getCombiningClass(schar) ==
|
||||
u_getCombiningClass(peekCharacter(source, -2))) {
|
||||
//u_getCombiningClass(*(source->pos - 2))) {
|
||||
u_getCombiningClass(peekCodePoint(source, -2))) {
|
||||
buffer.append(schar);
|
||||
continue;
|
||||
}
|
||||
|
@ -2390,17 +2489,9 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
|
|||
return *(coll->contractionCEs + (constart - coll->contractionIndex));
|
||||
}
|
||||
|
||||
static
|
||||
inline UBool isNonChar(UChar32 cp) {
|
||||
return (UBool)((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDFFF));
|
||||
}
|
||||
|
||||
/* now uses Mark's getImplicitPrimary code */
|
||||
static
|
||||
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
|
||||
if(isNonChar(cp)) {
|
||||
return 0;
|
||||
}
|
||||
uint32_t r = uprv_uca_getImplicitPrimary(cp);
|
||||
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
collationSource->offsetRepeatCount += 1;
|
||||
|
@ -3128,7 +3219,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
/* we encountered a leading surrogate. We shall get the CE by using the following code unit */
|
||||
/* two things can happen here: next code point can be a trailing surrogate - we will use it */
|
||||
/* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
|
||||
/* we return 0 (completely ignorable - per UCA specification */
|
||||
/* we treat it like an unassigned code point. */
|
||||
{
|
||||
UChar trail;
|
||||
collIterateState state;
|
||||
|
@ -3137,7 +3228,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
// we chould have stepped one char forward and it might have turned that it
|
||||
// was not a trail surrogate. In that case, we have to backup.
|
||||
loadState(source, &state, TRUE);
|
||||
return 0;
|
||||
return UCOL_NOT_FOUND;
|
||||
} else {
|
||||
/* TODO: CE contain the data from the previous CE + the mask. It should at least be unmasked */
|
||||
CE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, CE&0xFFFFFF, trail);
|
||||
|
@ -3158,19 +3249,16 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
|
||||
source->iterator->next(source->iterator);
|
||||
return getImplicit(cp, source);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
} else if((((source->flags & UCOL_ITER_HASLEN) == 0 ) || (source->pos<source->endp)) &&
|
||||
U_IS_TRAIL((nextChar=*source->pos))) {
|
||||
cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
|
||||
source->pos++;
|
||||
return getImplicit(cp, source);
|
||||
} else {
|
||||
return 0; /* completely ignorable */
|
||||
U_IS_TRAIL((nextChar=*source->pos))) {
|
||||
cp = U16_GET_SUPPLEMENTARY(ch, nextChar);
|
||||
source->pos++;
|
||||
return getImplicit(cp, source);
|
||||
}
|
||||
return UCOL_NOT_FOUND;
|
||||
case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
|
||||
return 0; /* broken surrogate sequence */
|
||||
return UCOL_NOT_FOUND; /* broken surrogate sequence */
|
||||
case CHARSET_TAG:
|
||||
/* not yet implemented */
|
||||
/* probably after 1.8 */
|
||||
|
@ -3189,36 +3277,27 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
/* now uses Mark's getImplicitPrimary code */
|
||||
static
|
||||
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
|
||||
if(isNonChar(cp)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t r = uprv_uca_getImplicitPrimary(cp);
|
||||
|
||||
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
|
||||
if (collationSource->offsetBuffer == NULL) {
|
||||
collationSource->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
collationSource->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
collationSource->offsetStore = collationSource->offsetBuffer;
|
||||
}
|
||||
// **** doesn't work if using iterator ****
|
||||
if (collationSource->flags & UCOL_ITER_INNORMBUF) {
|
||||
collationSource->offsetRepeatCount = 1;
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
|
||||
|
||||
// **** doesn't work if using iterator ****
|
||||
if (collationSource->flags & UCOL_ITER_INNORMBUF) {
|
||||
collationSource->offsetRepeatCount = 1;
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(collationSource->pos - collationSource->string);
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
collationSource->appendOffset(firstOffset, errorCode);
|
||||
collationSource->appendOffset(firstOffset + 1, errorCode);
|
||||
|
||||
*(collationSource->offsetStore++) = firstOffset;
|
||||
*(collationSource->offsetStore++) = firstOffset + 1;
|
||||
|
||||
collationSource->offsetReturn = collationSource->offsetStore - 1;
|
||||
*(collationSource->offsetBuffer) = firstOffset;
|
||||
if (collationSource->offsetReturn == collationSource->offsetBuffer) {
|
||||
collationSource->offsetStore = collationSource->offsetBuffer;
|
||||
}
|
||||
}
|
||||
collationSource->offsetReturn = collationSource->offsetStore - 1;
|
||||
*(collationSource->offsetBuffer) = firstOffset;
|
||||
if (collationSource->offsetReturn == collationSource->offsetBuffer) {
|
||||
collationSource->offsetStore = collationSource->offsetBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
}
|
||||
|
@ -3297,7 +3376,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
// it's easy for BMP code points
|
||||
if(isZeroCE == 0) {
|
||||
continue;
|
||||
} else if(U16_IS_TRAIL(schar) || U16_IS_LEAD(schar)) {
|
||||
} else if(U16_IS_SURROGATE(schar)) {
|
||||
// for supplementary code points, we have to check the next one
|
||||
// situations where we are going to ignore
|
||||
// 1. beginning of the string: schar is a lone surrogate
|
||||
|
@ -3306,9 +3385,9 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
// that is explicitly set to zero.
|
||||
if (!collIter_bos(source)) {
|
||||
UChar lead;
|
||||
if(U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
|
||||
if(!U16_IS_SURROGATE_LEAD(schar) && U16_IS_LEAD(lead = getPrevNormalizedChar(source, status))) {
|
||||
isZeroCE = UTRIE_GET32_FROM_LEAD(&coll->mapping, lead);
|
||||
if(getCETag(isZeroCE) == SURROGATE_TAG) {
|
||||
if(isSpecial(isZeroCE) && getCETag(isZeroCE) == SURROGATE_TAG) {
|
||||
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(&coll->mapping, isZeroCE&0xFFFFFF, schar);
|
||||
if(finalCE == 0) {
|
||||
// this is a real, assigned completely ignorable code point
|
||||
|
@ -3317,12 +3396,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
}
|
||||
}
|
||||
} else {
|
||||
// lone surrogate, completely ignorable
|
||||
continue;
|
||||
// lone surrogate, treat like unassigned
|
||||
return UCOL_NOT_FOUND;
|
||||
}
|
||||
} else {
|
||||
// lone surrogate at the beggining, completely ignorable
|
||||
continue;
|
||||
// lone surrogate at the beggining, treat like unassigned
|
||||
return UCOL_NOT_FOUND;
|
||||
}
|
||||
}
|
||||
// Source string char was not in the table.
|
||||
|
@ -3350,7 +3429,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
the forward iteration. this will ensure that the obstinate problem of
|
||||
overlapping contractions will not occur.
|
||||
*/
|
||||
schar = peekCharacter(source, 0);
|
||||
schar = peekCodeUnit(source, 0);
|
||||
constart = (UChar *)coll->image + getContractOffset(CE);
|
||||
if (isAtStartPrevIterate(source)
|
||||
/* commented away contraction end checks after adding the checks
|
||||
|
@ -3441,17 +3520,11 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
CECount = (int32_t)((source->CEpos - source->CEs)/sizeof(uint32_t));
|
||||
}
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
while (CE != UCOL_NO_MORE_CES) {
|
||||
*(source->CEpos ++) = CE;
|
||||
|
||||
if (offsetBias >= 0) {
|
||||
*(source->offsetStore ++) = rawOffset + offsetBias;
|
||||
source->appendOffset(rawOffset + offsetBias, *status);
|
||||
}
|
||||
|
||||
CECount++;
|
||||
|
@ -3462,38 +3535,12 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
this bail*/
|
||||
if (!increaseCEsCapacity(source)) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
if (strbuffer != buffer) {
|
||||
uprv_free(strbuffer);
|
||||
}
|
||||
|
||||
return (uint32_t)UCOL_NULLORDER;
|
||||
break;
|
||||
}
|
||||
|
||||
endCEBuffer = source->extendCEs + source->extendCEsSize;
|
||||
}
|
||||
|
||||
if (offsetBias >= 0 && source->offsetStore >= &source->offsetBuffer[source->offsetBufferSize]) {
|
||||
int32_t storeIX = (int32_t)(source->offsetStore - source->offsetBuffer);
|
||||
int32_t *tob = (int32_t *) uprv_realloc(source->offsetBuffer,
|
||||
sizeof(int32_t) * (source->offsetBufferSize + UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE));
|
||||
|
||||
if (tob != NULL) {
|
||||
source->offsetBuffer = tob;
|
||||
source->offsetStore = &source->offsetBuffer[storeIX];
|
||||
source->offsetBufferSize += UCOL_EXPAND_CE_BUFFER_EXTEND_SIZE;
|
||||
} else {
|
||||
// memory error...
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
source->CEpos = source->CEs;
|
||||
|
||||
if (strbuffer != buffer) {
|
||||
uprv_free(strbuffer);
|
||||
}
|
||||
|
||||
return (uint32_t) UCOL_NULLORDER;
|
||||
}
|
||||
}
|
||||
|
||||
if ((temp.flags & UCOL_ITER_INNORMBUF) != 0) {
|
||||
rawOffset = (int32_t)(temp.fcdPosition - temp.string);
|
||||
} else {
|
||||
|
@ -3503,6 +3550,13 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
}
|
||||
|
||||
if (strbuffer != buffer) {
|
||||
uprv_free(strbuffer);
|
||||
}
|
||||
if (U_FAILURE(*status)) {
|
||||
return (uint32_t)UCOL_NULLORDER;
|
||||
}
|
||||
|
||||
if (source->offsetRepeatValue != 0) {
|
||||
if (CECount > noChars) {
|
||||
source->offsetRepeatCount += temp.offsetRepeatCount;
|
||||
|
@ -3512,10 +3566,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
}
|
||||
}
|
||||
|
||||
if (strbuffer != buffer) {
|
||||
uprv_free(strbuffer);
|
||||
}
|
||||
|
||||
if (offsetBias >= 0) {
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
|
@ -3536,26 +3586,20 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
*(source->CEpos++) = ((CE & 0xFF)<<24)|UCOL_CONTINUATION_MARKER;
|
||||
source->toReturn = source->CEpos - 1;
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
if (source->flags & UCOL_ITER_INNORMBUF) {
|
||||
source->offsetRepeatCount = 1;
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
} else {
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
|
||||
*(source->offsetStore++) = firstOffset;
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
source->appendOffset(firstOffset, *status);
|
||||
source->appendOffset(firstOffset + 1, *status);
|
||||
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
*(source->offsetBuffer) = firstOffset;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
}
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
*(source->offsetBuffer) = firstOffset;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return *(source->toReturn);
|
||||
|
@ -3579,12 +3623,6 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
}
|
||||
}
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
/* find the offset to expansion table */
|
||||
CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
|
||||
size = getExpansionCount(CE);
|
||||
|
@ -3598,7 +3636,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
*(source->CEpos ++) = *CEOffset++;
|
||||
|
||||
if (firstOffset >= 0) {
|
||||
*(source->offsetStore ++) = firstOffset + 1;
|
||||
source->appendOffset(firstOffset + 1, *status);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -3607,7 +3645,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
*(source->CEpos ++) = *CEOffset ++;
|
||||
|
||||
if (firstOffset >= 0) {
|
||||
*(source->offsetStore ++) = firstOffset + 1;
|
||||
source->appendOffset(firstOffset + 1, *status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -3891,15 +3929,8 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
V += VBase;
|
||||
T += TBase;
|
||||
|
||||
if (source->offsetBuffer == NULL) {
|
||||
source->offsetBufferSize = UCOL_EXPAND_CE_BUFFER_SIZE;
|
||||
source->offsetBuffer = (int32_t *) uprv_malloc(sizeof(int32_t) * UCOL_EXPAND_CE_BUFFER_SIZE);
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
|
||||
*(source->offsetStore++) = firstOffset;
|
||||
int32_t firstOffset = (int32_t)(source->pos - source->string);
|
||||
source->appendOffset(firstOffset, *status);
|
||||
|
||||
/*
|
||||
* return the first CE, but first put the rest into the expansion buffer
|
||||
|
@ -3907,21 +3938,21 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
if (!source->coll->image->jamoSpecial) {
|
||||
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, L);
|
||||
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, V);
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
source->appendOffset(firstOffset + 1, *status);
|
||||
|
||||
if (T != TBase) {
|
||||
if (T != TBase) {
|
||||
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(&coll->mapping, T);
|
||||
*(source->offsetStore++) = firstOffset + 1;
|
||||
}
|
||||
source->appendOffset(firstOffset + 1, *status);
|
||||
}
|
||||
|
||||
source->toReturn = source->CEpos - 1;
|
||||
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
return *(source->toReturn);
|
||||
source->offsetReturn = source->offsetStore - 1;
|
||||
if (source->offsetReturn == source->offsetBuffer) {
|
||||
source->offsetStore = source->offsetBuffer;
|
||||
}
|
||||
|
||||
return *(source->toReturn);
|
||||
} else {
|
||||
// Since Hanguls pass the FCD check, it is
|
||||
// guaranteed that we won't be in
|
||||
|
@ -3971,13 +4002,13 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
return getPrevImplicit(ch, source);
|
||||
|
||||
case SURROGATE_TAG: /* This is a surrogate pair */
|
||||
/* essentialy an engaged lead surrogate. */
|
||||
/* essentially an engaged lead surrogate. */
|
||||
/* if you have encountered it here, it means that a */
|
||||
/* broken sequence was encountered and this is an error */
|
||||
return 0;
|
||||
return UCOL_NOT_FOUND;
|
||||
|
||||
case LEAD_SURROGATE_TAG: /* D800-DBFF*/
|
||||
return 0; /* broken surrogate sequence */
|
||||
return UCOL_NOT_FOUND; /* broken surrogate sequence */
|
||||
|
||||
case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
|
||||
{
|
||||
|
@ -3986,7 +4017,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
const UChar *prev;
|
||||
if (isAtStartPrevIterate(source)) {
|
||||
/* we are at the start of the string, wrong place to be at */
|
||||
return 0;
|
||||
return UCOL_NOT_FOUND;
|
||||
}
|
||||
if (source->pos != source->writableBuffer.getBuffer()) {
|
||||
prev = source->pos - 1;
|
||||
|
@ -4000,7 +4031,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
cp = ((((uint32_t)prevChar)<<10UL)+(ch)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
|
||||
source->pos = prev;
|
||||
} else {
|
||||
return 0; /* completely ignorable */
|
||||
return UCOL_NOT_FOUND; /* like unassigned */
|
||||
}
|
||||
|
||||
return getPrevImplicit(cp, source);
|
||||
|
@ -4241,6 +4272,14 @@ ucol_getSortKeyWithAllocation(const UCollator *coll,
|
|||
|
||||
#define UCOL_FSEC_BUF_SIZE 256
|
||||
|
||||
// Is this primary weight compressible?
|
||||
// Returns false for multi-lead-byte scripts (digits, Latin, Han, implicit).
|
||||
// TODO: This should use per-lead-byte flags from FractionalUCA.txt.
|
||||
static inline UBool
|
||||
isCompressible(const UCollator * /*coll*/, uint8_t primary1) {
|
||||
return UCOL_BYTE_FIRST_NON_LATIN_PRIMARY <= primary1 && primary1 <= maxRegularPrimary;
|
||||
}
|
||||
|
||||
/* This function tries to get the size of a sortkey. It will be invoked if the size of resulting buffer is 0 */
|
||||
/* or if we run out of space while making a sortkey and want to return ASAP */
|
||||
int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t currentSize, UColAttributeValue strength, int32_t len) {
|
||||
|
@ -4330,7 +4369,7 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
|
|||
} else {
|
||||
wasShifted = FALSE;
|
||||
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
|
||||
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
|
||||
/* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
|
||||
/* calculate sortkey size */
|
||||
if(primary1 != UCOL_IGNORABLE) {
|
||||
if(notIsContinuation) {
|
||||
|
@ -4344,19 +4383,13 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
|
|||
/* one byter, not compressed */
|
||||
currentSize++;
|
||||
leadPrimary = 0;
|
||||
}
|
||||
else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
|
||||
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
|
||||
//(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
|
||||
(primary1 > maxRegularPrimary && primary1 < minImplicitPrimary))
|
||||
{
|
||||
/* not compressible */
|
||||
leadPrimary = 0;
|
||||
currentSize+=2;
|
||||
}
|
||||
else { /* compress */
|
||||
} else if(isCompressible(coll, primary1)) {
|
||||
/* compress */
|
||||
leadPrimary = primary1;
|
||||
currentSize+=2;
|
||||
} else {
|
||||
leadPrimary = 0;
|
||||
currentSize+=2;
|
||||
}
|
||||
}
|
||||
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
|
||||
|
@ -4678,9 +4711,7 @@ ucol_calcSortKey(const UCollator *coll,
|
|||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
if(source == normSource.getBuffer()) {
|
||||
s.flags &= ~UCOL_ITER_NORM;
|
||||
}
|
||||
s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
|
||||
|
||||
if(resultLength == 0 || primaries == NULL) {
|
||||
return ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
|
||||
|
@ -4781,7 +4812,7 @@ ucol_calcSortKey(const UCollator *coll,
|
|||
} else {
|
||||
wasShifted = FALSE;
|
||||
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
|
||||
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
|
||||
/* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
|
||||
/* regular and simple sortkey calc */
|
||||
if(primary1 != UCOL_IGNORABLE) {
|
||||
if(notIsContinuation) {
|
||||
|
@ -4795,20 +4826,18 @@ ucol_calcSortKey(const UCollator *coll,
|
|||
/* one byter, not compressed */
|
||||
*primaries++ = primary1;
|
||||
leadPrimary = 0;
|
||||
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
|
||||
//(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
|
||||
(primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
|
||||
/* not compressible */
|
||||
leadPrimary = 0;
|
||||
*primaries++ = primary1;
|
||||
if(primaries <= primarySafeEnd) {
|
||||
*primaries++ = primary2;
|
||||
}
|
||||
} else { /* compress */
|
||||
} else if(isCompressible(coll, primary1)) {
|
||||
/* compress */
|
||||
*primaries++ = leadPrimary = primary1;
|
||||
if(primaries <= primarySafeEnd) {
|
||||
*primaries++ = primary2;
|
||||
}
|
||||
} else {
|
||||
leadPrimary = 0;
|
||||
*primaries++ = primary1;
|
||||
if(primaries <= primarySafeEnd) {
|
||||
*primaries++ = primary2;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
|
||||
|
@ -4957,9 +4986,7 @@ ucol_calcSortKey(const UCollator *coll,
|
|||
finished = TRUE;
|
||||
break;
|
||||
}
|
||||
if(source == normSource.getBuffer()) {
|
||||
s.flags &= ~UCOL_ITER_NORM;
|
||||
}
|
||||
s.flags &= ~UCOL_ITER_NORM;
|
||||
sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, strength, len);
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
finished = TRUE;
|
||||
|
@ -5283,9 +5310,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
|
|||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
if(source == normSource.getBuffer()) {
|
||||
s.flags &= ~UCOL_ITER_NORM;
|
||||
}
|
||||
s.flags &= ~UCOL_ITER_NORM; // source passed the FCD test or else was normalized.
|
||||
|
||||
if(resultLength == 0 || primaries == NULL) {
|
||||
return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
|
||||
|
@ -5346,7 +5371,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
|
|||
primary1 = (uint8_t)(order >> 8);
|
||||
|
||||
/* Note: This code assumes that the table is well built i.e. not having 0 bytes where they are not supposed to be. */
|
||||
/* Usually, we'll have non-zero primary1 & primary2, except in cases of LatinOne and friends, when primary2 will */
|
||||
/* Usually, we'll have non-zero primary1 & primary2, except in cases of a-z and friends, when primary2 will */
|
||||
/* be zero with non zero primary1. primary3 is different than 0 only for long primaries - see above. */
|
||||
/* regular and simple sortkey calc */
|
||||
if(primary1 != UCOL_IGNORABLE) {
|
||||
|
@ -5361,17 +5386,14 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
|
|||
/* one byter, not compressed */
|
||||
*primaries++ = primary1;
|
||||
leadPrimary = 0;
|
||||
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
|
||||
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24)))
|
||||
//(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
|
||||
(primary1 > maxRegularPrimary && primary1 < minImplicitPrimary)) {
|
||||
/* not compressible */
|
||||
leadPrimary = 0;
|
||||
*primaries++ = primary1;
|
||||
*primaries++ = primary2;
|
||||
} else { /* compress */
|
||||
} else if(isCompressible(coll, primary1)) {
|
||||
/* compress */
|
||||
*primaries++ = leadPrimary = primary1;
|
||||
*primaries++ = primary2;
|
||||
} else {
|
||||
leadPrimary = 0;
|
||||
*primaries++ = primary1;
|
||||
*primaries++ = primary2;
|
||||
}
|
||||
}
|
||||
} else { /* we are in continuation, so we're gonna add primary to the key don't care about compression */
|
||||
|
@ -5450,9 +5472,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
|
|||
finished = TRUE;
|
||||
break;
|
||||
}
|
||||
if(source == normSource.getBuffer()) {
|
||||
s.flags &= ~UCOL_ITER_NORM;
|
||||
}
|
||||
s.flags &= ~UCOL_ITER_NORM;
|
||||
sortKeySize = ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
|
||||
*status = U_BUFFER_OVERFLOW_ERROR;
|
||||
finished = TRUE;
|
||||
|
@ -7583,7 +7603,7 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
|
|||
tCE = tCEs.pos-2;
|
||||
for(;;) {
|
||||
while (secS == 0 && sCE >= sCEs.buf) {
|
||||
if(sCESave == 0) {
|
||||
if(sCESave == NULL) {
|
||||
secS = *(sCE--);
|
||||
if(isContinuation(secS)) {
|
||||
while(isContinuation(secS = *(sCE--)))
|
||||
|
@ -7597,7 +7617,8 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
|
|||
secS = *(sCE++);
|
||||
if(!isContinuation(secS)) { /* This means we have finished with this cont */
|
||||
sCE = sCESave; /* reset the pointer to before continuation */
|
||||
sCESave = 0;
|
||||
sCESave = NULL;
|
||||
secS = 0; /* Fetch a fresh CE before the continuation sequence. */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -7605,7 +7626,7 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
|
|||
}
|
||||
|
||||
while(secT == 0 && tCE >= tCEs.buf) {
|
||||
if(tCESave == 0) {
|
||||
if(tCESave == NULL) {
|
||||
secT = *(tCE--);
|
||||
if(isContinuation(secT)) {
|
||||
while(isContinuation(secT = *(tCE--)))
|
||||
|
@ -7619,7 +7640,8 @@ ucol_strcollRegular(collIterate *sColl, collIterate *tColl, UErrorCode *status)
|
|||
secT = *(tCE++);
|
||||
if(!isContinuation(secT)) { /* This means we have finished with this cont */
|
||||
tCE = tCESave; /* reset the pointer to before continuation */
|
||||
tCESave = 0;
|
||||
tCESave = NULL;
|
||||
secT = 0; /* Fetch a fresh CE before the continuation sequence. */
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -512,8 +512,10 @@ static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t
|
|||
}
|
||||
}
|
||||
|
||||
if(low == 0) {
|
||||
low = 0x01000000;
|
||||
if(low < 0x02000000) {
|
||||
// We must not use CE weight byte 02, so we set it as the minimum lower bound.
|
||||
// See http://site.icu-project.org/design/collation/bytes
|
||||
low = 0x02000000;
|
||||
}
|
||||
|
||||
if(strength == UCOL_SECONDARY) { /* similar as simple */
|
||||
|
@ -761,7 +763,7 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro
|
|||
fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
|
||||
fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
|
||||
}
|
||||
tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
|
||||
tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
|
||||
|
||||
do {
|
||||
fprintf(stderr,"%i", tok->strength);
|
||||
|
@ -769,7 +771,7 @@ U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErro
|
|||
} while(tok != NULL);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
|
||||
tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
|
||||
|
||||
do {
|
||||
fprintf(stderr,"%i", tok->toInsert);
|
||||
|
|
|
@ -1402,12 +1402,13 @@ U_CDECL_END
|
|||
#ifdef UCOL_DEBUG
|
||||
// This is a debug function to print the contents of a trie.
|
||||
// It is used in conjuction with the code around utrie_unserialize call
|
||||
void enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
|
||||
UBool enumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
|
||||
if(start<0x10000) {
|
||||
fprintf(stdout, "%08X, %08X, %08X\n", start, limit, value);
|
||||
} else {
|
||||
fprintf(stdout, "%08X=%04X %04X, %08X=%04X %04X, %08X\n", start, UTF16_LEAD(start), UTF16_TRAIL(start), limit, UTF16_LEAD(limit), UTF16_TRAIL(limit), value);
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int32_t
|
||||
|
@ -1541,7 +1542,7 @@ uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
|
|||
if(U_SUCCESS(*status)) {
|
||||
utrie_enum(&UCAt, NULL, enumRange, NULL);
|
||||
}
|
||||
trieWord = UTRIE_GET32_FROM_LEAD(UCAt, 0xDC01)
|
||||
trieWord = UTRIE_GET32_FROM_LEAD(&UCAt, 0xDC01);
|
||||
}
|
||||
#endif
|
||||
tableOffset += paddedsize(mappingSize);
|
||||
|
|
|
@ -300,6 +300,10 @@ typedef struct collIterate : public U_NAMESPACE_QUALIFIER UMemory {
|
|||
|
||||
UCharIterator *iterator;
|
||||
/*int32_t iteratorIndex;*/
|
||||
|
||||
// The offsetBuffer should probably be a UVector32, but helper functions
|
||||
// are an improvement over duplicated code.
|
||||
void appendOffset(int32_t offset, UErrorCode &errorCode);
|
||||
} collIterate;
|
||||
|
||||
#else
|
||||
|
@ -630,7 +634,8 @@ ucol_setReqValidLocales(UCollator *coll, char *requestedLocaleToAdopt, char *val
|
|||
#define getExpansionSuffix(coleiter) ((coleiter)->iteratordata_.CEpos - (coleiter)->iteratordata_.toReturn)
|
||||
#define setExpansionSuffix(coleiter, offset) ((coleiter)->iteratordata_.toReturn = (coleiter)->iteratordata_.CEpos - leftoverces)
|
||||
|
||||
/* This is an enum that lists magic special byte values from the fractional UCA */
|
||||
/* This is an enum that lists magic special byte values from the fractional UCA.
|
||||
* See also http://site.icu-project.org/design/collation/bytes */
|
||||
/* TODO: all the #defines that refer to special byte values from the UCA should be changed to point here */
|
||||
|
||||
enum {
|
||||
|
@ -642,9 +647,9 @@ enum {
|
|||
UCOL_BYTE_FIRST_TAILORED = 0x04,
|
||||
UCOL_BYTE_COMMON = 0x05,
|
||||
UCOL_BYTE_FIRST_UCA = UCOL_BYTE_COMMON,
|
||||
UCOL_CODAN_PLACEHOLDER = 0x27,
|
||||
UCOL_BYTE_LAST_LATIN_PRIMARY = 0x4C,
|
||||
UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x4D,
|
||||
/* TODO: Make the following values dynamic since they change with almost every UCA version. */
|
||||
UCOL_CODAN_PLACEHOLDER = 0x12,
|
||||
UCOL_BYTE_FIRST_NON_LATIN_PRIMARY = 0x5B,
|
||||
UCOL_BYTE_UNSHIFTED_MAX = 0xFF
|
||||
};
|
||||
|
||||
|
|
|
@ -397,9 +397,9 @@ ucol_openRules( const UChar *rules,
|
|||
/* so something might be done here... or on lower level */
|
||||
#ifdef UCOL_DEBUG
|
||||
if(*status == U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
fprintf(stderr, "bad option starting at offset %i\n", src.current-src.source);
|
||||
fprintf(stderr, "bad option starting at offset %i\n", (int)(src.current-src.source));
|
||||
} else {
|
||||
fprintf(stderr, "invalid rule just before offset %i\n", src.current-src.source);
|
||||
fprintf(stderr, "invalid rule just before offset %i\n", (int)(src.current-src.source));
|
||||
}
|
||||
#endif
|
||||
goto cleanup;
|
||||
|
|
|
@ -508,7 +508,7 @@ ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef UCOL_DEBUG
|
||||
#if 0 // #ifdef UCOL_DEBUG
|
||||
|
||||
static void
|
||||
testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
|
||||
|
|
|
@ -1947,7 +1947,7 @@ static void TestShortString(void)
|
|||
uint32_t expectedIdentifier;
|
||||
} testCases[] = {
|
||||
/*
|
||||
* The following expectedOutput contains a collation weight (2D00 from UCA 5.2)
|
||||
* The following expectedOutput contains a collation weight (2700 from UCA 6.0)
|
||||
* which is the primary weight for the T character (U+0041) in the input.
|
||||
* When that character gets a different weight in FractionalUCA.txt,
|
||||
* the expectedOutput needs to be adjusted.
|
||||
|
@ -1955,7 +1955,7 @@ static void TestShortString(void)
|
|||
* in such a way that the absolute weight for 'A' changes,
|
||||
* we will get a test failure here and need to adjust the test case.
|
||||
*/
|
||||
{"LDE_RDE_KPHONEBOOK_T0041_ZLATN","B2D00_KPHONEBOOK_LDE", "de@collation=phonebook", U_USING_FALLBACK_WARNING, 0, 0 },
|
||||
{"LDE_RDE_KPHONEBOOK_T0041_ZLATN","B2700_KPHONEBOOK_LDE", "de@collation=phonebook", U_USING_FALLBACK_WARNING, 0, 0 },
|
||||
|
||||
{"LEN_RUS_NO_AS_S4","AS_LROOT_NO_S4", NULL, U_USING_DEFAULT_WARNING, 0, 0 },
|
||||
{"LDE_VPHONEBOOK_EO_SI","EO_KPHONEBOOK_LDE_SI", "de@collation=phonebook", U_ZERO_ERROR, 0, 0 },
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
#include "cstring.h"
|
||||
#include "ucol_imp.h"
|
||||
#include "ucol_tok.h"
|
||||
#include "uparse.h"
|
||||
#include <stdio.h>
|
||||
|
||||
extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *);
|
||||
|
@ -1028,41 +1029,40 @@ static int32_t hex2num(char hex) {
|
|||
* @param codepoints array for storage, assuming size > 5
|
||||
* @return position at the end of the codepoint section
|
||||
*/
|
||||
static char *getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) {
    /*
     * Parses the code point field at the start of a FractionalUCA.txt line.
     * The field ends at the first ';' and may have the form
     * "precontext | string".
     *
     * On success, codepoints receives the precontext code units (if any)
     * followed by the code units of the string, contextCPs receives the
     * precontext alone (left empty when there is none), and the return value
     * points just past the ';'.
     * On any failure an error is logged and str itself is returned.
     * Termination of the output arrays is left to u_parseString()
     * (presumably it NUL-terminates when capacity allows -- confirm).
     *
     * NOTE(review): both u_parseString() calls pass a capacity of 99 UChars,
     * while callers in this file declare buffers such as UChar codepoints[10]
     * and UChar contextCPs[5]. The real capacities cannot be known here
     * without changing the signature; confirm that no input line can exceed
     * the callers' buffers.
     */
    UErrorCode errorCode = U_ZERO_ERROR;
    char *semi = uprv_strchr(str, ';');
    char *pipe = uprv_strchr(str, '|');
    char *s;

    /* Start with empty outputs so that early returns leave defined contents. */
    *codepoints = 0;
    *contextCPs = 0;

    if(semi == NULL) {
        log_err("expected semicolon after code point string in FractionalUCA.txt %s\n", str);
        return str;
    }

    /*
     * Only a '|' inside the code point field (before the ';') marks a
     * precontext. A '|' later on the line must not be treated as one.
     */
    if(pipe != NULL && pipe < semi) {
        int32_t contextLength;
        /* Temporarily cut the line at the '|' so that only the precontext is parsed. */
        *pipe = 0;
        contextLength = u_parseString(str, contextCPs, 99, NULL, &errorCode);
        *pipe = '|';
        if(U_FAILURE(errorCode)) {
            log_err("error parsing precontext string from FractionalUCA.txt %s\n", str);
            return str;
        }
        /* prepend the precontext string to the codepoints */
        u_memcpy(codepoints, contextCPs, contextLength);
        codepoints += contextLength;
        /* start of the code point string */
        s = pipe + 1;
    } else {
        s = str;
    }

    u_parseString(s, codepoints, 99, NULL, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error parsing code point string from FractionalUCA.txt %s\n", str);
        return str;
    }
    return semi + 1;
}
|
||||
|
||||
/**
|
||||
|
@ -1262,7 +1262,7 @@ static FileStream * getFractionalUCA(void)
|
|||
*/
|
||||
static void TestCEs() {
|
||||
FileStream *file = NULL;
|
||||
char line[1024];
|
||||
char line[2048];
|
||||
char *str;
|
||||
UChar codepoints[10];
|
||||
uint32_t ces[20];
|
||||
|
@ -1525,7 +1525,6 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
|
|||
UBool result = FALSE;
|
||||
UBool primaryDone = FALSE, secondaryDone = FALSE, tertiaryDone = FALSE;
|
||||
const char * collLocale;
|
||||
char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5];
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
log_err("Error creating iterator for testing validity\n");
|
||||
|
@ -1547,6 +1546,22 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
|
|||
if (ce == 0) {
|
||||
continue;
|
||||
}
|
||||
if (ce == 0x02000202) {
|
||||
/* special CE for merge-sort character */
|
||||
if (*codepoints == 0xFFFE /* && length == 1 */) {
|
||||
/*
|
||||
* Note: We should check for length==1 but the token parser appears
|
||||
* to give us trailing NUL characters.
|
||||
* TODO: Ticket #8047: Change TestCEValidity to use ucol_getTailoredSet()
|
||||
* rather than the internal collation rule parser
|
||||
*/
|
||||
continue;
|
||||
} else {
|
||||
log_err("Special 02/02/02 weight for code point U+%04X [len %d] != U+FFFE\n",
|
||||
(int)*codepoints, (int)length);
|
||||
break;
|
||||
}
|
||||
}
|
||||
primary = UCOL_PRIMARYORDER(ce);
|
||||
p1 = primary >> 8;
|
||||
p2 = primary & 0xFF;
|
||||
|
@ -1603,8 +1618,7 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
|
|||
break;
|
||||
}
|
||||
if (tertiary <= 2) {
|
||||
showCodepoints(codepoints, length, codepointText);
|
||||
log_err("Tertiary byte of %08lX out of range: locale %s, codepoints %s\n", (long)ce, collLocale, codepointText);
|
||||
log_err("Tertiary byte of %08lX out of range\n", (long)ce);
|
||||
break;
|
||||
}
|
||||
tertiaryDone = FALSE;
|
||||
|
@ -1656,14 +1670,18 @@ static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints,
|
|||
if (tertiary == 0) {
|
||||
tertiaryDone = TRUE;
|
||||
} else if (tertiary <= 2) {
|
||||
showCodepoints(codepoints, length, codepointText);
|
||||
log_err("Tertiary byte of %08lX out of range: locale %s, codepoints %s\n", (long)ce, collLocale, codepointText);
|
||||
log_err("Tertiary byte of %08lX out of range\n", (long)ce);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
ucol_closeElements(iter);
|
||||
return result;
|
||||
}
|
||||
if (!result) {
|
||||
char codepointText[5*MAX_CODEPOINTS_TO_SHOW + 5];
|
||||
showCodepoints(codepoints, length, codepointText);
|
||||
log_err("Locale: %s Code point string: %s\n", collLocale, codepointText);
|
||||
}
|
||||
ucol_closeElements(iter);
|
||||
return result;
|
||||
}
|
||||
|
||||
static void TestCEValidity()
|
||||
|
@ -1676,7 +1694,7 @@ static void TestCEValidity()
|
|||
char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"};
|
||||
const char *loc;
|
||||
FileStream *file = NULL;
|
||||
char line[1024];
|
||||
char line[2048];
|
||||
UChar codepoints[10];
|
||||
int count = 0;
|
||||
int maxCount = 0;
|
||||
|
@ -1883,7 +1901,7 @@ static void TestSortKeyValidity(void)
|
|||
/* tailored locales */
|
||||
char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"};
|
||||
FileStream *file = NULL;
|
||||
char line[1024];
|
||||
char line[2048];
|
||||
UChar codepoints[10];
|
||||
int count = 0;
|
||||
UChar contextCPs[5];
|
||||
|
@ -1906,6 +1924,10 @@ static void TestSortKeyValidity(void)
|
|||
}
|
||||
|
||||
getCodePoints(line, codepoints, contextCPs);
|
||||
if(codepoints[0] == 0xFFFE) {
|
||||
/* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
|
||||
continue;
|
||||
}
|
||||
checkSortKeyValidity(coll, codepoints, u_strlen(codepoints));
|
||||
}
|
||||
|
||||
|
@ -1976,6 +1998,10 @@ static void TestSortKeyValidity(void)
|
|||
uprv_memcpy(codepoints, src.source + chOffset,
|
||||
chLen * sizeof(UChar));
|
||||
codepoints[chLen] = 0;
|
||||
if(codepoints[0] == 0xFFFE) {
|
||||
/* Skip special merge-sort character U+FFFE which has otherwise illegal 02 weight bytes. */
|
||||
continue;
|
||||
}
|
||||
checkSortKeyValidity(coll, codepoints, chLen);
|
||||
}
|
||||
free(rulesCopy);
|
||||
|
|
|
@ -3102,8 +3102,15 @@ static void TestVariableTopSetting(void) {
|
|||
varTop1 = ucol_setVariableTop(coll, conts, 3, &status);
|
||||
}
|
||||
if(U_FAILURE(status)) {
|
||||
log_err("Couldn't set variable top to a contraction %04X %04X %04X\n",
|
||||
*conts, *(conts+1), *(conts+2));
|
||||
if(status == U_PRIMARY_TOO_LONG_ERROR) {
|
||||
/* ucol_setVariableTop() is documented to not accept 3-byte primaries,
|
||||
* therefore it is not an error when it complains about them. */
|
||||
log_verbose("Couldn't set variable top to a contraction %04X %04X %04X - U_PRIMARY_TOO_LONG_ERROR\n",
|
||||
*conts, *(conts+1), *(conts+2));
|
||||
} else {
|
||||
log_err("Couldn't set variable top to a contraction %04X %04X %04X - %s\n",
|
||||
*conts, *(conts+1), *(conts+2), u_errorName(status));
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
}
|
||||
conts+=3;
|
||||
|
@ -3153,10 +3160,11 @@ static void TestVariableTopSetting(void) {
|
|||
|
||||
static void TestNonChars(void) {
|
||||
static const char *test[] = {
|
||||
"\\u0000",
|
||||
"\\uFFFE", "\\uFFFF",
|
||||
"\\U0001FFFE", "\\U0001FFFF",
|
||||
"\\U0002FFFE", "\\U0002FFFF",
|
||||
"\\u0000", /* ignorable */
|
||||
"\\uFFFE", /* special merge-sort character with minimum non-ignorable weights */
|
||||
"\\uFDD0", "\\uFDEF",
|
||||
"\\U0001FFFE", "\\U0001FFFF", /* UCA 6.0: noncharacters are treated like unassigned, */
|
||||
"\\U0002FFFE", "\\U0002FFFF", /* not like ignorable. */
|
||||
"\\U0003FFFE", "\\U0003FFFF",
|
||||
"\\U0004FFFE", "\\U0004FFFF",
|
||||
"\\U0005FFFE", "\\U0005FFFF",
|
||||
|
@ -3170,7 +3178,8 @@ static void TestNonChars(void) {
|
|||
"\\U000DFFFE", "\\U000DFFFF",
|
||||
"\\U000EFFFE", "\\U000EFFFF",
|
||||
"\\U000FFFFE", "\\U000FFFFF",
|
||||
"\\U0010FFFE", "\\U0010FFFF"
|
||||
"\\U0010FFFE", "\\U0010FFFF",
|
||||
"\\uFFFF" /* special character with maximum primary weight */
|
||||
};
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCollator *coll = ucol_open("en_US", &status);
|
||||
|
@ -3178,7 +3187,7 @@ static void TestNonChars(void) {
|
|||
log_verbose("Test non characters\n");
|
||||
|
||||
if(U_SUCCESS(status)) {
|
||||
genericOrderingTestWithResult(coll, test, 35, UCOL_EQUAL);
|
||||
genericOrderingTestWithResult(coll, test, 35, UCOL_LESS);
|
||||
} else {
|
||||
log_err_status(status, "Unable to open collator\n");
|
||||
}
|
||||
|
@ -3634,13 +3643,31 @@ static void TestRuleOptions(void) {
|
|||
{ "c", "b", "\\u0009", "a", "\\u000a" }, 5
|
||||
},
|
||||
|
||||
/*
|
||||
* These strings contain the last character before [variable top]
|
||||
* and the first and second characters (by primary weights) after it.
|
||||
* See FractionalUCA.txt. For example:
|
||||
[last variable [0C FE, 05, 05]] # U+10A7F OLD SOUTH ARABIAN NUMERIC INDICATOR
|
||||
[variable top = 0C FE]
|
||||
[first regular [0D 0A, 05, 05]] # U+0060 GRAVE ACCENT
|
||||
and
|
||||
00B4; [0D 0C, 05, 05]
|
||||
*
|
||||
* Note: Starting with UCA 6.0, the [variable top] collation element
|
||||
* is not the weight of any character or string,
|
||||
* which means that LAST_VARIABLE_CHAR_STRING sorts before [last variable].
|
||||
*/
|
||||
#define LAST_VARIABLE_CHAR_STRING "\\U00010A7F"
|
||||
#define FIRST_REGULAR_CHAR_STRING "\\u0060"
|
||||
#define SECOND_REGULAR_CHAR_STRING "\\u00B4"
|
||||
|
||||
{ "&[last variable]<a &[before 3][last variable]<<<c<<<b ",
|
||||
{ "c", "b", "\\uD834\\uDF71", "a", "\\u02d0" }, 5
|
||||
{ LAST_VARIABLE_CHAR_STRING, "c", "b", /* [last variable] */ "a", FIRST_REGULAR_CHAR_STRING }, 5
|
||||
},
|
||||
|
||||
{ "&[first regular]<a"
|
||||
"&[before 1][first regular]<b",
|
||||
{ "b", "\\u02d0", "a", "\\u02d1"}, 4
|
||||
{ "b", FIRST_REGULAR_CHAR_STRING, "a", SECOND_REGULAR_CHAR_STRING }, 4
|
||||
},
|
||||
|
||||
/*
|
||||
|
@ -3648,11 +3675,17 @@ static void TestRuleOptions(void) {
|
|||
* has to match the character that has the [last regular] weight
|
||||
* which changes with each UCA version.
|
||||
* See the bottom of FractionalUCA.txt which says something like
|
||||
* [last regular [CE 27, 05, 05]] # U+1342E EGYPTIAN HIEROGLYPH AA032
|
||||
[last regular [7A FE, 05, 05]] # U+1342E EGYPTIAN HIEROGLYPH AA032
|
||||
*
|
||||
* Note: Starting with UCA 6.0, the [last regular] collation element
|
||||
* is not the weight of any character or string,
|
||||
* which means that LAST_REGULAR_CHAR_STRING sorts before [last regular].
|
||||
*/
|
||||
#define LAST_REGULAR_CHAR_STRING "\\U0001342E"
|
||||
|
||||
{ "&[before 1][last regular]<b"
|
||||
"&[last regular]<a",
|
||||
{ "b", "\\U0001342E", "a", "\\u4e00" }, 4
|
||||
{ LAST_REGULAR_CHAR_STRING, "b", /* [last regular] */ "a", "\\u4e00" }, 4
|
||||
},
|
||||
|
||||
{ "&[before 1][first implicit]<b"
|
||||
|
@ -3670,7 +3703,7 @@ static void TestRuleOptions(void) {
|
|||
"&[last secondary ignorable]<<y"
|
||||
"&[last tertiary ignorable]<<<w"
|
||||
"&[top]<u",
|
||||
{"\\ufffb", "w", "y", "\\u20e3", "x", "\\u137c", "z", "u"}, 7
|
||||
{"\\ufffb", "w", "y", "\\u20e3", "x", LAST_VARIABLE_CHAR_STRING, "z", "u"}, 7
|
||||
}
|
||||
|
||||
};
|
||||
|
|
|
@ -1284,23 +1284,23 @@ void CollationAPITest::TestMaxExpansion()
|
|||
|
||||
size = coll.getMaxExpansion(order);
|
||||
if (U_FAILURE(status) || size < count) {
|
||||
errln("Failure at codepoint %d, maximum expansion count < %d\n",
|
||||
ch, count);
|
||||
errln("Failure at codepoint U+%04X, maximum expansion count %d < %d",
|
||||
ch, size, count);
|
||||
}
|
||||
}
|
||||
|
||||
/* testing for exact max expansion */
|
||||
int32_t size;
|
||||
ch = 0;
|
||||
while (ch < 0x61) {
|
||||
uint32_t order;
|
||||
int32_t size;
|
||||
str.setCharAt(0, ch);
|
||||
iter->setText(str, status);
|
||||
order = iter->previous(status);
|
||||
size = coll.getMaxExpansion(order);
|
||||
if (U_FAILURE(status) || size != 1) {
|
||||
errln("Failure at codepoint %d, maximum expansion count < %d\n",
|
||||
ch, 1);
|
||||
errln("Failure at codepoint U+%04X, maximum expansion count %d < %d",
|
||||
ch, size, 1);
|
||||
}
|
||||
ch ++;
|
||||
}
|
||||
|
@ -1309,29 +1309,29 @@ void CollationAPITest::TestMaxExpansion()
|
|||
str.setTo(ch);
|
||||
iter->setText(str, status);
|
||||
temporder = iter->previous(status);
|
||||
|
||||
if (U_FAILURE(status) || coll.getMaxExpansion(temporder) != 3) {
|
||||
errln("Failure at codepoint %d, maximum expansion count != %d\n",
|
||||
ch, 3);
|
||||
size = coll.getMaxExpansion(temporder);
|
||||
if (U_FAILURE(status) || size != 3) {
|
||||
errln("Failure at codepoint U+%04X, CE %08x, maximum expansion count %d != %d",
|
||||
ch, temporder, size, 3);
|
||||
}
|
||||
|
||||
ch = 0x64;
|
||||
str.setTo(ch);
|
||||
iter->setText(str, status);
|
||||
temporder = iter->previous(status);
|
||||
|
||||
if (U_FAILURE(status) || coll.getMaxExpansion(temporder) != 1) {
|
||||
errln("Failure at codepoint %d, maximum expansion count != %d\n",
|
||||
ch, 3);
|
||||
size = coll.getMaxExpansion(temporder);
|
||||
if (U_FAILURE(status) || size != 1) {
|
||||
errln("Failure at codepoint U+%04X, CE %08x, maximum expansion count %d != %d",
|
||||
ch, temporder, size, 1);
|
||||
}
|
||||
|
||||
str.setTo(unassigned);
|
||||
iter->setText(str, status);
|
||||
sorder = iter->previous(status);
|
||||
|
||||
if (U_FAILURE(status) || coll.getMaxExpansion(sorder) != 2) {
|
||||
errln("Failure at supplementary codepoints, maximum expansion count < %d\n",
|
||||
2);
|
||||
size = coll.getMaxExpansion(sorder);
|
||||
if (U_FAILURE(status) || size != 2) {
|
||||
errln("Failure at supplementary codepoints, maximum expansion count %d < %d",
|
||||
size, 2);
|
||||
}
|
||||
|
||||
/* testing jamo */
|
||||
|
@ -1339,9 +1339,10 @@ void CollationAPITest::TestMaxExpansion()
|
|||
str.setTo(ch);
|
||||
iter->setText(str, status);
|
||||
temporder = iter->previous(status);
|
||||
if (U_FAILURE(status) || coll.getMaxExpansion(temporder) > 3) {
|
||||
errln("Failure at codepoint %d, maximum expansion count > %d\n",
|
||||
ch, 3);
|
||||
size = coll.getMaxExpansion(temporder);
|
||||
if (U_FAILURE(status) || size > 3) {
|
||||
errln("Failure at codepoint U+%04X, maximum expansion count %d > %d",
|
||||
ch, size, 3);
|
||||
}
|
||||
|
||||
delete iter;
|
||||
|
@ -1352,9 +1353,10 @@ void CollationAPITest::TestMaxExpansion()
|
|||
RuleBasedCollator jamocoll(rule, status);
|
||||
iter = jamocoll.createCollationElementIterator(str);
|
||||
temporder = iter->previous(status);
|
||||
if (U_FAILURE(status) || iter->getMaxExpansion(temporder) != 6) {
|
||||
errln("Failure at codepoint %d, maximum expansion count > %d\n",
|
||||
ch, 5);
|
||||
size = iter->getMaxExpansion(temporder);
|
||||
if (U_FAILURE(status) || size != 6) {
|
||||
errln("Failure at codepoint U+%04X, maximum expansion count %d > %d",
|
||||
ch, size, 5);
|
||||
}
|
||||
|
||||
delete iter;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -196,11 +196,12 @@ DataDrivenCollatorTest::processTest(TestData *testData) {
|
|||
if(U_SUCCESS(status)) {
|
||||
logln("Testing collator for rules "+testSetting);
|
||||
} else {
|
||||
errln("Unable to instantiate collator for rules "+testSetting);
|
||||
errln("Unable to instantiate collator for rules "+testSetting+" - "+u_errorName(status));
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
errln("No collator definition!");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -74,6 +74,7 @@ SSearchTest::~SSearchTest()
|
|||
|
||||
void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
|
||||
{
|
||||
static const UVersionInfo icu47 = { 4, 7, 0, 0 };
|
||||
if (exec) logln("TestSuite SSearchTest: ");
|
||||
switch (index) {
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
@ -82,7 +83,7 @@ void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
break;
|
||||
|
||||
case 1: name = "offsetTest";
|
||||
if (exec) offsetTest();
|
||||
if (exec && isICUVersionAtLeast(icu47)) offsetTest();
|
||||
break;
|
||||
|
||||
case 2: name = "monkeyTest";
|
||||
|
@ -90,7 +91,7 @@ void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
|||
break;
|
||||
|
||||
case 3: name = "bmMonkeyTest";
|
||||
if (exec) bmMonkeyTest(params);
|
||||
if (exec && isICUVersionAtLeast(icu47)) bmMonkeyTest(params);
|
||||
break;
|
||||
|
||||
case 4: name = "boyerMooreTest";
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1999-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -860,34 +860,34 @@ public:
|
|||
virtual void run() {
|
||||
//sleep(10000);
|
||||
int32_t line = 0;
|
||||
|
||||
|
||||
uint8_t sk1[1024], sk2[1024];
|
||||
uint8_t *oldSk = NULL, *newSk = sk1;
|
||||
int32_t resLen = 0, oldLen = 0;
|
||||
int32_t i = 0;
|
||||
|
||||
|
||||
for(i = 0; i < noLines; i++) {
|
||||
resLen = ucol_getSortKey(coll, lines[i].buff, lines[i].buflen, newSk, 1024);
|
||||
|
||||
|
||||
int32_t res = 0, cmpres = 0, cmpres2 = 0;
|
||||
|
||||
|
||||
if(oldSk != NULL) {
|
||||
res = strcmp((char *)oldSk, (char *)newSk);
|
||||
cmpres = ucol_strcoll(coll, lines[i-1].buff, lines[i-1].buflen, lines[i].buff, lines[i].buflen);
|
||||
cmpres2 = ucol_strcoll(coll, lines[i].buff, lines[i].buflen, lines[i-1].buff, lines[i-1].buflen);
|
||||
//cmpres = res;
|
||||
//cmpres2 = -cmpres;
|
||||
|
||||
|
||||
if(cmpres != -cmpres2) {
|
||||
error("Compare result not symmetrical on line "+ line);
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
if(((res&0x80000000) != (cmpres&0x80000000)) || (res == 0 && cmpres != 0) || (res != 0 && cmpres == 0)) {
|
||||
error(UnicodeString("Difference between ucol_strcoll and sortkey compare on line ")+ UnicodeString(line));
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
if(res > 0) {
|
||||
error(UnicodeString("Line %i is not greater or equal than previous line ")+ UnicodeString(i));
|
||||
break;
|
||||
|
@ -896,20 +896,24 @@ public:
|
|||
if (res == 0) {
|
||||
error(UnicodeString("Probable error in test file on line %i (comparing identical strings)")+ UnicodeString(i));
|
||||
break;
|
||||
} else if (res > 0) {
|
||||
error(UnicodeString("Sortkeys are identical, but code point comapare gives >0 on line ")+ UnicodeString(i));
|
||||
}
|
||||
/*
|
||||
* UCA 6.0 test files can have lines that compare == if they are
|
||||
* different strings but canonically equivalent.
|
||||
else if (res > 0) {
|
||||
error(UnicodeString("Sortkeys are identical, but code point compare gives >0 on line ")+ UnicodeString(i));
|
||||
break;
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
oldSk = newSk;
|
||||
oldLen = resLen;
|
||||
|
||||
|
||||
newSk = (newSk == sk1)?sk2:sk1;
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
void MultithreadTest::TestCollators()
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2002-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 2002-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -223,11 +223,16 @@ void UCAConformanceTest::testConformance(UCollator *coll)
|
|||
if (res == 0) {
|
||||
errln("Probable error in test file on line %i (comparing identical strings)", line);
|
||||
errln(" Data line %s", lineB);
|
||||
} else if (res > 0) {
|
||||
errln("Sortkeys are identical, but code point comapare gives >0 on line %i", line);
|
||||
}
|
||||
/*
|
||||
* UCA 6.0 test files can have lines that compare == if they are
|
||||
* different strings but canonically equivalent.
|
||||
else if (res > 0) {
|
||||
errln("Sortkeys are identical, but code point compare gives >0 on line %i", line);
|
||||
errln(" Previous data line %s", oldLineB);
|
||||
errln(" Current data line %s", lineB);
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load diff
55476
icu4c/source/test/testdata/CollationTest_SHIFTED_SHORT.txt
vendored
55476
icu4c/source/test/testdata/CollationTest_SHIFTED_SHORT.txt
vendored
File diff suppressed because it is too large
Load diff
Loading…
Add table
Reference in a new issue