mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-18 11:14:22 +00:00
ICU-1930 new UCA + skipping ignorables in contraction and prefixes
X-SVN-Rev: 9004
This commit is contained in:
parent
f0c4f70cf1
commit
3e72d5fee5
6 changed files with 304 additions and 158 deletions
|
@ -54,6 +54,7 @@ U_NAMESPACE_USE
|
|||
#define ZERO_CC_LIMIT_ 0xC0
|
||||
|
||||
static UCollator* UCA = NULL;
|
||||
static UCAConstants *UCAconsts = NULL;
|
||||
static UDataMemory* UCA_DATA_MEM = NULL;
|
||||
|
||||
|
||||
|
@ -83,6 +84,13 @@ isAcceptableUCA(void * /*context*/,
|
|||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
_getFoldingOffset(uint32_t data) {
|
||||
return (int32_t)(data&0xFFFFFF);
|
||||
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
static
|
||||
|
@ -224,6 +232,22 @@ inline UBool collIter_eos(collIterate *s) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* collIter_bos()
|
||||
* Checks for a collIterate being positioned at the start of
|
||||
* its source string.
|
||||
*
|
||||
*/
|
||||
static
|
||||
inline UBool collIter_bos(collIterate *source) {
|
||||
if (source->pos <= source->string ||
|
||||
((source->flags & UCOL_ITER_INNORMBUF) &&
|
||||
*(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
|
||||
return TRUE;
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks and free writable buffer if it is not the original stack buffer
|
||||
|
@ -996,8 +1020,9 @@ void ucol_initUCA(UErrorCode *status) {
|
|||
ucln_i18n_registerCleanup();
|
||||
}
|
||||
// Initialize variables for implicit generation
|
||||
UCAConstants *consts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
|
||||
uprv_uca_initImplicitConstants(consts->UCA_PRIMARY_IMPLICIT_MIN);
|
||||
UCAconsts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
|
||||
uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN);
|
||||
UCA->mapping->getFoldingOffset = _getFoldingOffset;
|
||||
}else{
|
||||
udata_close(result);
|
||||
uprv_free(newUCA);
|
||||
|
@ -2039,65 +2064,19 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
|
|||
return *(coll->contractionCEs + (constart - coll->contractionIndex));
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* added for Han implicit CE */
|
||||
static const uint32_t IMPLICIT_HAN_START_ = 0x3400;
|
||||
static const uint32_t IMPLICIT_HAN_LIMIT_ = 0xA000;
|
||||
static const uint32_t IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000;
|
||||
static const uint32_t IMPLICIT_BYTES_TO_AVOID_ = 3;
|
||||
static const uint32_t IMPLICIT_OTHER_COUNT_ = 256 - IMPLICIT_BYTES_TO_AVOID_;
|
||||
static const uint32_t IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ / 2;
|
||||
static const uint32_t IMPLICIT_LAST_COUNT2_ =
|
||||
(IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) /
|
||||
(IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1;
|
||||
static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ *
|
||||
IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_;
|
||||
static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ *
|
||||
IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_;
|
||||
static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ /
|
||||
IMPLICIT_LAST_COUNT2_;
|
||||
|
||||
|
||||
static
|
||||
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource, uint32_t hanFixup) {
|
||||
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
|
||||
return 0; /* illegal code value, use completely ignoreable! */
|
||||
inline UBool isNonChar(UChar32 cp) {
|
||||
if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDC00)) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
we must skip all 00, 01, 02 bytes, so most bytes have 253 values
|
||||
we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
|
||||
we shift so that HAN all has the same first primary, for compression.
|
||||
for the 4 byte case, we make the gap as large as we can fit.
|
||||
Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
|
||||
Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
|
||||
*/
|
||||
int32_t last0 = cp - IMPLICIT_BOUNDARY_;
|
||||
uint32_t r = 0;
|
||||
if (last0 < 0) {
|
||||
cp += IMPLICIT_HAN_SHIFT_; // shift so HAN shares single block
|
||||
int32_t last1 = cp / IMPLICIT_LAST_COUNT_;
|
||||
last0 = cp % IMPLICIT_LAST_COUNT_;
|
||||
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
|
||||
last1 %= IMPLICIT_OTHER_COUNT_;
|
||||
r = 0xEC030300 - hanFixup + (last2 << 24) + (last1 << 16) + (last0 << 9);
|
||||
} else {
|
||||
int32_t last1 = last0 / IMPLICIT_LAST_COUNT2_;
|
||||
last0 %= IMPLICIT_LAST_COUNT2_;
|
||||
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
|
||||
last1 %= IMPLICIT_OTHER_COUNT_;
|
||||
r = 0xEF030303 - hanFixup + (last2 << 16) + (last1 << 8) + (last0 * IMPLICIT_LAST2_MULTIPLIER_);
|
||||
}
|
||||
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
|
||||
return FALSE;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* now uses Mark's getImplicitPrimary code */
|
||||
static
|
||||
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
|
||||
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
|
||||
return 0; /* illegal code value, use completely ignoreable! */
|
||||
if(isNonChar(cp)) {
|
||||
return 0;
|
||||
}
|
||||
uint32_t r = getImplicitPrimary(cp);
|
||||
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
|
@ -2208,10 +2187,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
|
||||
// First we position ourselves at the beginning of contraction sequence
|
||||
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
|
||||
if (source->pos == source->string ||
|
||||
((source->flags & UCOL_ITER_INNORMBUF) &&
|
||||
*(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
|
||||
// if(sourcePointer == source->string) {
|
||||
if (collIter_bos(source)) {
|
||||
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||
break;
|
||||
}
|
||||
|
@ -2231,6 +2207,42 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
}
|
||||
else
|
||||
{
|
||||
// if there is a completely ignorable code point in the middle of
|
||||
// a prefix, we need to act as if it's not there
|
||||
// assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
|
||||
// lone surrogates cannot be set to zero as it would break other processing
|
||||
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
|
||||
// it's easy for BMP code points
|
||||
if(isZeroCE == 0) {
|
||||
continue;
|
||||
} else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
|
||||
// for supplementary code points, we have to check the next one
|
||||
// situations where we are going to ignore
|
||||
// 1. beginning of the string: schar is a lone surrogate
|
||||
// 2. schar is a lone surrogate
|
||||
// 3. schar is a trail surrogate in a valid surrogate sequence
|
||||
// that is explicitly set to zero.
|
||||
if (!collIter_bos(source)) {
|
||||
UChar lead;
|
||||
if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
|
||||
isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
|
||||
if(getCETag(isZeroCE) == SURROGATE_TAG) {
|
||||
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
|
||||
if(finalCE == 0) {
|
||||
// this is a real, assigned completely ignorable code point
|
||||
source->pos--;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// lone surrogate, completely ignorable
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// lone surrogate at the beginning, completely ignorable
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Source string char was not in the table.
|
||||
// We have not found the prefix.
|
||||
CE = *(coll->contractionCEs +
|
||||
|
@ -2297,6 +2309,35 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
|
|||
}
|
||||
else
|
||||
{
|
||||
// if there is a completely ignorable code point in the middle of
|
||||
// contraction, we need to act as if it's not there
|
||||
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
|
||||
// it's easy for BMP code points
|
||||
if(isZeroCE == 0) {
|
||||
continue;
|
||||
} else if(UTF_IS_LEAD(schar)) {
|
||||
if(!collIter_eos(source)) {
|
||||
backupState(source, &state);
|
||||
UChar trail = getNextNormalizedChar(source);
|
||||
if(UTF_IS_TRAIL(trail)) { // do stuff with trail
|
||||
if(getCETag(isZeroCE) == SURROGATE_TAG) {
|
||||
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
|
||||
if(finalCE == 0) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// broken surrogate sequence, thus completely ignorable
|
||||
loadState(source, &state, TRUE);
|
||||
continue;
|
||||
}
|
||||
loadState(source, &state, TRUE);
|
||||
} else { // no more characters, so broken surrogate pair...
|
||||
// this contraction will ultimately fail, but not because of us
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Source string char was not in contraction table.
|
||||
// Unless we have a discontiguous contraction, we have finished
|
||||
// with this contraction.
|
||||
|
@ -2710,56 +2751,19 @@ inline UChar getPrevNormalizedChar(collIterate *data)
|
|||
return ch;
|
||||
}
|
||||
|
||||
#if 0
|
||||
static
|
||||
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource, uint32_t hanFixup) {
|
||||
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
|
||||
return 0; /* illegal code value, use completely ignoreable! */
|
||||
}
|
||||
/* we must skip all 00, 01, 02 bytes, so most bytes have 253 values
|
||||
we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
|
||||
we shift so that HAN all has the same first primary, for compression.
|
||||
for the 4 byte case, we make the gap as large as we can fit.
|
||||
Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
|
||||
Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
|
||||
*/
|
||||
int32_t last0 = cp - IMPLICIT_BOUNDARY_;
|
||||
uint32_t r = 0;
|
||||
|
||||
if (last0 < 0) {
|
||||
cp += IMPLICIT_HAN_SHIFT_; // shift so HAN shares single block
|
||||
int32_t last1 = cp / IMPLICIT_LAST_COUNT_;
|
||||
last0 = cp % IMPLICIT_LAST_COUNT_;
|
||||
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
|
||||
last1 %= IMPLICIT_OTHER_COUNT_;
|
||||
r = 0xEC030300 - hanFixup + (last2 << 24) + (last1 << 16) + (last0 << 9);
|
||||
} else {
|
||||
int32_t last1 = last0 / IMPLICIT_LAST_COUNT2_;
|
||||
last0 %= IMPLICIT_LAST_COUNT2_;
|
||||
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
|
||||
last1 %= IMPLICIT_OTHER_COUNT_;
|
||||
r = 0xEF030303 - hanFixup + (last2 << 16) + (last1 << 8) +
|
||||
(last0 * IMPLICIT_LAST2_MULTIPLIER_);
|
||||
}
|
||||
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* now uses Mark's getImplicitPrimary code */
|
||||
static
|
||||
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
|
||||
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
|
||||
return 0; /* illegal code value, use completely ignoreable! */
|
||||
}
|
||||
if(isNonChar(cp)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t r = getImplicitPrimary(cp);
|
||||
uint32_t r = getImplicitPrimary(cp);
|
||||
|
||||
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
|
||||
collationSource->toReturn = collationSource->CEpos;
|
||||
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2779,6 +2783,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
UChar buffer[UCOL_MAX_BUFFER];
|
||||
uint32_t *endCEBuffer;
|
||||
UChar *strbuffer;
|
||||
int32_t noChars = 0;
|
||||
|
||||
for(;;)
|
||||
{
|
||||
|
@ -2856,9 +2861,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
// First we position ourselves at the beginning of contraction sequence
|
||||
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
|
||||
|
||||
if (source->pos == source->string ||
|
||||
((source->flags & UCOL_ITER_INNORMBUF) &&
|
||||
*(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
|
||||
if (collIter_bos(source)) {
|
||||
//if(sourcePointer == source->string) {
|
||||
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
|
||||
break;
|
||||
|
@ -2878,7 +2881,43 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
(UCharOffset - coll->contractionIndex));
|
||||
}
|
||||
else
|
||||
{
|
||||
{
|
||||
// if there is a completely ignorable code point in the middle of
|
||||
// a prefix, we need to act as if it's not there
|
||||
// assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef are set to zero)
|
||||
// lone surrogates cannot be set to zero as it would break other processing
|
||||
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
|
||||
// it's easy for BMP code points
|
||||
if(isZeroCE == 0) {
|
||||
continue;
|
||||
} else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
|
||||
// for supplementary code points, we have to check the next one
|
||||
// situations where we are going to ignore
|
||||
// 1. beginning of the string: schar is a lone surrogate
|
||||
// 2. schar is a lone surrogate
|
||||
// 3. schar is a trail surrogate in a valid surrogate sequence
|
||||
// that is explicitly set to zero.
|
||||
if (!collIter_bos(source)) {
|
||||
UChar lead;
|
||||
if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
|
||||
isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
|
||||
if(getCETag(isZeroCE) == SURROGATE_TAG) {
|
||||
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
|
||||
if(finalCE == 0) {
|
||||
// this is a real, assigned completely ignorable code point
|
||||
source->pos--;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// lone surrogate, completely ignorable
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// lone surrogate at the beginning, completely ignorable
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Source string char was not in the table.
|
||||
// We have not found the prefix.
|
||||
CE = *(coll->contractionCEs +
|
||||
|
@ -2917,8 +2956,10 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
strbuffer = buffer;
|
||||
UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
|
||||
*(UCharOffset --) = 0;
|
||||
noChars = 0;
|
||||
while (ucol_unsafeCP(schar, coll)) {
|
||||
*(UCharOffset) = schar;
|
||||
noChars++;
|
||||
UCharOffset --;
|
||||
schar = getPrevNormalizedChar(source);
|
||||
source->pos --;
|
||||
|
@ -2945,12 +2986,14 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
|
|||
}
|
||||
/* adds the initial base character to the string */
|
||||
*(UCharOffset) = schar;
|
||||
noChars++;
|
||||
|
||||
/* a new collIterate is used to simplify things, since using the current
|
||||
collIterate will mean that the forward and backwards iteration will
|
||||
share and change the same buffers. we don't want to get into that. */
|
||||
collIterate temp;
|
||||
IInit_collIterate(coll, UCharOffset, -1, &temp);
|
||||
//IInit_collIterate(coll, UCharOffset, -1, &temp);
|
||||
IInit_collIterate(coll, UCharOffset, noChars, &temp);
|
||||
temp.flags &= ~UCOL_ITER_NORM;
|
||||
|
||||
CE = ucol_IGetNextCE(coll, &temp, status);
|
||||
|
@ -3441,7 +3484,8 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
|
|||
currentSize++;
|
||||
leadPrimary = 0;
|
||||
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
|
||||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
|
||||
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
|
||||
(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
|
||||
/* not compressible */
|
||||
leadPrimary = 0;
|
||||
currentSize+=2;
|
||||
|
@ -3881,7 +3925,8 @@ ucol_calcSortKey(const UCollator *coll,
|
|||
*primaries++ = primary1;
|
||||
leadPrimary = 0;
|
||||
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
|
||||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
|
||||
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
|
||||
(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
|
||||
/* not compressible */
|
||||
leadPrimary = 0;
|
||||
*primaries++ = primary1;
|
||||
|
@ -4365,7 +4410,8 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
|
|||
*primaries++ = primary1;
|
||||
leadPrimary = 0;
|
||||
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
|
||||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
|
||||
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
|
||||
(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
|
||||
/* not compressible */
|
||||
leadPrimary = 0;
|
||||
*primaries++ = primary1;
|
||||
|
@ -4740,7 +4786,10 @@ ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod
|
|||
|
||||
uint32_t CE = ucol_IGetNextCE(coll, &s, status);
|
||||
|
||||
if(s.pos != s.endp) {
|
||||
/* here we check if we have consumed all characters */
|
||||
/* you can put in either one character or a contraction */
|
||||
/* you shouldn't put more... */
|
||||
if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
|
||||
*status = U_CE_NOT_FOUND_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -875,6 +875,43 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
|||
}
|
||||
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
/*
 * Range callback handed to utrie_enum() when enumerating the UCA mapping.
 *
 * When the enumerated value for [start, limit) is 0 (a completely
 * ignorable range), every code point of that range which still maps to
 * UCOL_NOT_FOUND in the table under construction receives an explicit
 * element consisting of a single CE of 0.  Ranges with any other value
 * are left untouched.
 *
 * context  the tempUCATable being built (delivered as const void *)
 * start    first code point of the range
 * limit    one past the last code point of the range
 * value    value enumerated for the whole range
 *
 * Returns FALSE as soon as adding an element fails, TRUE otherwise
 * (presumably TRUE lets the enumeration continue - confirm against
 * utrie_enum).
 *
 * NOTE(review): the UErrorCode is local to this callback, so the caller
 * only ever sees the boolean result, never the underlying error code.
 */
static UBool U_CALLCONV
_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
  UErrorCode status = U_ZERO_ERROR;
  tempUCATable *t = (tempUCATable *)context;

  if(value != 0) {
    /* not a completely ignorable range - nothing to add */
    return TRUE;
  }

  for(; start < limit; start++) {
    /* skip code points that already have a mapping in the new table */
    if(utrie_get32(t->mapping, start, NULL) != UCOL_NOT_FOUND) {
      continue;
    }

    UCAElements el;
    el.isThai = FALSE;
    el.prefixSize = 0;
    el.prefixChars[0] = 0;
    el.prefix = el.prefixChars;
    el.cPoints = el.uchars;

    /* encode the code point as one or two UTF-16 units */
    el.cSize = 0;
    UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);

    /* a single zero CE marks the element as completely ignorable */
    el.noOfCEs = 1;
    el.CEs[0] = 0;
    uprv_uca_addAnElement(t, &el, &status);

    /* stop at the first failure instead of processing the rest of the range */
    if(U_FAILURE(status)) {
      return FALSE;
    }
  }
  return TRUE;
}
|
||||
U_CDECL_END
|
||||
|
||||
|
||||
UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
|
||||
uint32_t i = 0;
|
||||
if(U_FAILURE(*status)) {
|
||||
|
@ -1015,8 +1052,8 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
|||
/* copy contractions from the UCA - this is felt mostly for cyrillic*/
|
||||
|
||||
uint32_t tailoredCE = UCOL_NOT_FOUND;
|
||||
UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
|
||||
//UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
|
||||
//UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
|
||||
UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
|
||||
UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
|
||||
while(*conts != 0) {
|
||||
/*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
|
||||
|
@ -1055,6 +1092,10 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
|||
ucol_closeElements(ucaEl);
|
||||
}
|
||||
|
||||
// Add completely ignorable elements
|
||||
utrie_enum(t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
|
||||
|
||||
|
||||
// canonical closure
|
||||
uprv_uca_canonicalClosure(t, status);
|
||||
|
||||
|
|
|
@ -977,6 +977,17 @@ static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements
|
|||
|
||||
static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) {
|
||||
uint32_t CE = UCOL_NOT_FOUND;
|
||||
// This should add a completely ignorable element to the
|
||||
// unsafe table, so that backward iteration will skip
|
||||
// over it when treating contractions.
|
||||
uint32_t i = 0;
|
||||
if(element->mapCE == 0) {
|
||||
for(i = 0; i < element->cSize; i++) {
|
||||
if(!UTF_IS_TRAIL(element->cPoints[i])) {
|
||||
unsafeCPSet(t->unsafeCP, element->cPoints[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(element->cSize > 1) { /* we're adding a contraction */
|
||||
uint32_t i = 0;
|
||||
UChar32 cp;
|
||||
|
@ -1151,24 +1162,7 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
|
|||
}
|
||||
source = it.next();
|
||||
}
|
||||
#if 0
|
||||
CE = uprv_uca_finalizeAddition(t, element, status);
|
||||
UChar composed[256];
|
||||
uint32_t compLen = unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0, composed, 256, status);;
|
||||
|
||||
if(compLen != element->cSize || uprv_memcmp(composed, element->cPoints, element->cSize*sizeof(UChar))) {
|
||||
// composed form of a contraction is different than the decomposed form!
|
||||
// do it!
|
||||
#ifdef UCOL_DEBUG
|
||||
fprintf(stderr, "Adding composed for %04X->%04X\n", *element->cPoints, *composed);
|
||||
#endif
|
||||
element->cSize = compLen;
|
||||
uprv_memcpy(element->cPoints, composed, element->cSize*sizeof(UChar));
|
||||
uprv_uca_finalizeAddition(t, element, status);
|
||||
}
|
||||
#else
|
||||
CE = element->mapCE;
|
||||
#endif
|
||||
} else {
|
||||
CE = uprv_uca_finalizeAddition(t, element, status);
|
||||
}
|
||||
|
@ -1485,7 +1479,6 @@ struct enumStruct {
|
|||
UCollationElements* colEl;
|
||||
UErrorCode *status;
|
||||
};
|
||||
#include <stdio.h>
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV
|
||||
_enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {
|
||||
|
|
|
@ -424,6 +424,7 @@ enum {
|
|||
UCOL_BYTE_UNSHIFTED_MAX = 0xFF
|
||||
};
|
||||
|
||||
#if 0
|
||||
#define UCOL_RESET_TOP_VALUE 0x9F000303
|
||||
#define UCOL_FIRST_PRIMARY_IGNORABLE 0x00008705
|
||||
#define UCOL_LAST_PRIMARY_IGNORABLE 0x0000DD05
|
||||
|
@ -449,6 +450,7 @@ enum {
|
|||
|
||||
#define PRIMARY_IMPLICIT_MIN 0xE8000000
|
||||
#define PRIMARY_IMPLICIT_MAX 0xF0000000
|
||||
#endif
|
||||
|
||||
/* These constants can be changed - sortkey size is affected by them */
|
||||
#define UCOL_PROPORTION2 0.5
|
||||
|
@ -532,23 +534,24 @@ typedef struct {
|
|||
} UColOptionSet;
|
||||
|
||||
typedef struct {
|
||||
uint32_t UCA_RESET_TOP_VALUE; /*0x9F000303*/
|
||||
#if 0
|
||||
uint32_t UCA_FIRST_PRIMARY_IGNORABLE; /*0x00008705*/
|
||||
uint32_t UCA_LAST_PRIMARY_IGNORABLE; /*0x0000DD05*/
|
||||
uint32_t UCA_LAST_PRIMARY_IGNORABLE_CONT; /*0x0000C1C0*/
|
||||
uint32_t UCA_FIRST_SECONDARY_IGNORABLE; /*0x00000000*/
|
||||
uint32_t UCA_LAST_SECONDARY_IGNORABLE; /*0x00000500*/
|
||||
uint32_t UCA_FIRST_TERTIARY_IGNORABLE; /*0x00000000*/
|
||||
uint32_t UCA_LAST_TERTIARY_IGNORABLE; /*0x00000000*/
|
||||
uint32_t UCA_FIRST_VARIABLE; /*0x05070505*/
|
||||
uint32_t UCA_LAST_VARIABLE; /*0x13CF0505*/
|
||||
uint32_t UCA_FIRST_NON_VARIABLE; /*0x16200505*/
|
||||
uint32_t UCA_LAST_NON_VARIABLE; /*0x767C0505*/
|
||||
#endif
|
||||
uint32_t UCA_FIRST_TERTIARY_IGNORABLE[2]; /*0x00000000*/
|
||||
uint32_t UCA_LAST_TERTIARY_IGNORABLE[2]; /*0x00000000*/
|
||||
uint32_t UCA_FIRST_PRIMARY_IGNORABLE[2]; /*0x00008705*/
|
||||
uint32_t UCA_FIRST_SECONDARY_IGNORABLE[2]; /*0x00000000*/
|
||||
uint32_t UCA_LAST_SECONDARY_IGNORABLE[2]; /*0x00000500*/
|
||||
uint32_t UCA_LAST_PRIMARY_IGNORABLE[2]; /*0x0000DD05*/
|
||||
uint32_t UCA_FIRST_VARIABLE[2]; /*0x05070505*/
|
||||
uint32_t UCA_LAST_VARIABLE[2]; /*0x13CF0505*/
|
||||
uint32_t UCA_FIRST_NON_VARIABLE[2]; /*0x16200505*/
|
||||
uint32_t UCA_LAST_NON_VARIABLE[2]; /*0x767C0505*/
|
||||
uint32_t UCA_RESET_TOP_VALUE[2]; /*0x9F000303*/
|
||||
uint32_t UCA_FIRST_IMPLICIT[2];
|
||||
uint32_t UCA_LAST_IMPLICIT[2];
|
||||
uint32_t UCA_FIRST_TRAILING[2];
|
||||
uint32_t UCA_LAST_TRAILING[2];
|
||||
|
||||
uint32_t UCA_NEXT_TOP_VALUE; /*0xE8960303*/
|
||||
#if 0
|
||||
uint32_t UCA_NEXT_TOP_VALUE[2]; /*0xE8960303*/
|
||||
uint32_t UCA_NEXT_FIRST_PRIMARY_IGNORABLE; /*0x00008905*/
|
||||
uint32_t UCA_NEXT_LAST_PRIMARY_IGNORABLE; /*0x03000303*/
|
||||
uint32_t UCA_NEXT_FIRST_SECONDARY_IGNORABLE; /*0x00008705*/
|
||||
|
@ -559,8 +562,13 @@ typedef struct {
|
|||
uint32_t UCA_NEXT_LAST_VARIABLE; /*0x16200505*/
|
||||
#endif
|
||||
|
||||
uint32_t UCA_PRIMARY_TOP_MIN;
|
||||
uint32_t UCA_PRIMARY_IMPLICIT_MIN; /*0xE8000000*/
|
||||
uint32_t UCA_PRIMARY_IMPLICIT_MAX; /*0xF0000000*/
|
||||
uint32_t UCA_PRIMARY_TRAILING_MIN; /*0xE8000000*/
|
||||
uint32_t UCA_PRIMARY_TRAILING_MAX; /*0xF0000000*/
|
||||
uint32_t UCA_PRIMARY_SPECIAL_MIN; /*0xE8000000*/
|
||||
uint32_t UCA_PRIMARY_SPECIAL_MAX; /*0xF0000000*/
|
||||
} UCAConstants;
|
||||
|
||||
typedef struct {
|
||||
|
@ -569,7 +577,7 @@ typedef struct {
|
|||
/* to get the address add to the header address and cast properly */
|
||||
uint32_t options; /* these are the default options for the collator */
|
||||
uint32_t UCAConsts; /* structure which holds values for indirect positioning and implicit ranges */
|
||||
/*uint32_t contractionUCACombos;*/ /* this one is needed only for UCA, to copy the appropriate contractions */
|
||||
uint32_t contractionUCACombos; /* this one is needed only for UCA, to copy the appropriate contractions */
|
||||
uint32_t unusedReserved1; /* reserved for future use */
|
||||
uint32_t mappingPosition; /* const uint8_t *mappingPosition; */
|
||||
uint32_t expansion; /* uint32_t *expansion; */
|
||||
|
@ -801,7 +809,7 @@ static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
|
|||
|
||||
hash = c;
|
||||
if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
|
||||
if(UTF_IS_TRAIL(c)) {
|
||||
if(UTF_IS_LEAD(c) || UTF_IS_TRAIL(c)) {
|
||||
/* Trail surrogate */
|
||||
/* These are always considered unsafe. */
|
||||
return TRUE;
|
||||
|
|
|
@ -112,6 +112,8 @@ typedef struct {
|
|||
/* they can be used to assure that the CEs will be always positioned in */
|
||||
/* the same place relative to a point with known properties (e.g. first */
|
||||
/* primary ignorable). */
|
||||
static indirectBoundaries ucolIndirectBoundaries[11];
|
||||
/*
|
||||
static indirectBoundaries ucolIndirectBoundaries[11] = {
|
||||
{ UCOL_RESET_TOP_VALUE, 0,
|
||||
UCOL_NEXT_TOP_VALUE, 0 },
|
||||
|
@ -136,6 +138,23 @@ static indirectBoundaries ucolIndirectBoundaries[11] = {
|
|||
{ UCOL_LAST_NON_VARIABLE, 0,
|
||||
0, 0 },
|
||||
};
|
||||
*/
|
||||
|
||||
static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
|
||||
|
||||
// Set values for the top - TODO: once we have values for all the indirects, we are going
|
||||
// to initalize here.
|
||||
ucolIndirectBoundaries[indexR].startCE = start[0];
|
||||
ucolIndirectBoundaries[indexR].startContCE = start[1];
|
||||
if(end) {
|
||||
ucolIndirectBoundaries[indexR].limitCE = end[0];
|
||||
ucolIndirectBoundaries[indexR].limitContCE = end[1];
|
||||
} else {
|
||||
ucolIndirectBoundaries[indexR].limitCE = 0;
|
||||
ucolIndirectBoundaries[indexR].limitContCE = 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, UCollator *UCA, UErrorCode *status) {
|
||||
uint32_t nSize = 0;
|
||||
|
@ -203,8 +222,9 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
|
|||
|
||||
// rulesToParse = src->source;
|
||||
src->lh = 0;
|
||||
src->lh = (UColTokListHeader *)uprv_malloc(512*sizeof(UColTokListHeader));
|
||||
/* test for NULL */
|
||||
src->listCapacity = 1024;
|
||||
src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
|
||||
//Test for NULL
|
||||
if (src->lh == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
|
@ -212,13 +232,30 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
|
|||
src->resultLen = 0;
|
||||
|
||||
UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
|
||||
|
||||
// Set values for the top - TODO: once we have values for all the indirects, we are going
|
||||
// to initalize here.
|
||||
ucolIndirectBoundaries[0].startCE = consts->UCA_RESET_TOP_VALUE;
|
||||
ucolIndirectBoundaries[0].startContCE = 0;
|
||||
ucolIndirectBoundaries[0].limitCE = consts->UCA_NEXT_TOP_VALUE;
|
||||
ucolIndirectBoundaries[0].limitContCE = 0;
|
||||
|
||||
// UCOL_RESET_TOP_VALUE
|
||||
setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
|
||||
// UCOL_FIRST_PRIMARY_IGNORABLE
|
||||
setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
|
||||
// UCOL_LAST_PRIMARY_IGNORABLE
|
||||
setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
|
||||
// UCOL_FIRST_SECONDARY_IGNORABLE
|
||||
setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
|
||||
// UCOL_LAST_SECONDARY_IGNORABLE
|
||||
setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
|
||||
// UCOL_FIRST_TERTIARY_IGNORABLE
|
||||
setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
|
||||
// UCOL_LAST_TERTIARY_IGNORABLE
|
||||
setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
|
||||
// UCOL_FIRST_VARIABLE
|
||||
setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
|
||||
// UCOL_LAST_VARIABLE
|
||||
setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
|
||||
// UCOL_FIRST_NON_VARIABLE
|
||||
setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
|
||||
// UCOL_LAST_NON_VARIABLE
|
||||
setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, 0);
|
||||
|
||||
}
|
||||
|
||||
static inline
|
||||
|
@ -750,7 +787,14 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||
*src->extraCurrent++ = 0xFFFE;
|
||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
|
||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
|
||||
newCharsLen = 3;
|
||||
if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
|
||||
newCharsLen = 3;
|
||||
} else {
|
||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
|
||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
|
||||
newCharsLen = 5;
|
||||
}
|
||||
|
||||
src->current++;
|
||||
goto EndOfLoop;
|
||||
} else {
|
||||
|
@ -964,6 +1008,15 @@ Processing Description
|
|||
|
||||
static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
|
||||
UParseError *parseError, UErrorCode *status) {
|
||||
if(src->resultLen == src->listCapacity) {
|
||||
// Unfortunately, this won't work, as we store addresses of lhs in token
|
||||
src->listCapacity *= 2;
|
||||
src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
|
||||
if(src->lh == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
/* do the reset thing */
|
||||
UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
|
||||
/* test for NULL */
|
||||
|
@ -1024,6 +1077,7 @@ static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint3
|
|||
}
|
||||
|
||||
src->resultLen++;
|
||||
|
||||
uhash_put(src->tailored, sourceToken, sourceToken, status);
|
||||
|
||||
return sourceToken;
|
||||
|
|
|
@ -111,6 +111,7 @@ typedef struct {
|
|||
UHashtable *tailored;
|
||||
UColOptionSet *opts;
|
||||
uint32_t resultLen;
|
||||
uint32_t listCapacity;
|
||||
UColTokListHeader *lh;
|
||||
UColToken *varTop;
|
||||
} UColTokenParser;
|
||||
|
|
Loading…
Add table
Reference in a new issue