ICU-1930 new UCA + skipping ignorables in contraction and prefixes

X-SVN-Rev: 9004
This commit is contained in:
Vladimir Weinstein 2002-07-02 22:32:14 +00:00
parent f0c4f70cf1
commit 3e72d5fee5
6 changed files with 304 additions and 158 deletions

View file

@ -54,6 +54,7 @@ U_NAMESPACE_USE
#define ZERO_CC_LIMIT_ 0xC0
static UCollator* UCA = NULL;
static UCAConstants *UCAconsts = NULL;
static UDataMemory* UCA_DATA_MEM = NULL;
@ -83,6 +84,13 @@ isAcceptableUCA(void * /*context*/,
return FALSE;
}
}
/*
 * Folding-offset callback; ucol_initUCA assigns it to
 * UCA->mapping->getFoldingOffset.
 * Returns the low 24 bits of the data word as a signed 32-bit value.
 */
static int32_t U_CALLCONV
_getFoldingOffset(uint32_t data) {
    const uint32_t low24 = data & 0x00FFFFFF;
    return (int32_t)low24;
}
U_CDECL_END
static
@ -224,6 +232,22 @@ inline UBool collIter_eos(collIterate *s) {
}
}
/*
* collIter_bos()
*     Tells whether a collIterate sits at the start of its source string.
*
*     Returns TRUE when pos is at or before string. Otherwise returns TRUE
*     only if all three hold: the UCOL_ITER_INNORMBUF flag is set, the code
*     unit just before pos is 0, and fcdPosition is NULL. Returns FALSE in
*     every other case.
*/
static
inline UBool collIter_bos(collIterate *source) {
    /* Plain case: nothing precedes the current position in the string. */
    if (source->pos <= source->string) {
        return TRUE;
    }
    /* Past the string start: only the UCOL_ITER_INNORMBUF case can still qualify. */
    if ((source->flags & UCOL_ITER_INNORMBUF) == 0) {
        return FALSE;
    }
    if (*(source->pos - 1) != 0) {
        return FALSE;
    }
    return (source->fcdPosition == NULL) ? TRUE : FALSE;
}
/**
* Checks and free writable buffer if it is not the original stack buffer
@ -996,8 +1020,9 @@ void ucol_initUCA(UErrorCode *status) {
ucln_i18n_registerCleanup();
}
// Initialize variables for implicit generation
UCAConstants *consts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
uprv_uca_initImplicitConstants(consts->UCA_PRIMARY_IMPLICIT_MIN);
UCAconsts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
uprv_uca_initImplicitConstants(UCAconsts->UCA_PRIMARY_IMPLICIT_MIN);
UCA->mapping->getFoldingOffset = _getFoldingOffset;
}else{
udata_close(result);
uprv_free(newUCA);
@ -2039,65 +2064,19 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
#if 0
/* added for Han implicit CE */
static const uint32_t IMPLICIT_HAN_START_ = 0x3400;
static const uint32_t IMPLICIT_HAN_LIMIT_ = 0xA000;
static const uint32_t IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000;
static const uint32_t IMPLICIT_BYTES_TO_AVOID_ = 3;
static const uint32_t IMPLICIT_OTHER_COUNT_ = 256 - IMPLICIT_BYTES_TO_AVOID_;
static const uint32_t IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ / 2;
static const uint32_t IMPLICIT_LAST_COUNT2_ =
(IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) /
(IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1;
static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ *
IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ *
IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ /
IMPLICIT_LAST_COUNT2_;
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource, uint32_t hanFixup) {
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
return 0; /* illegal code value, use completely ignoreable! */
inline UBool isNonChar(UChar32 cp) {
if ((cp & 0xFFFE) == 0xFFFE || (0xFDD0 <= cp && cp <= 0xFDEF) || (0xD800 <= cp && cp <= 0xDC00)) {
return TRUE;
}
/*
we must skip all 00, 01, 02 bytes, so most bytes have 253 values
we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
we shift so that HAN all has the same first primary, for compression.
for the 4 byte case, we make the gap as large as we can fit.
Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
*/
int32_t last0 = cp - IMPLICIT_BOUNDARY_;
uint32_t r = 0;
if (last0 < 0) {
cp += IMPLICIT_HAN_SHIFT_; // shift so HAN shares single block
int32_t last1 = cp / IMPLICIT_LAST_COUNT_;
last0 = cp % IMPLICIT_LAST_COUNT_;
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
last1 %= IMPLICIT_OTHER_COUNT_;
r = 0xEC030300 - hanFixup + (last2 << 24) + (last1 << 16) + (last0 << 9);
} else {
int32_t last1 = last0 / IMPLICIT_LAST_COUNT2_;
last0 %= IMPLICIT_LAST_COUNT2_;
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
last1 %= IMPLICIT_OTHER_COUNT_;
r = 0xEF030303 - hanFixup + (last2 << 16) + (last1 << 8) + (last0 * IMPLICIT_LAST2_MULTIPLIER_);
}
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
return FALSE;
}
#endif
/* now uses Mark's getImplicitPrimary code */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
return 0; /* illegal code value, use completely ignoreable! */
if(isNonChar(cp)) {
return 0;
}
uint32_t r = getImplicitPrimary(cp);
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
@ -2208,10 +2187,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
// First we position ourselves at the beginning of the contraction sequence
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
if (source->pos == source->string ||
((source->flags & UCOL_ITER_INNORMBUF) &&
*(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
// if(sourcePointer == source->string) {
if (collIter_bos(source)) {
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
break;
}
@ -2231,6 +2207,42 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
}
else
{
// if there is a completely ignorable code point in the middle of
// a prefix, we need to act as if it's not there
// assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef) are set to zero
// lone surrogates cannot be set to zero as it would break other processing
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
// it's easy for BMP code points
if(isZeroCE == 0) {
continue;
} else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
// for supplementary code points, we have to check the next one
// situations where we are going to ignore
// 1. beginning of the string: schar is a lone surrogate
// 2. schar is a lone surrogate
// 3. schar is a trail surrogate in a valid surrogate sequence
// that is explicitly set to zero.
if (!collIter_bos(source)) {
UChar lead;
if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
if(getCETag(isZeroCE) == SURROGATE_TAG) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
if(finalCE == 0) {
// this is a real, assigned completely ignorable code point
source->pos--;
continue;
}
}
} else {
// lone surrogate, completely ignorable
continue;
}
} else {
// lone surrogate at the beginning, completely ignorable
continue;
}
}
// Source string char was not in the table.
// We have not found the prefix.
CE = *(coll->contractionCEs +
@ -2297,6 +2309,35 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
}
else
{
// if there is a completely ignorable code point in the middle of
// contraction, we need to act as if it's not there
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
// it's easy for BMP code points
if(isZeroCE == 0) {
continue;
} else if(UTF_IS_LEAD(schar)) {
if(!collIter_eos(source)) {
backupState(source, &state);
UChar trail = getNextNormalizedChar(source);
if(UTF_IS_TRAIL(trail)) { // do stuff with trail
if(getCETag(isZeroCE) == SURROGATE_TAG) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, trail);
if(finalCE == 0) {
continue;
}
}
} else {
// broken surrogate sequence, thus completely ignorable
loadState(source, &state, TRUE);
continue;
}
loadState(source, &state, TRUE);
} else { // no more characters, so broken surrogate pair...
// this contraction will ultimately fail, but not because of us
continue;
}
}
// Source string char was not in contraction table.
// Unless we have a discontiguous contraction, we have finished
// with this contraction.
@ -2710,56 +2751,19 @@ inline UChar getPrevNormalizedChar(collIterate *data)
return ch;
}
#if 0
static
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource, uint32_t hanFixup) {
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
return 0; /* illegal code value, use completely ignoreable! */
}
/* we must skip all 00, 01, 02 bytes, so most bytes have 253 values
we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
we shift so that HAN all has the same first primary, for compression.
for the 4 byte case, we make the gap as large as we can fit.
Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
*/
int32_t last0 = cp - IMPLICIT_BOUNDARY_;
uint32_t r = 0;
if (last0 < 0) {
cp += IMPLICIT_HAN_SHIFT_; // shift so HAN shares single block
int32_t last1 = cp / IMPLICIT_LAST_COUNT_;
last0 = cp % IMPLICIT_LAST_COUNT_;
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
last1 %= IMPLICIT_OTHER_COUNT_;
r = 0xEC030300 - hanFixup + (last2 << 24) + (last1 << 16) + (last0 << 9);
} else {
int32_t last1 = last0 / IMPLICIT_LAST_COUNT2_;
last0 %= IMPLICIT_LAST_COUNT2_;
int32_t last2 = last1 / IMPLICIT_OTHER_COUNT_;
last1 %= IMPLICIT_OTHER_COUNT_;
r = 0xEF030303 - hanFixup + (last2 << 16) + (last1 << 8) +
(last0 * IMPLICIT_LAST2_MULTIPLIER_);
}
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
collationSource->toReturn = collationSource->CEpos;
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
}
#endif
/*
 * now uses Mark's getImplicitPrimary code
 *
 * Builds the implicit CE pair for cp on the "prev" code path.
 *
 * Returns 0, without touching collationSource, when isNonChar(cp) is true.
 * Otherwise r = getImplicitPrimary(cp) is split in two:
 *   - (r & UCOL_PRIMARYMASK) | 0x00000505 is appended at CEpos, and toReturn
 *     is set to the advanced CEpos;
 *   - ((r & 0x0000FFFF) << 16) | 0x000000C0 is the return value.
 *
 * The former inline test for *FFFE/*FFFF and D800..DC00 is dropped: every
 * code point it rejected is also rejected by isNonChar().
 */
static
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
    if(isNonChar(cp)) {
        return 0; /* illegal code value, use completely ignorable! */
    }

    uint32_t r = getImplicitPrimary(cp);

    *(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
    collationSource->toReturn = collationSource->CEpos;
    return ((r & 0x0000FFFF)<<16) | 0x000000C0;
}
/**
@ -2779,6 +2783,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
UChar buffer[UCOL_MAX_BUFFER];
uint32_t *endCEBuffer;
UChar *strbuffer;
int32_t noChars = 0;
for(;;)
{
@ -2856,9 +2861,7 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
// First we position ourselves at the beginning of the contraction sequence
const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);
if (source->pos == source->string ||
((source->flags & UCOL_ITER_INNORMBUF) &&
*(source->pos - 1) == 0 && source->fcdPosition == NULL)) {
if (collIter_bos(source)) {
//if(sourcePointer == source->string) {
CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
break;
@ -2878,7 +2881,43 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
(UCharOffset - coll->contractionIndex));
}
else
{
{
// if there is a completely ignorable code point in the middle of
// a prefix, we need to act as if it's not there
// assumption: 'real' noncharacters (*fffe, *ffff, fdd0-fdef) are set to zero
// lone surrogates cannot be set to zero as it would break other processing
uint32_t isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, schar);
// it's easy for BMP code points
if(isZeroCE == 0) {
continue;
} else if(UTF_IS_TRAIL(schar) || UTF_IS_LEAD(schar)) {
// for supplementary code points, we have to check the next one
// situations where we are going to ignore
// 1. beginning of the string: schar is a lone surrogate
// 2. schar is a lone surrogate
// 3. schar is a trail surrogate in a valid surrogate sequence
// that is explicitly set to zero.
if (!collIter_bos(source)) {
UChar lead;
if(UTF_IS_LEAD(lead = getPrevNormalizedChar(source))) {
isZeroCE = UTRIE_GET32_FROM_LEAD(coll->mapping, lead);
if(getCETag(isZeroCE) == SURROGATE_TAG) {
uint32_t finalCE = UTRIE_GET32_FROM_OFFSET_TRAIL(coll->mapping, isZeroCE&0xFFFFFF, schar);
if(finalCE == 0) {
// this is a real, assigned completely ignorable code point
source->pos--;
continue;
}
}
} else {
// lone surrogate, completely ignorable
continue;
}
} else {
// lone surrogate at the beginning, completely ignorable
continue;
}
}
// Source string char was not in the table.
// We have not found the prefix.
CE = *(coll->contractionCEs +
@ -2917,8 +2956,10 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
strbuffer = buffer;
UCharOffset = strbuffer + (UCOL_MAX_BUFFER - 1);
*(UCharOffset --) = 0;
noChars = 0;
while (ucol_unsafeCP(schar, coll)) {
*(UCharOffset) = schar;
noChars++;
UCharOffset --;
schar = getPrevNormalizedChar(source);
source->pos --;
@ -2945,12 +2986,14 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
}
/* adds the initial base character to the string */
*(UCharOffset) = schar;
noChars++;
/* a new collIterate is used to simplify things, since using the current
collIterate will mean that the forward and backwards iteration will
share and change the same buffers. we don't want to get into that. */
collIterate temp;
IInit_collIterate(coll, UCharOffset, -1, &temp);
//IInit_collIterate(coll, UCharOffset, -1, &temp);
IInit_collIterate(coll, UCharOffset, noChars, &temp);
temp.flags &= ~UCOL_ITER_NORM;
CE = ucol_IGetNextCE(coll, &temp, status);
@ -3441,7 +3484,8 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
currentSize++;
leadPrimary = 0;
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
/* not compressible */
leadPrimary = 0;
currentSize+=2;
@ -3881,7 +3925,8 @@ ucol_calcSortKey(const UCollator *coll,
*primaries++ = primary1;
leadPrimary = 0;
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
/* not compressible */
leadPrimary = 0;
*primaries++ = primary1;
@ -4365,7 +4410,8 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
*primaries++ = primary1;
leadPrimary = 0;
} else if(primary1<UCOL_BYTE_FIRST_NON_LATIN_PRIMARY ||
(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
//(primary1 > (UCOL_RESET_TOP_VALUE>>24) && primary1 < (UCOL_NEXT_TOP_VALUE>>24))) {
(primary1 > (*UCAconsts->UCA_LAST_NON_VARIABLE>>24) && primary1 < (*UCAconsts->UCA_FIRST_IMPLICIT>>24))) {
/* not compressible */
leadPrimary = 0;
*primaries++ = primary1;
@ -4740,7 +4786,10 @@ ucol_setVariableTop(UCollator *coll, const UChar *varTop, int32_t len, UErrorCod
uint32_t CE = ucol_IGetNextCE(coll, &s, status);
if(s.pos != s.endp) {
/* here we check if we have consumed all characters */
/* you can put in either one character or a contraction */
/* you shouldn't put more... */
if(s.pos != s.endp || CE == UCOL_NO_MORE_CES) {
*status = U_CE_NOT_FOUND_ERROR;
return 0;
}

View file

@ -875,6 +875,43 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
}
}
U_CDECL_BEGIN
/*
 * Range callback passed to utrie_enum() over the UCA mapping.
 *
 * For a range [start, limit) whose UCA value is 0, every code point that is
 * still UCOL_NOT_FOUND in the table being built (t->mapping) is added as a
 * single-CE element with CE 0. Ranges with any other value are ignored.
 *
 * context  the tempUCATable under construction
 * start    first code point of the range
 * limit    one past the last code point of the range
 * value    UCA value shared by the whole range
 *
 * Returns FALSE as soon as uprv_uca_addAnElement reports a failure,
 * TRUE otherwise.
 * NOTE(review): the error code is local to this callback, so the caller
 * only sees the FALSE return, not the specific failure.
 */
static UBool U_CALLCONV
_processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    UErrorCode status = U_ZERO_ERROR;
    tempUCATable *t = (tempUCATable *)context;

    if(value != 0) {
        /* not a completely ignorable range - nothing to copy */
        return TRUE;
    }

    for(UChar32 cp = start; cp < limit; cp++) {
        if(utrie_get32(t->mapping, cp, NULL) != UCOL_NOT_FOUND) {
            /* already present in the table being built - leave it alone */
            continue;
        }

        UCAElements el;
        el.isThai = FALSE;
        el.prefixSize = 0;
        el.prefixChars[0] = 0;
        el.prefix = el.prefixChars;
        el.cPoints = el.uchars;
        el.cSize = 0;
        UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, cp);
        el.noOfCEs = 1;
        el.CEs[0] = 0;
        uprv_uca_addAnElement(t, &el, &status);

        if(U_FAILURE(status)) {
            /* do not keep adding elements once the builder has failed */
            return FALSE;
        }
    }
    return TRUE;
}
U_CDECL_END
UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
uint32_t i = 0;
if(U_FAILURE(*status)) {
@ -1015,8 +1052,8 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
/* copy contractions from the UCA - this is felt mostly for cyrillic*/
uint32_t tailoredCE = UCOL_NOT_FOUND;
UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
//UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
//UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
while(*conts != 0) {
/*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
@ -1055,6 +1092,10 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
ucol_closeElements(ucaEl);
}
// Add completely ignorable elements
utrie_enum(t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
// canonical closure
uprv_uca_canonicalClosure(t, status);

View file

@ -977,6 +977,17 @@ static uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements
static uint32_t uprv_uca_finalizeAddition(tempUCATable *t, UCAElements *element, UErrorCode *status) {
uint32_t CE = UCOL_NOT_FOUND;
// This should add a completely ignorable element to the
// unsafe table, so that backward iteration will skip
// over it when treating contractions.
uint32_t i = 0;
if(element->mapCE == 0) {
for(i = 0; i < element->cSize; i++) {
if(!UTF_IS_TRAIL(element->cPoints[i])) {
unsafeCPSet(t->unsafeCP, element->cPoints[i]);
}
}
}
if(element->cSize > 1) { /* we're adding a contraction */
uint32_t i = 0;
UChar32 cp;
@ -1151,24 +1162,7 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
}
source = it.next();
}
#if 0
CE = uprv_uca_finalizeAddition(t, element, status);
UChar composed[256];
uint32_t compLen = unorm_normalize(element->cPoints, element->cSize, UNORM_NFC, 0, composed, 256, status);;
if(compLen != element->cSize || uprv_memcmp(composed, element->cPoints, element->cSize*sizeof(UChar))) {
// composed form of a contraction is different than the decomposed form!
// do it!
#ifdef UCOL_DEBUG
fprintf(stderr, "Adding composed for %04X->%04X\n", *element->cPoints, *composed);
#endif
element->cSize = compLen;
uprv_memcpy(element->cPoints, composed, element->cSize*sizeof(UChar));
uprv_uca_finalizeAddition(t, element, status);
}
#else
CE = element->mapCE;
#endif
} else {
CE = uprv_uca_finalizeAddition(t, element, status);
}
@ -1485,7 +1479,6 @@ struct enumStruct {
UCollationElements* colEl;
UErrorCode *status;
};
#include <stdio.h>
U_CDECL_BEGIN
static UBool U_CALLCONV
_enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 limit, UCharCategory type) {

View file

@ -424,6 +424,7 @@ enum {
UCOL_BYTE_UNSHIFTED_MAX = 0xFF
};
#if 0
#define UCOL_RESET_TOP_VALUE 0x9F000303
#define UCOL_FIRST_PRIMARY_IGNORABLE 0x00008705
#define UCOL_LAST_PRIMARY_IGNORABLE 0x0000DD05
@ -449,6 +450,7 @@ enum {
#define PRIMARY_IMPLICIT_MIN 0xE8000000
#define PRIMARY_IMPLICIT_MAX 0xF0000000
#endif
/* These constants can be changed - sortkey size is affected by them */
#define UCOL_PROPORTION2 0.5
@ -532,23 +534,24 @@ typedef struct {
} UColOptionSet;
typedef struct {
uint32_t UCA_RESET_TOP_VALUE; /*0x9F000303*/
#if 0
uint32_t UCA_FIRST_PRIMARY_IGNORABLE; /*0x00008705*/
uint32_t UCA_LAST_PRIMARY_IGNORABLE; /*0x0000DD05*/
uint32_t UCA_LAST_PRIMARY_IGNORABLE_CONT; /*0x0000C1C0*/
uint32_t UCA_FIRST_SECONDARY_IGNORABLE; /*0x00000000*/
uint32_t UCA_LAST_SECONDARY_IGNORABLE; /*0x00000500*/
uint32_t UCA_FIRST_TERTIARY_IGNORABLE; /*0x00000000*/
uint32_t UCA_LAST_TERTIARY_IGNORABLE; /*0x00000000*/
uint32_t UCA_FIRST_VARIABLE; /*0x05070505*/
uint32_t UCA_LAST_VARIABLE; /*0x13CF0505*/
uint32_t UCA_FIRST_NON_VARIABLE; /*0x16200505*/
uint32_t UCA_LAST_NON_VARIABLE; /*0x767C0505*/
#endif
uint32_t UCA_FIRST_TERTIARY_IGNORABLE[2]; /*0x00000000*/
uint32_t UCA_LAST_TERTIARY_IGNORABLE[2]; /*0x00000000*/
uint32_t UCA_FIRST_PRIMARY_IGNORABLE[2]; /*0x00008705*/
uint32_t UCA_FIRST_SECONDARY_IGNORABLE[2]; /*0x00000000*/
uint32_t UCA_LAST_SECONDARY_IGNORABLE[2]; /*0x00000500*/
uint32_t UCA_LAST_PRIMARY_IGNORABLE[2]; /*0x0000DD05*/
uint32_t UCA_FIRST_VARIABLE[2]; /*0x05070505*/
uint32_t UCA_LAST_VARIABLE[2]; /*0x13CF0505*/
uint32_t UCA_FIRST_NON_VARIABLE[2]; /*0x16200505*/
uint32_t UCA_LAST_NON_VARIABLE[2]; /*0x767C0505*/
uint32_t UCA_RESET_TOP_VALUE[2]; /*0x9F000303*/
uint32_t UCA_FIRST_IMPLICIT[2];
uint32_t UCA_LAST_IMPLICIT[2];
uint32_t UCA_FIRST_TRAILING[2];
uint32_t UCA_LAST_TRAILING[2];
uint32_t UCA_NEXT_TOP_VALUE; /*0xE8960303*/
#if 0
uint32_t UCA_NEXT_TOP_VALUE[2]; /*0xE8960303*/
uint32_t UCA_NEXT_FIRST_PRIMARY_IGNORABLE; /*0x00008905*/
uint32_t UCA_NEXT_LAST_PRIMARY_IGNORABLE; /*0x03000303*/
uint32_t UCA_NEXT_FIRST_SECONDARY_IGNORABLE; /*0x00008705*/
@ -559,8 +562,13 @@ typedef struct {
uint32_t UCA_NEXT_LAST_VARIABLE; /*0x16200505*/
#endif
uint32_t UCA_PRIMARY_TOP_MIN;
uint32_t UCA_PRIMARY_IMPLICIT_MIN; /*0xE8000000*/
uint32_t UCA_PRIMARY_IMPLICIT_MAX; /*0xF0000000*/
uint32_t UCA_PRIMARY_TRAILING_MIN; /*0xE8000000*/
uint32_t UCA_PRIMARY_TRAILING_MAX; /*0xF0000000*/
uint32_t UCA_PRIMARY_SPECIAL_MIN; /*0xE8000000*/
uint32_t UCA_PRIMARY_SPECIAL_MAX; /*0xF0000000*/
} UCAConstants;
typedef struct {
@ -569,7 +577,7 @@ typedef struct {
/* to get the address add to the header address and cast properly */
uint32_t options; /* these are the default options for the collator */
uint32_t UCAConsts; /* structure which holds values for indirect positioning and implicit ranges */
/*uint32_t contractionUCACombos;*/ /* this one is needed only for UCA, to copy the appropriate contractions */
uint32_t contractionUCACombos; /* this one is needed only for UCA, to copy the appropriate contractions */
uint32_t unusedReserved1; /* reserved for future use */
uint32_t mappingPosition; /* const uint8_t *mappingPosition; */
uint32_t expansion; /* uint32_t *expansion; */
@ -801,7 +809,7 @@ static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
hash = c;
if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
if(UTF_IS_TRAIL(c)) {
if(UTF_IS_LEAD(c) || UTF_IS_TRAIL(c)) {
/* Lead or trail surrogate */
/* These are always considered unsafe. */
return TRUE;

View file

@ -112,6 +112,8 @@ typedef struct {
/* they can be used to assure that the CEs will be always positioned in */
/* the same place relative to a point with known properties (e.g. first */
/* primary ignorable). */
static indirectBoundaries ucolIndirectBoundaries[11];
/*
static indirectBoundaries ucolIndirectBoundaries[11] = {
{ UCOL_RESET_TOP_VALUE, 0,
UCOL_NEXT_TOP_VALUE, 0 },
@ -136,6 +138,23 @@ static indirectBoundaries ucolIndirectBoundaries[11] = {
{ UCOL_LAST_NON_VARIABLE, 0,
0, 0 },
};
*/
/*
 * Fills entry indexR of ucolIndirectBoundaries from two-word CE arrays.
 *
 * start[0] / start[1] are stored as startCE / startContCE.
 * When end is non-null, end[0] / end[1] are stored as limitCE / limitContCE;
 * when end is null, both limit fields are set to 0.
 * indexR is used as given - no range check is done here.
 */
static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
    ucolIndirectBoundaries[indexR].startCE = start[0];
    ucolIndirectBoundaries[indexR].startContCE = start[1];

    ucolIndirectBoundaries[indexR].limitCE = end ? end[0] : 0;
    ucolIndirectBoundaries[indexR].limitContCE = end ? end[1] : 0;
}
void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, UCollator *UCA, UErrorCode *status) {
uint32_t nSize = 0;
@ -203,8 +222,9 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
// rulesToParse = src->source;
src->lh = 0;
src->lh = (UColTokListHeader *)uprv_malloc(512*sizeof(UColTokListHeader));
/* test for NULL */
src->listCapacity = 1024;
src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
//Test for NULL
if (src->lh == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return;
@ -212,13 +232,30 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
src->resultLen = 0;
UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
// Set values for the top - TODO: once we have values for all the indirects, we are going
// to initalize here.
ucolIndirectBoundaries[0].startCE = consts->UCA_RESET_TOP_VALUE;
ucolIndirectBoundaries[0].startContCE = 0;
ucolIndirectBoundaries[0].limitCE = consts->UCA_NEXT_TOP_VALUE;
ucolIndirectBoundaries[0].limitContCE = 0;
// UCOL_RESET_TOP_VALUE
setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
// UCOL_FIRST_PRIMARY_IGNORABLE
setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
// UCOL_LAST_PRIMARY_IGNORABLE
setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
// UCOL_FIRST_SECONDARY_IGNORABLE
setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
// UCOL_LAST_SECONDARY_IGNORABLE
setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
// UCOL_FIRST_TERTIARY_IGNORABLE
setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
// UCOL_LAST_TERTIARY_IGNORABLE
setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
// UCOL_FIRST_VARIABLE
setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
// UCOL_LAST_VARIABLE
setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
// UCOL_FIRST_NON_VARIABLE
setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
// UCOL_LAST_NON_VARIABLE
setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, 0);
}
static inline
@ -750,7 +787,14 @@ ucol_tok_parseNextToken(UColTokenParser *src,
*src->extraCurrent++ = 0xFFFE;
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
newCharsLen = 3;
if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
newCharsLen = 3;
} else {
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
newCharsLen = 5;
}
src->current++;
goto EndOfLoop;
} else {
@ -964,6 +1008,15 @@ Processing Description
static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
UParseError *parseError, UErrorCode *status) {
if(src->resultLen == src->listCapacity) {
// Unfortunately, this won't work, as we store addresses of lhs in token
src->listCapacity *= 2;
src->lh = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*sizeof(UColTokListHeader));
if(src->lh == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
}
/* do the reset thing */
UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
/* test for NULL */
@ -1024,6 +1077,7 @@ static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint3
}
src->resultLen++;
uhash_put(src->tailored, sourceToken, sourceToken, status);
return sourceToken;

View file

@ -111,6 +111,7 @@ typedef struct {
UHashtable *tailored;
UColOptionSet *opts;
uint32_t resultLen;
uint32_t listCapacity;
UColTokListHeader *lh;
UColToken *varTop;
} UColTokenParser;