ICU-1083 Optimization of surrogate operations

X-SVN-Rev: 5452
This commit is contained in:
Vladimir Weinstein 2001-08-10 20:30:44 +00:00
parent b3c2c3cbbb
commit 20053b3398
7 changed files with 170 additions and 280 deletions

View file

@ -489,7 +489,7 @@ ucol_close(UCollator *coll)
}
}
if(coll->mapping != NULL) {
ucmp32_close(coll->mapping);
ucmpe32_close(coll->mapping);
}
if(coll->rules != NULL && coll->freeRulesOnClose) {
uprv_free((UChar *)coll->rules);
@ -722,7 +722,7 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr
result->image = image;
const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
CompactIntArray *newUCAmapping = ucmp32_openFromData(&mapping, status);
CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);
if(U_SUCCESS(*status)) {
result->mapping = newUCAmapping;
} else {
@ -1106,7 +1106,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
}
else
{
order = ucmp32_get(coll->mapping, ch); /* we'll go for slightly slower trie */
order = ucmpe32_get(coll->mapping, ch); /* we'll go for slightly slower trie */
if(order > UCOL_NOT_FOUND) { /* if a CE is special */
order = getSpecialCE(coll, order, collationSource, status); /* and try to get the special CE */
}
@ -1395,7 +1395,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
result = UCOL_THAI;
}
else {
result = ucmp32_get(coll->mapping, ch);
result = ucmpe32_get(coll->mapping, ch);
}
if (result > UCOL_NOT_FOUND) {
result = getSpecialPrevCE(coll, result, data, status);
@ -1434,7 +1434,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
uint32_t order;
/* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
order = ucmp32_get(UCA->mapping, ch);
order = ucmpe32_get(UCA->mapping, ch);
if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
order = getSpecialCE(UCA, order, collationSource, status);
@ -1472,12 +1472,12 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
// return the first CE, but first put the rest into the expansion buffer
if (!collationSource->coll->image->jamoSpecial) { // FAST PATH
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, V);
*(collationSource->CEpos++) = ucmpe32_get(UCA->mapping, V);
if (T != TBase) {
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
*(collationSource->CEpos++) = ucmpe32_get(UCA->mapping, T);
}
return ucmp32_get(UCA->mapping, L); // return first one
return ucmpe32_get(UCA->mapping, L); // return first one
} else { // Jamo is Special
collIterate jamos;
@ -1585,7 +1585,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
}
else {
*/
order = ucmp32_get(UCA->mapping, ch);
order = ucmpe32_get(UCA->mapping, ch);
//}
}
@ -1635,10 +1635,10 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
*/
if (!collationSource->coll->image->jamoSpecial)
{
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, L);
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V);
*(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, L);
*(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, V);
if (T != TBase)
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, T);
*(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, T);
collationSource->toReturn = collationSource->CEpos - 1;
return *(collationSource->toReturn);
@ -2139,8 +2139,19 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
/* This one is not found, and we'll let somebody else bother about it... no more games */
return CE;
case SURROGATE_TAG:
/* pending surrogate discussion with Markus and Mark */
return UCOL_NOT_FOUND;
/* we encountered a leading surrogate. We shall get the CE by using the following code unit */
/* two things can happen here: next code point can be a trailing surrogate - we will use it */
/* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
/* we return 0 (completely ignorable - per UCA specification) */
{
UChar trail;
if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
return 0;
} else {
CE = ucmpe32_getSurrogate(coll->mapping, CE, trail);
}
}
break;
case THAI_TAG:
/* Thai/Lao reordering */
if (((source->flags) & UCOL_ITER_INNORMBUF) || /* Already Swapped || */
@ -4328,7 +4339,7 @@ U_CAPI UBool isTailored(const UCollator *coll, const UChar u, UErrorCode *status
return FALSE;
}
} else { /* regular */
CE = ucmp32_get(coll->mapping, u);
CE = ucmpe32_get(coll->mapping, u);
}
if(isContraction(CE)) {

View file

@ -865,6 +865,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
if(U_SUCCESS(*status)) {
ucol_initBuffers(&src->lh[i], status);
}
}
if(src->varTop != NULL) { /* stuff the variable top value */
@ -911,7 +912,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
/* add latin-1 stuff */
if(U_SUCCESS(*status)) {
for(u = 0; u<0x100; u++) {
if((CE = ucmp32_get(t->mapping, u)) == UCOL_NOT_FOUND
if((CE = ucmpe32_get(t->mapping, u)) == UCOL_NOT_FOUND
/* this test is for contractions that are missing the starting element. Looks like latin-1 should be done before assembling */
/* the table, even if it results in more false closure elements */
|| ((isContraction(CE)) &&
@ -941,7 +942,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
while(*conts != 0) {
tailoredCE = ucmp32_get(t->mapping, *conts);
tailoredCE = ucmpe32_get(t->mapping, *conts);
if(tailoredCE != UCOL_NOT_FOUND) {
UBool needToAdd = TRUE;
if(isContraction(tailoredCE)) {
@ -1055,159 +1056,3 @@ const InverseTableHeader *ucol_initInverseUCA(UErrorCode *status) {
return invUCA;
}
#if 0
/* Resolves a "special" CE (one whose tag marks it as not-found, charset, surrogate, */
/* contraction, expansion or Thai) against the temporary table t while a tailoring is being built. */
/* In this section it is called from uprv_ucol_getNextDynamicCE. */
/* */
/* t      - temporary table; its contractions and expansions members are read here */
/* CE     - the special CE to resolve */
/* source - iterator over the text; source->pos may be advanced or rewound, and extra CEs of an */
/*          expansion are appended through source->CEpos */
/* status - set to U_INTERNAL_PROGRAM_ERROR when the tag is not one of the handled cases */
/* */
/* Returns the resolved CE, or UCOL_NOT_FOUND for charset/surrogate tags; a CE that already */
/* carries NOT_FOUND_TAG is returned unchanged. */
/* NOTE(review): headersize (used in the expansion branch) is not declared in this section - */
/* presumably a file-level value; confirm before re-enabling this code. */
uint32_t uprv_getSpecialDynamicCE(const tempUCATable *t, uint32_t CE, collIterate *source, UErrorCode *status) {
uint32_t i = 0; /* general counter */
/* fallback CE remembered while descending through nested contractions */
uint32_t firstCE = UCOL_NOT_FOUND;
/* position to rewind to if the contraction match fails */
UChar *firstUChar = source->pos;
//uint32_t CE = *source->CEpos;
/* keep resolving until CE is no longer special (see the test at the bottom of the loop) */
for (;;) {
const uint32_t *CEOffset = NULL;
const UChar *UCharOffset = NULL;
UChar schar, tchar;
uint32_t size = 0;
switch(getCETag(CE)) {
case NOT_FOUND_TAG:
/* This one is not found, and we'll let somebody else bother about it... no more games */
return CE;
case CHARSET_TAG:
case SURROGATE_TAG:
/* neither tag is resolved here; the caller sees UCOL_NOT_FOUND */
return UCOL_NOT_FOUND;
case CONTRACTION_TAG:
/* This should handle contractions */
for (;;) {
/* First we position ourselves at the beginning of contraction sequence */
/*const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);*/
ContractionTable *ctb = t->contractions->elements[getContractOffset(CE)];
const UChar *ContractionStart = UCharOffset = ctb->codePoints;
if (source->pos>=source->endp) {
/* this is the end of string. (Null terminated handled later,
when the null doesn't match the contraction sequence.) */
{
/*CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));*/ /* So we'll pick whatever we have at the point... */
CE = *(ctb->CEs+(UCharOffset - ContractionStart)); /* So we'll pick whatever we have at the point... */
if (CE == UCOL_NOT_FOUND) {
source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
if(firstCE != UCOL_NOT_FOUND) {
CE = firstCE;
}
}
}
break;
}
/* we need to convey the notion of having a backward search - most probably through the context object */
/* if (backwardsSearch) offset += contractionUChars[(int16_t)offset]; else UCharOffset++; */
UCharOffset++; /* skip the backward offset, see above */
schar = *source->pos++;
/* NOTE(review): this scan has no explicit bound; presumably the code point list ends with a */
/* sentinel that is >= any source char (0xFFFF?) - confirm against the table builder */
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
UCharOffset++;
}
if(schar != tchar) { /* we didn't find the correct codepoint. We can use either the first or the last CE */
UCharOffset = ContractionStart; /* We're not at the end, bailed out in the middle. Better use starting CE */
/*source->pos = firstUChar; *//* spit all the not found chars, which led us in this contraction */
source->pos--; /* Spit out the last char of the string, wasn't tasty enough */
}
/*CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));*/
/* the CE array is indexed in parallel with the code point array */
CE = *(ctb->CEs + (UCharOffset - ContractionStart));
if(CE == UCOL_NOT_FOUND) {
source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
if(firstCE != UCOL_NOT_FOUND) {
CE = firstCE;
}
break;
} else if(isContraction(CE)) { /* fix for the bug. Other places need to be checked */
/* this is contraction, and we will continue. However, we can fail along the */
/* road, which means that we have part of contraction correct */
/*uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));*/
uint32_t tempCE = *(ctb->CEs);
if(tempCE != UCOL_NOT_FOUND) {
/* remember the CE and text position of the partial match as a fallback */
firstCE = *(ctb->CEs);
/*firstCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));*/
firstUChar = source->pos-1;
}
} else {
break;
}
}
break;
case EXPANSION_TAG:
case THAI_TAG:
/* This should handle expansion. */
/* NOTE: we can encounter both continuations and expansions in an expansion! */
/* I have to decide where continuations are going to be dealt with */
CEOffset = t->expansions->CEs+(getExpansionOffset(CE) - (headersize>>2)); /* find the offset to expansion table */
size = getExpansionCount(CE);
/* the first CE of the expansion is returned; the rest are queued in source->CEpos */
CE = *CEOffset++;
if(size != 0) { /* a non-zero count gives the number of CEs; the list is not zero-terminated */
for(i = 1; i<size; i++) {
*(source->CEpos++) = *CEOffset++;
}
} else { /* a zero count means the list is terminated by a 0 CE */
while(*CEOffset != 0) {
*(source->CEpos++) = *CEOffset++;
}
}
return CE;
default:
*status = U_INTERNAL_PROGRAM_ERROR;
CE=0;
break;
}
/* stop as soon as the CE is no longer above UCOL_NOT_FOUND, i.e. not special any more */
if (CE <= UCOL_NOT_FOUND) break;
}
return CE;
}
/* Produces the next collation element for the text in collationSource, looking code units up */
/* in the temporary table t. CEs queued by an earlier expansion are handed out first. Returns */
/* UCOL_NO_MORE_CES once the text is exhausted. A special CE that cannot be resolved from t */
/* (UCOL_NOT_FOUND) is looked up through ucol_getNextUCA instead. */
uint32_t uprv_ucol_getNextDynamicCE(tempUCATable *t, collIterate *collationSource, UErrorCode *status) {
    /* Drain CEs buffered by a previous expansion before touching the text. */
    if (collationSource->toReturn < collationSource->CEpos) {
        uint32_t buffered = *(collationSource->toReturn++);
        if (collationSource->toReturn == collationSource->CEpos) {
            /* buffer fully consumed: rewind both cursors to its start */
            collationSource->toReturn = collationSource->CEs;
            collationSource->CEpos = collationSource->toReturn;
        }
        return buffered;
    }

    /* Nothing buffered and no text left. */
    if (collationSource->endp <= collationSource->pos) {
        return UCOL_NO_MORE_CES;
    }

    UChar unit = *collationSource->pos++;
    uint32_t ce = ucmp32_get(t->mapping, unit); /* trie lookup in the table under construction */
    if (ce < UCOL_NOT_FOUND) {
        return ce; /* ordinary CE, nothing more to do */
    }

    /* special CE: resolve it against the temporary table, then fall back to the UCA */
    ce = uprv_getSpecialDynamicCE(t, ce, collationSource, status);
    if (ce == UCOL_NOT_FOUND) {
        ce = ucol_getNextUCA(unit, collationSource, status);
    }
    return ce;
}
/* Collects the collation elements of decomp[0..noOfDec) into result, using the temporary */
/* table t and the UCA collator held by src. */
/* */
/* result     - output buffer for the CEs */
/* resultSize - capacity of result, in elements; no store is ever made at or beyond this index */
/* status     - passed through to uprv_ucol_getNextDynamicCE */
/* */
/* Returns the number of CEs stored. When there is room, the UCOL_NO_MORE_CES terminator is */
/* also stored at result[returned count] but is not counted. A return value equal to */
/* resultSize means the buffer filled up before the terminator was seen, so the output may be */
/* truncated. Previously resultSize was ignored and the loop could write past the buffer. */
uint32_t ucol_getDynamicCEs(UColTokenParser *src, tempUCATable *t, UChar *decomp, uint32_t noOfDec, uint32_t *result, uint32_t resultSize, UErrorCode *status) {
    uint32_t resLen = 0;
    collIterate colIt;

    init_collIterate(src->UCA, decomp, noOfDec, &colIt);

    /* every store is guarded by the capacity check in the loop condition */
    while (resLen < resultSize) {
        result[resLen] = uprv_ucol_getNextDynamicCE(t, &colIt, status);
        if (result[resLen] == UCOL_NO_MORE_CES) {
            break; /* terminator stored but not counted */
        }
        resLen++;
    }
    return resLen;
}
#endif

View file

@ -39,7 +39,7 @@ void uprv_growTable(ContractionTable *tbl, UErrorCode *status) {
}
}
CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status) {
CntTable *uprv_cnttab_open(CompactEIntArray *mapping, UErrorCode *status) {
if(U_FAILURE(*status)) {
return 0;
}
@ -151,11 +151,11 @@ int32_t uprv_cnttab_constructTable(CntTable *table, uint32_t mainOffset, UErrorC
uint32_t CE;
for(i = 0; i<=0xFFFF; i++) {
CE = ucmp32_get(table->mapping, i);
for(i = 0; i<=0x10FFFF; i++) {
CE = ucmpe32_get(table->mapping, i);
if(isContraction(CE)) {
CE = constructContractCE(table->offsets[getContractOffset(CE)]);
ucmp32_set(table->mapping, (UChar)i, CE);
ucmpe32_set(table->mapping, i, CE);
}
}

View file

@ -21,7 +21,7 @@
#ifndef UCOL_CNTTABLE_H
#define UCOL_CNTTABLE_H
#include "ucmp32.h"
#include "ucmpe32.h"
#include "uhash.h"
#include "ucol_elm.h"
@ -37,7 +37,7 @@ struct ContractionTable {
struct CntTable {
ContractionTable **elements;
CompactIntArray *mapping;
CompactEIntArray *mapping;
UChar *codePoints;
uint32_t *CEs;
int32_t *offsets;
@ -46,7 +46,7 @@ struct CntTable {
int32_t capacity;
};
CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status);
CntTable *uprv_cnttab_open(CompactEIntArray *mapping, UErrorCode *status);
CntTable *uprv_cnttab_clone(CntTable *table);
void uprv_cnttab_close(CntTable *table);

View file

@ -128,7 +128,7 @@ tempUCATable * uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts
t->UCA = UCA;
t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
uprv_memset(t->expansions, 0, sizeof(ExpansionTable));
t->mapping = ucmp32_open(UCOL_NOT_FOUND);
t->mapping = ucmpe32_open(UCOL_NOT_FOUND, UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24), status);
t->contractions = uprv_cnttab_open(t->mapping, status);
/* copy UCA's maxexpansion and merge as we go along */
@ -179,13 +179,7 @@ tempUCATable *uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
/* mapping */
if(t->mapping != NULL) {
uint16_t *index = (uint16_t *)uprv_malloc(sizeof(uint16_t)*t->mapping->fCount);
int32_t *array = (int32_t *)uprv_malloc(sizeof(int32_t)*t->mapping->fCount);
uprv_memcpy(array, t->mapping->fArray, t->mapping->fCount*sizeof(int32_t));
uprv_memcpy(index, t->mapping->fIndex, UCMP32_kIndexCount*sizeof(uint16_t));
r->mapping = ucmp32_openAdopt(index, array, t->mapping->fCount);
r->mapping = ucmpe32_clone(t->mapping, status);
}
/* expansions */
@ -266,7 +260,7 @@ void uprv_uca_closeTempTable(tempUCATable *t) {
if(t->contractions != NULL) {
uprv_cnttab_close(t->contractions);
}
ucmp32_close(t->mapping);
ucmpe32_close(t->mapping);
uprv_free(t->maxExpansions->endExpansionCE);
uprv_free(t->maxExpansions->expansionCESize);
@ -544,34 +538,127 @@ void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t) {
}
}
uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, UCAElements *element, UErrorCode *status) {
uint32_t i = 0;
for (i=1; i<element->cSize; i++) { /* First add contraction chars to unsafe CP hash table */
unsafeCPSet(t->unsafeCP, element->cPoints[i]);
// Note regarding surrogate handling: We are interested only in the single
// or leading surrogates in a contraction. If a surrogate is somewhere else
// in the contraction, it is going to be handled as a pair of code units,
// as it doesn't affect the performance AND handling surrogates specially
// would complicate code way too much.
uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE,
UCAElements *element, UErrorCode *status) {
CntTable *contractions = t->contractions;
UChar32 cp;
uint32_t cpsize = 0;
// First we need to check whether the contraction starts with a surrogate
UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);
if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first
uint32_t j = 0;
for (j=1; j<element->cSize; j++) { /* First add contraction chars to unsafe CP hash table */
unsafeCPSet(t->unsafeCP, element->cPoints[j]);
}
// Add the last char of the contraction to the contraction-end hash table.
ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
if(UCOL_ISJAMO(element->cPoints[0])) {
t->image->jamoSpecial = TRUE;
}
/* then we need to deal with it */
/* we could already have something in table - or we might not */
/* The fact is that we want to add or modify an existing contraction */
/* and add it backwards then */
element->cPoints+=cpsize;
element->cSize-=cpsize;
if(!isContraction(CE)) {
/* if it wasn't contraction, we wouldn't end up here*/
int32_t firstContractionOffset = 0;
int32_t contractionOffset = 0;
firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
CE = constructContractCE(firstContractionOffset);
} else { /* we are adding to existing contraction */
/* there were already some elements in the table, so we need to add a new contraction */
/* Two things can happen here: either the codepoint is already in the table, or it is not */
int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status);
if(position > 0) { /* if it is we just continue down the chain */
uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status);
} else { /* if it isn't, we will have to create a new sequence */
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status);
}
}
element->cPoints-=cpsize;
element->cSize+=cpsize;
ucmpe32_set(t->mapping, cp, CE);
} else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */
ucmpe32_set(t->mapping, cp, element->mapCE);
} else { /* fill out the first stage of the contraction with the surrogate CE */
uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status);
uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status);
}
// Add the last char of the contraction to the contraction-end hash table.
ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
return CE;
}
if(UCOL_ISJAMO(element->cPoints[0])) {
t->image->jamoSpecial = TRUE;
uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
int32_t firstContractionOffset = 0;
int32_t contractionOffset = 0;
uint32_t contractionElement = UCOL_NOT_FOUND;
if(U_FAILURE(*status)) {
return UCOL_NOT_FOUND;
}
/* then we need to deal with it */
/* we could already have something in table - or we might not */
/* The fact is that we want to add or modify an existing contraction */
/* and add it backwards then */
uint32_t result = uprv_uca_processContraction(t->contractions, element, CE, status);
if(CE == UCOL_NOT_FOUND || !isContraction(CE)) {
ucmp32_set(t->mapping, element->cPoints[0], result);
/* end of recursion */
if(element->cSize == 1) {
if(isContraction(existingCE)) {
uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
return existingCE;
} else {
return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
}
}
return result;
/* this recursion currently feeds on the only element we have... We will have to copy it in order to accommodate */
/* for both backward and forward cycles */
/* we encountered either an empty space or a non-contraction element */
/* this means we are constructing a new contraction sequence */
element->cPoints++;
element->cSize--;
if(!isContraction(existingCE)) {
/* if it wasn't contraction, we wouldn't end up here*/
firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
existingCE = constructContractCE(firstContractionOffset);
} else { /* we are adding to existing contraction */
/* there were already some elements in the table, so we need to add a new contraction */
/* Two things can happen here: either the codepoint is already in the table, or it is not */
int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status);
if(position > 0) { /* if it is we just continue down the chain */
uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
} else { /* if it isn't, we will have to create a new sequence */
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
}
}
element->cPoints--;
element->cSize++;
return existingCE;
}
/* This adds a read element, while testing for existence */
uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) {
CompactIntArray *mapping = t->mapping;
CompactEIntArray *mapping = t->mapping;
ExpansionTable *expansions = t->expansions;
CntTable *contractions = t->contractions;
@ -619,10 +706,14 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode
}
}
CE = ucmp32_get(mapping, element->cPoints[0]);
if(element->cSize > 1) { /* we're adding a contraction */
/* OR A SURROGATE - HERE IS WHERE THE DISTINCTION HAS TO BE MADE! */
uint32_t i = 0;
UChar32 cp;
UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp);
CE = ucmpe32_get(mapping, cp);
UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements));
uprv_memcpy(composed, element, sizeof(UCAElements));
composed->cPoints = composed->uchars;
@ -639,22 +730,23 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode
uprv_free(composed);
CE = uprv_uca_addContraction(t, CE, element, status);
} else { /* easy case, */
CE = ucmpe32_get(mapping, element->cPoints[0]);
if( CE != UCOL_NOT_FOUND) {
if(isContraction(CE)) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
uprv_cnttab_setContraction(contractions, CE, 0, 0, element->mapCE, status);
/* This loop has to change the CE at the end of contraction REDO!*/
uprv_cnttab_changeLastCE(contractions, CE, element->mapCE, status);
} else {
ucmp32_set(mapping, element->cPoints[0], element->mapCE);
ucmpe32_set(mapping, element->cPoints[0], element->mapCE);
#ifdef UCOL_DEBUG
fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]);
//*status = U_ILLEGAL_ARGUMENT_ERROR;
#endif
}
} else {
ucmp32_set(mapping, element->cPoints[0], element->mapCE);
ucmpe32_set(mapping, element->cPoints[0], element->mapCE);
}
}
@ -662,66 +754,8 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode
return CE;
}
/* Recursively threads the code points of element into the contraction table. */
/* */
/* contractions - table being built */
/* element      - sequence being added; cPoints/cSize are advanced for the recursive call and */
/*                restored before returning */
/* existingCE   - CE currently stored for the prefix handled so far (may be UCOL_NOT_FOUND) */
/* status       - on entry, a failure makes the function return UCOL_NOT_FOUND immediately */
/* */
/* Returns the CE to store for the current prefix: element->mapCE at the end of the sequence */
/* when existingCE is not a contraction, existingCE itself when an existing contraction was */
/* updated, or a newly constructed contraction CE. */
uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return UCOL_NOT_FOUND;
    }

    /* The recursion bottoms out on the last code point of the sequence. */
    if(element->cSize == 1) {
        if(!isContraction(existingCE)) {
            /* can't do just that. existingCE might be a contraction, meaning that we need to do another step */
            return element->mapCE;
        }
        /* overwrite both the first (0) and the 0xFFFF entry of the existing contraction */
        uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
        uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
        return existingCE;
    }

    /* The recursion reuses the single element we were given; to support both backward and */
    /* forward cycles it would have to be copied. */

    if(existingCE != UCOL_NOT_FOUND && isContraction(existingCE)) {
        /* Extending an existing contraction: the next code point is either present or not. */
        int32_t where = uprv_cnttab_findCP(contractions, existingCE, element->cPoints[1], status);
        uint32_t tailCE;

        element->cPoints++;
        element->cSize--;
        if(where > 0) {
            /* already present: continue down the chain from the CE stored there */
            uint32_t chainedCE = uprv_cnttab_getCE(contractions, existingCE, where, status);
            tailCE = uprv_uca_processContraction(contractions, element, chainedCE, status);
            uprv_cnttab_setContraction(contractions, existingCE, where, *(element->cPoints), tailCE, status);
        } else {
            /* absent: build a fresh sequence and insert it */
            tailCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
            uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), tailCE, status);
        }
        element->cPoints--;
        element->cSize++;
        return existingCE;
    }

    /* Empty slot or a non-contraction CE: start a brand new contraction sequence. */
    int32_t firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
    UChar nextUnit = element->cPoints[1];

    element->cPoints++;
    element->cSize--;
    uint32_t tailCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
    element->cPoints--;
    element->cSize++;

    /* the entry for the next code point, then the closing 0xFFFF entry carrying the old CE */
    uprv_cnttab_addContraction(contractions, firstContractionOffset, nextUnit, tailCE, status);
    uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
    return constructContractCE(firstContractionOffset);
}
void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping,
MaxExpansionTable *maxexpansion,
MaxJamoExpansionTable *maxjamoexpansion,
UBool jamospecial,
@ -737,7 +771,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
uint32_t ce;
while (v >= VBASE) {
ce = ucmp32_get(mapping, v);
ce = ucmpe32_get(mapping, v);
if (ce < UCOL_SPECIAL_FLAG) {
uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status);
}
@ -746,7 +780,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
while (t >= TBASE)
{
ce = ucmp32_get(mapping, t);
ce = ucmpe32_get(mapping, t);
if (ce < UCOL_SPECIAL_FLAG) {
uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status);
}
@ -780,7 +814,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
CompactIntArray *mapping = t->mapping;
CompactEIntArray *mapping = t->mapping;
ExpansionTable *expansions = t->expansions;
CntTable *contractions = t->contractions;
MaxExpansionTable *maxexpansion = t->maxExpansions;
@ -794,9 +828,9 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
int32_t contractionsSize = 0;
contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status);
ucmp32_compact(mapping, 1);
ucmpe32_compact(mapping);
UMemoryStream *ms = uprv_mstrm_openNew(8192);
int32_t mappingSize = ucmp32_flattenMem(mapping, ms);
int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);
const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);
/* sets jamo expansions */
@ -880,7 +914,7 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
uint32_t *store = (uint32_t*)(dataStart+tableOffset);
int32_t i = 0;
for(i = 0; i<=0xFF; i++) {
*(store++) = ucmp32_get(mapping,i);
*(store++) = ucmpe32_get(mapping,i);
tableOffset+=sizeof(uint32_t);
}

View file

@ -69,7 +69,7 @@ typedef struct {
} MaxExpansionTable;
typedef struct {
CompactIntArray *mapping;
CompactEIntArray *mapping;
ExpansionTable *expansions;
struct CntTable *contractions;
UCATableHeader *image;

View file

@ -27,7 +27,7 @@
#define UCOL_IMP_H
#include "unicode/ucol.h"
#include "ucmp32.h"
#include "ucmpe32.h"
#include "unicode/ures.h"
#include "unicode/udata.h"
@ -216,7 +216,7 @@ struct UCollationElements
if(ch <= 0xFF) { \
(order) = (coll)->latinOneMapping[ch]; \
} else { \
(order) = ucmp32_get((coll)->mapping, ch); \
(order) = ucmpe32_get((coll)->mapping, ch); \
} \
if((order) >= UCOL_NOT_FOUND) { \
(order) = getSpecialCE((coll), (order), &(collationSource), (status)); \
@ -583,7 +583,7 @@ struct UCollator {
UBool freeOnClose;
UResourceBundle *rb;
const UCATableHeader *image;
CompactIntArray *mapping;
CompactEIntArray *mapping;
const uint32_t *latinOneMapping;
const uint32_t *expansion;
const UChar *contractionIndex;