mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-1083 Optimization of surrogate operations
X-SVN-Rev: 5452
This commit is contained in:
parent
b3c2c3cbbb
commit
20053b3398
7 changed files with 170 additions and 280 deletions
|
@ -489,7 +489,7 @@ ucol_close(UCollator *coll)
|
|||
}
|
||||
}
|
||||
if(coll->mapping != NULL) {
|
||||
ucmp32_close(coll->mapping);
|
||||
ucmpe32_close(coll->mapping);
|
||||
}
|
||||
if(coll->rules != NULL && coll->freeRulesOnClose) {
|
||||
uprv_free((UChar *)coll->rules);
|
||||
|
@ -722,7 +722,7 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr
|
|||
|
||||
result->image = image;
|
||||
const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition;
|
||||
CompactIntArray *newUCAmapping = ucmp32_openFromData(&mapping, status);
|
||||
CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status);
|
||||
if(U_SUCCESS(*status)) {
|
||||
result->mapping = newUCAmapping;
|
||||
} else {
|
||||
|
@ -1106,7 +1106,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou
|
|||
}
|
||||
else
|
||||
{
|
||||
order = ucmp32_get(coll->mapping, ch); /* we'll go for slightly slower trie */
|
||||
order = ucmpe32_get(coll->mapping, ch); /* we'll go for slightly slower trie */
|
||||
if(order > UCOL_NOT_FOUND) { /* if a CE is special */
|
||||
order = getSpecialCE(coll, order, collationSource, status); /* and try to get the special CE */
|
||||
}
|
||||
|
@ -1395,7 +1395,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data,
|
|||
result = UCOL_THAI;
|
||||
}
|
||||
else {
|
||||
result = ucmp32_get(coll->mapping, ch);
|
||||
result = ucmpe32_get(coll->mapping, ch);
|
||||
}
|
||||
if (result > UCOL_NOT_FOUND) {
|
||||
result = getSpecialPrevCE(coll, result, data, status);
|
||||
|
@ -1434,7 +1434,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
|
|||
uint32_t order;
|
||||
|
||||
/* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */
|
||||
order = ucmp32_get(UCA->mapping, ch);
|
||||
order = ucmpe32_get(UCA->mapping, ch);
|
||||
|
||||
if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */
|
||||
order = getSpecialCE(UCA, order, collationSource, status);
|
||||
|
@ -1472,12 +1472,12 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
|
|||
// return the first CE, but first put the rest into the expansion buffer
|
||||
if (!collationSource->coll->image->jamoSpecial) { // FAST PATH
|
||||
|
||||
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, V);
|
||||
*(collationSource->CEpos++) = ucmpe32_get(UCA->mapping, V);
|
||||
if (T != TBase) {
|
||||
*(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
|
||||
*(collationSource->CEpos++) = ucmpe32_get(UCA->mapping, T);
|
||||
}
|
||||
|
||||
return ucmp32_get(UCA->mapping, L); // return first one
|
||||
return ucmpe32_get(UCA->mapping, L); // return first one
|
||||
|
||||
} else { // Jamo is Special
|
||||
collIterate jamos;
|
||||
|
@ -1585,7 +1585,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
|
|||
}
|
||||
else {
|
||||
*/
|
||||
order = ucmp32_get(UCA->mapping, ch);
|
||||
order = ucmpe32_get(UCA->mapping, ch);
|
||||
//}
|
||||
}
|
||||
|
||||
|
@ -1635,10 +1635,10 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
|
|||
*/
|
||||
if (!collationSource->coll->image->jamoSpecial)
|
||||
{
|
||||
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, L);
|
||||
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V);
|
||||
*(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, L);
|
||||
*(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, V);
|
||||
if (T != TBase)
|
||||
*(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, T);
|
||||
*(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, T);
|
||||
|
||||
collationSource->toReturn = collationSource->CEpos - 1;
|
||||
return *(collationSource->toReturn);
|
||||
|
@ -2139,8 +2139,19 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
|
|||
/* This one is not found, and we'll let somebody else bother about it... no more games */
|
||||
return CE;
|
||||
case SURROGATE_TAG:
|
||||
/* pending surrogate discussion with Markus and Mark */
|
||||
return UCOL_NOT_FOUND;
|
||||
/* we encountered a leading surrogate. We shall get the CE by using the following code unit */
|
||||
/* two things can happen here: next code point can be a trailing surrogate - we will use it */
|
||||
/* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */
|
||||
/* we return 0 (completely ignorable - per UCA specification) */
|
||||
{
|
||||
UChar trail;
|
||||
if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) {
|
||||
return 0;
|
||||
} else {
|
||||
CE = ucmpe32_getSurrogate(coll->mapping, CE, trail);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case THAI_TAG:
|
||||
/* Thai/Lao reordering */
|
||||
if (((source->flags) & UCOL_ITER_INNORMBUF) || /* Already Swapped || */
|
||||
|
@ -4328,7 +4339,7 @@ U_CAPI UBool isTailored(const UCollator *coll, const UChar u, UErrorCode *status
|
|||
return FALSE;
|
||||
}
|
||||
} else { /* regular */
|
||||
CE = ucmp32_get(coll->mapping, u);
|
||||
CE = ucmpe32_get(coll->mapping, u);
|
||||
}
|
||||
|
||||
if(isContraction(CE)) {
|
||||
|
|
|
@ -865,6 +865,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
|||
if(U_SUCCESS(*status)) {
|
||||
ucol_initBuffers(&src->lh[i], status);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if(src->varTop != NULL) { /* stuff the variable top value */
|
||||
|
@ -911,7 +912,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
|||
/* add latin-1 stuff */
|
||||
if(U_SUCCESS(*status)) {
|
||||
for(u = 0; u<0x100; u++) {
|
||||
if((CE = ucmp32_get(t->mapping, u)) == UCOL_NOT_FOUND
|
||||
if((CE = ucmpe32_get(t->mapping, u)) == UCOL_NOT_FOUND
|
||||
/* this test is for contractions that are missing the starting element. Looks like latin-1 should be done before assembling */
|
||||
/* the table, even if it results in more false closure elements */
|
||||
|| ((isContraction(CE)) &&
|
||||
|
@ -941,7 +942,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st
|
|||
UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
|
||||
UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
|
||||
while(*conts != 0) {
|
||||
tailoredCE = ucmp32_get(t->mapping, *conts);
|
||||
tailoredCE = ucmpe32_get(t->mapping, *conts);
|
||||
if(tailoredCE != UCOL_NOT_FOUND) {
|
||||
UBool needToAdd = TRUE;
|
||||
if(isContraction(tailoredCE)) {
|
||||
|
@ -1055,159 +1056,3 @@ const InverseTableHeader *ucol_initInverseUCA(UErrorCode *status) {
|
|||
return invUCA;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
|
||||
/* It is called by both getNextCE and getNextUCA */
|
||||
uint32_t uprv_getSpecialDynamicCE(const tempUCATable *t, uint32_t CE, collIterate *source, UErrorCode *status) {
|
||||
uint32_t i = 0; /* general counter */
|
||||
uint32_t firstCE = UCOL_NOT_FOUND;
|
||||
UChar *firstUChar = source->pos;
|
||||
//uint32_t CE = *source->CEpos;
|
||||
for (;;) {
|
||||
const uint32_t *CEOffset = NULL;
|
||||
const UChar *UCharOffset = NULL;
|
||||
UChar schar, tchar;
|
||||
uint32_t size = 0;
|
||||
switch(getCETag(CE)) {
|
||||
case NOT_FOUND_TAG:
|
||||
/* This one is not found, and we'll let somebody else bother about it... no more games */
|
||||
return CE;
|
||||
case CHARSET_TAG:
|
||||
case SURROGATE_TAG:
|
||||
return UCOL_NOT_FOUND;
|
||||
case CONTRACTION_TAG:
|
||||
/* This should handle contractions */
|
||||
for (;;) {
|
||||
/* First we position ourselves at the beginning of contraction sequence */
|
||||
/*const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);*/
|
||||
ContractionTable *ctb = t->contractions->elements[getContractOffset(CE)];
|
||||
const UChar *ContractionStart = UCharOffset = ctb->codePoints;
|
||||
|
||||
if (source->pos>=source->endp) {
|
||||
/* this is the end of string. (Null terminated handled later,
|
||||
when the null doesn't match the contraction sequence.) */
|
||||
{
|
||||
/*CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));*/ /* So we'll pick whatever we have at the point... */
|
||||
CE = *(ctb->CEs+(UCharOffset - ContractionStart)); /* So we'll pick whatever we have at the point... */
|
||||
if (CE == UCOL_NOT_FOUND) {
|
||||
source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
|
||||
if(firstCE != UCOL_NOT_FOUND) {
|
||||
CE = firstCE;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* we need to convey the notion of having a backward search - most probably through the context object */
|
||||
/* if (backwardsSearch) offset += contractionUChars[(int16_t)offset]; else UCharOffset++; */
|
||||
UCharOffset++; /* skip the backward offset, see above */
|
||||
|
||||
|
||||
schar = *source->pos++;
|
||||
while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */
|
||||
UCharOffset++;
|
||||
}
|
||||
if(schar != tchar) { /* we didn't find the correct codepoint. We can use either the first or the last CE */
|
||||
UCharOffset = ContractionStart; /* We're not at the end, bailed out in the middle. Better use starting CE */
|
||||
/*source->pos = firstUChar; *//* spit all the not found chars, which led us in this contraction */
|
||||
source->pos--; /* Spit out the last char of the string, wasn't tasty enough */
|
||||
}
|
||||
/*CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));*/
|
||||
CE = *(ctb->CEs + (UCharOffset - ContractionStart));
|
||||
|
||||
if(CE == UCOL_NOT_FOUND) {
|
||||
source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */
|
||||
if(firstCE != UCOL_NOT_FOUND) {
|
||||
CE = firstCE;
|
||||
}
|
||||
break;
|
||||
} else if(isContraction(CE)) { /* fix for the bug. Other places need to be checked */
|
||||
/* this is contraction, and we will continue. However, we can fail along the */
|
||||
/* the road, which means that we have part of contraction correct */
|
||||
/*uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));*/
|
||||
uint32_t tempCE = *(ctb->CEs);
|
||||
if(tempCE != UCOL_NOT_FOUND) {
|
||||
firstCE = *(ctb->CEs);
|
||||
/*firstCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));*/
|
||||
firstUChar = source->pos-1;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case EXPANSION_TAG:
|
||||
case THAI_TAG:
|
||||
/* This should handle expansion. */
|
||||
/* NOTE: we can encounter both continuations and expansions in an expansion! */
|
||||
/* I have to decide where continuations are going to be dealt with */
|
||||
CEOffset = t->expansions->CEs+(getExpansionOffset(CE) - (headersize>>2)); /* find the offset to expansion table */
|
||||
size = getExpansionCount(CE);
|
||||
CE = *CEOffset++;
|
||||
if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */
|
||||
for(i = 1; i<size; i++) {
|
||||
*(source->CEpos++) = *CEOffset++;
|
||||
}
|
||||
} else { /* else, we do */
|
||||
while(*CEOffset != 0) {
|
||||
*(source->CEpos++) = *CEOffset++;
|
||||
}
|
||||
}
|
||||
return CE;
|
||||
default:
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
CE=0;
|
||||
break;
|
||||
}
|
||||
if (CE <= UCOL_NOT_FOUND) break;
|
||||
}
|
||||
return CE;
|
||||
}
|
||||
|
||||
uint32_t uprv_ucol_getNextDynamicCE(tempUCATable *t, collIterate *collationSource, UErrorCode *status) {
|
||||
uint32_t order;
|
||||
if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */
|
||||
order = *(collationSource->toReturn++); /* if so, return them */
|
||||
if(collationSource->CEpos == collationSource->toReturn) {
|
||||
collationSource->CEpos = collationSource->toReturn = collationSource->CEs;
|
||||
}
|
||||
return order;
|
||||
}
|
||||
|
||||
UChar ch;
|
||||
|
||||
if (collationSource->pos >= collationSource->endp) {
|
||||
// Ran off of the end of the main source string. We're done.
|
||||
return UCOL_NO_MORE_CES;
|
||||
}
|
||||
ch = *collationSource->pos++;
|
||||
|
||||
order = ucmp32_get(t->mapping, ch); /* we'll go for slightly slower trie */
|
||||
|
||||
if(order >= UCOL_NOT_FOUND) { /* if a CE is special */
|
||||
order = uprv_getSpecialDynamicCE(t, order, collationSource, status); /* and try to get the special CE */
|
||||
|
||||
if(order == UCOL_NOT_FOUND) { /* We couldn't find a good CE in the tailoring */
|
||||
order = ucol_getNextUCA(ch, collationSource, status);
|
||||
}
|
||||
}
|
||||
|
||||
return order; /* return the CE */
|
||||
}
|
||||
|
||||
uint32_t ucol_getDynamicCEs(UColTokenParser *src, tempUCATable *t, UChar *decomp, uint32_t noOfDec, uint32_t *result, uint32_t resultSize, UErrorCode *status) {
|
||||
uint32_t resLen = 0;
|
||||
collIterate colIt;
|
||||
|
||||
init_collIterate(src->UCA, decomp, noOfDec, &colIt);
|
||||
|
||||
result[resLen] = uprv_ucol_getNextDynamicCE(t, &colIt, status);
|
||||
while(result[resLen] != UCOL_NO_MORE_CES) {
|
||||
resLen++;
|
||||
result[resLen] = uprv_ucol_getNextDynamicCE(t, &colIt, status);
|
||||
}
|
||||
|
||||
return resLen;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -39,7 +39,7 @@ void uprv_growTable(ContractionTable *tbl, UErrorCode *status) {
|
|||
}
|
||||
}
|
||||
|
||||
CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status) {
|
||||
CntTable *uprv_cnttab_open(CompactEIntArray *mapping, UErrorCode *status) {
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -151,11 +151,11 @@ int32_t uprv_cnttab_constructTable(CntTable *table, uint32_t mainOffset, UErrorC
|
|||
|
||||
|
||||
uint32_t CE;
|
||||
for(i = 0; i<=0xFFFF; i++) {
|
||||
CE = ucmp32_get(table->mapping, i);
|
||||
for(i = 0; i<=0x10FFFF; i++) {
|
||||
CE = ucmpe32_get(table->mapping, i);
|
||||
if(isContraction(CE)) {
|
||||
CE = constructContractCE(table->offsets[getContractOffset(CE)]);
|
||||
ucmp32_set(table->mapping, (UChar)i, CE);
|
||||
ucmpe32_set(table->mapping, i, CE);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
#ifndef UCOL_CNTTABLE_H
|
||||
#define UCOL_CNTTABLE_H
|
||||
|
||||
#include "ucmp32.h"
|
||||
#include "ucmpe32.h"
|
||||
#include "uhash.h"
|
||||
#include "ucol_elm.h"
|
||||
|
||||
|
@ -37,7 +37,7 @@ struct ContractionTable {
|
|||
|
||||
struct CntTable {
|
||||
ContractionTable **elements;
|
||||
CompactIntArray *mapping;
|
||||
CompactEIntArray *mapping;
|
||||
UChar *codePoints;
|
||||
uint32_t *CEs;
|
||||
int32_t *offsets;
|
||||
|
@ -46,7 +46,7 @@ struct CntTable {
|
|||
int32_t capacity;
|
||||
};
|
||||
|
||||
CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status);
|
||||
CntTable *uprv_cnttab_open(CompactEIntArray *mapping, UErrorCode *status);
|
||||
CntTable *uprv_cnttab_clone(CntTable *table);
|
||||
void uprv_cnttab_close(CntTable *table);
|
||||
|
||||
|
|
|
@ -128,7 +128,7 @@ tempUCATable * uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts
|
|||
t->UCA = UCA;
|
||||
t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable));
|
||||
uprv_memset(t->expansions, 0, sizeof(ExpansionTable));
|
||||
t->mapping = ucmp32_open(UCOL_NOT_FOUND);
|
||||
t->mapping = ucmpe32_open(UCOL_NOT_FOUND, UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24), status);
|
||||
t->contractions = uprv_cnttab_open(t->mapping, status);
|
||||
|
||||
/* copy UCA's maxexpansion and merge as we go along */
|
||||
|
@ -179,13 +179,7 @@ tempUCATable *uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) {
|
|||
|
||||
/* mapping */
|
||||
if(t->mapping != NULL) {
|
||||
uint16_t *index = (uint16_t *)uprv_malloc(sizeof(uint16_t)*t->mapping->fCount);
|
||||
int32_t *array = (int32_t *)uprv_malloc(sizeof(int32_t)*t->mapping->fCount);
|
||||
|
||||
uprv_memcpy(array, t->mapping->fArray, t->mapping->fCount*sizeof(int32_t));
|
||||
uprv_memcpy(index, t->mapping->fIndex, UCMP32_kIndexCount*sizeof(uint16_t));
|
||||
|
||||
r->mapping = ucmp32_openAdopt(index, array, t->mapping->fCount);
|
||||
r->mapping = ucmpe32_clone(t->mapping, status);
|
||||
}
|
||||
|
||||
/* expansions */
|
||||
|
@ -266,7 +260,7 @@ void uprv_uca_closeTempTable(tempUCATable *t) {
|
|||
if(t->contractions != NULL) {
|
||||
uprv_cnttab_close(t->contractions);
|
||||
}
|
||||
ucmp32_close(t->mapping);
|
||||
ucmpe32_close(t->mapping);
|
||||
|
||||
uprv_free(t->maxExpansions->endExpansionCE);
|
||||
uprv_free(t->maxExpansions->expansionCESize);
|
||||
|
@ -544,34 +538,127 @@ void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t) {
|
|||
}
|
||||
}
|
||||
|
||||
uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, UCAElements *element, UErrorCode *status) {
|
||||
uint32_t i = 0;
|
||||
|
||||
for (i=1; i<element->cSize; i++) { /* First add contraction chars to unsafe CP hash table */
|
||||
unsafeCPSet(t->unsafeCP, element->cPoints[i]);
|
||||
// Note regarding surrogate handling: We are interested only in the single
|
||||
// or leading surrogates in a contraction. If a surrogate is somewhere else
|
||||
// in the contraction, it is going to be handled as a pair of code units,
|
||||
// as it doesn't affect the performance AND handling surrogates specially
|
||||
// would complicate code way too much.
|
||||
uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE,
|
||||
UCAElements *element, UErrorCode *status) {
|
||||
CntTable *contractions = t->contractions;
|
||||
UChar32 cp;
|
||||
uint32_t cpsize = 0;
|
||||
|
||||
// First we need to check if contractions starts with a surrogate
|
||||
UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp);
|
||||
|
||||
if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first
|
||||
uint32_t j = 0;
|
||||
for (j=1; j<element->cSize; j++) { /* First add contraction chars to unsafe CP hash table */
|
||||
unsafeCPSet(t->unsafeCP, element->cPoints[j]);
|
||||
}
|
||||
// Add the last char of the contraction to the contraction-end hash table.
|
||||
ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
|
||||
if(UCOL_ISJAMO(element->cPoints[0])) {
|
||||
t->image->jamoSpecial = TRUE;
|
||||
}
|
||||
/* then we need to deal with it */
|
||||
/* we could already have something in table - or we might not */
|
||||
/* The fact is that we want to add or modify an existing contraction */
|
||||
/* and add it backwards then */
|
||||
element->cPoints+=cpsize;
|
||||
element->cSize-=cpsize;
|
||||
if(!isContraction(CE)) {
|
||||
/* if it wasn't contraction, we wouldn't end up here*/
|
||||
int32_t firstContractionOffset = 0;
|
||||
int32_t contractionOffset = 0;
|
||||
firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status);
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
|
||||
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
|
||||
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status);
|
||||
CE = constructContractCE(firstContractionOffset);
|
||||
} else { /* we are adding to existing contraction */
|
||||
/* there were already some elements in the table, so we need to add a new contraction */
|
||||
/* Two things can happen here: either the codepoint is already in the table, or it is not */
|
||||
int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status);
|
||||
if(position > 0) { /* if it is we just continue down the chain */
|
||||
uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status);
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
|
||||
uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status);
|
||||
} else { /* if it isn't, we will have to create a new sequence */
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
|
||||
uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status);
|
||||
}
|
||||
}
|
||||
element->cPoints-=cpsize;
|
||||
element->cSize+=cpsize;
|
||||
ucmpe32_set(t->mapping, cp, CE);
|
||||
} else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */
|
||||
ucmpe32_set(t->mapping, cp, element->mapCE);
|
||||
} else { /* fill out the first stage of the contraction with the surrogate CE */
|
||||
uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status);
|
||||
uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status);
|
||||
}
|
||||
// Add the last char of the contraction to the contraction-end hash table.
|
||||
ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
|
||||
return CE;
|
||||
}
|
||||
|
||||
if(UCOL_ISJAMO(element->cPoints[0])) {
|
||||
t->image->jamoSpecial = TRUE;
|
||||
|
||||
uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
|
||||
int32_t firstContractionOffset = 0;
|
||||
int32_t contractionOffset = 0;
|
||||
uint32_t contractionElement = UCOL_NOT_FOUND;
|
||||
|
||||
if(U_FAILURE(*status)) {
|
||||
return UCOL_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* then we need to deal with it */
|
||||
/* we could already have something in table - or we might not */
|
||||
/* The fact is that we want to add or modify an existing contraction */
|
||||
/* and add it backwards then */
|
||||
uint32_t result = uprv_uca_processContraction(t->contractions, element, CE, status);
|
||||
if(CE == UCOL_NOT_FOUND || !isContraction(CE)) {
|
||||
ucmp32_set(t->mapping, element->cPoints[0], result);
|
||||
/* end of recursion */
|
||||
if(element->cSize == 1) {
|
||||
if(isContraction(existingCE)) {
|
||||
uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
|
||||
uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
|
||||
return existingCE;
|
||||
} else {
|
||||
return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
/* this recursion currently feeds on the only element we have... We will have to copy it in order to accommodate */
|
||||
/* for both backward and forward cycles */
|
||||
|
||||
/* we encountered either an empty space or a non-contraction element */
|
||||
/* this means we are constructing a new contraction sequence */
|
||||
element->cPoints++;
|
||||
element->cSize--;
|
||||
if(!isContraction(existingCE)) {
|
||||
/* if it wasn't contraction, we wouldn't end up here*/
|
||||
firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
|
||||
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status);
|
||||
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
|
||||
existingCE = constructContractCE(firstContractionOffset);
|
||||
} else { /* we are adding to existing contraction */
|
||||
/* there were already some elements in the table, so we need to add a new contraction */
|
||||
/* Two things can happen here: either the codepoint is already in the table, or it is not */
|
||||
int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status);
|
||||
if(position > 0) { /* if it is we just continue down the chain */
|
||||
uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
|
||||
uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
|
||||
} else { /* if it isn't, we will have to create a new sequence */
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
|
||||
uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
|
||||
}
|
||||
}
|
||||
element->cPoints--;
|
||||
element->cSize++;
|
||||
return existingCE;
|
||||
}
|
||||
|
||||
/* This adds a read element, while testing for existence */
|
||||
uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status) {
|
||||
CompactIntArray *mapping = t->mapping;
|
||||
CompactEIntArray *mapping = t->mapping;
|
||||
ExpansionTable *expansions = t->expansions;
|
||||
CntTable *contractions = t->contractions;
|
||||
|
||||
|
@ -619,10 +706,14 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode
|
|||
}
|
||||
}
|
||||
|
||||
CE = ucmp32_get(mapping, element->cPoints[0]);
|
||||
|
||||
if(element->cSize > 1) { /* we're adding a contraction */
|
||||
/* OR A SURROGATE - HERE IS WHERE THE DISTINCTION HAS TO BE MADE! */
|
||||
uint32_t i = 0;
|
||||
UChar32 cp;
|
||||
|
||||
UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp);
|
||||
CE = ucmpe32_get(mapping, cp);
|
||||
|
||||
UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements));
|
||||
uprv_memcpy(composed, element, sizeof(UCAElements));
|
||||
composed->cPoints = composed->uchars;
|
||||
|
@ -639,22 +730,23 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode
|
|||
uprv_free(composed);
|
||||
|
||||
CE = uprv_uca_addContraction(t, CE, element, status);
|
||||
|
||||
} else { /* easy case, */
|
||||
CE = ucmpe32_get(mapping, element->cPoints[0]);
|
||||
|
||||
if( CE != UCOL_NOT_FOUND) {
|
||||
if(isContraction(CE)) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */
|
||||
uprv_cnttab_setContraction(contractions, CE, 0, 0, element->mapCE, status);
|
||||
/* This loop has to change the CE at the end of contraction REDO!*/
|
||||
uprv_cnttab_changeLastCE(contractions, CE, element->mapCE, status);
|
||||
} else {
|
||||
ucmp32_set(mapping, element->cPoints[0], element->mapCE);
|
||||
ucmpe32_set(mapping, element->cPoints[0], element->mapCE);
|
||||
#ifdef UCOL_DEBUG
|
||||
fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]);
|
||||
//*status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
#endif
|
||||
}
|
||||
} else {
|
||||
ucmp32_set(mapping, element->cPoints[0], element->mapCE);
|
||||
ucmpe32_set(mapping, element->cPoints[0], element->mapCE);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -662,66 +754,8 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode
|
|||
return CE;
|
||||
}
|
||||
|
||||
uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) {
|
||||
int32_t firstContractionOffset = 0;
|
||||
int32_t contractionOffset = 0;
|
||||
uint32_t contractionElement = UCOL_NOT_FOUND;
|
||||
|
||||
if(U_FAILURE(*status)) {
|
||||
return UCOL_NOT_FOUND;
|
||||
}
|
||||
|
||||
/* end of recursion */
|
||||
if(element->cSize == 1) {
|
||||
if(isContraction(existingCE)) {
|
||||
uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status);
|
||||
uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status);
|
||||
return existingCE;
|
||||
} else {
|
||||
return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */
|
||||
}
|
||||
}
|
||||
|
||||
/* this recursion currently feeds on the only element we have... We will have to copy it in order to accommodate */
|
||||
/* for both backward and forward cycles */
|
||||
|
||||
/* we encountered either an empty space or a non-contraction element */
|
||||
/* this means we are constructing a new contraction sequence */
|
||||
if(existingCE == UCOL_NOT_FOUND || !isContraction(existingCE)) {
|
||||
/* if it wasn't contraction, we wouldn't end up here*/
|
||||
firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status);
|
||||
|
||||
UChar toAdd = element->cPoints[1];
|
||||
element->cPoints++;
|
||||
element->cSize--;
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
|
||||
element->cPoints--;
|
||||
element->cSize++;
|
||||
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, toAdd, newCE, status);
|
||||
contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status);
|
||||
contractionElement = constructContractCE(firstContractionOffset);
|
||||
return contractionElement;
|
||||
} else { /* we are adding to existing contraction */
|
||||
/* there were already some elements in the table, so we need to add a new contraction */
|
||||
/* Two things can happen here: either the codepoint is already in the table, or it is not */
|
||||
int32_t position = uprv_cnttab_findCP(contractions, existingCE, *(element->cPoints+1), status);
|
||||
element->cPoints++;
|
||||
element->cSize--;
|
||||
if(position > 0) { /* if it is we just continue down the chain */
|
||||
uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status);
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status);
|
||||
uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status);
|
||||
} else { /* if it isn't, we will have to create a new sequence */
|
||||
uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status);
|
||||
uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status);
|
||||
}
|
||||
element->cPoints--;
|
||||
element->cSize++;
|
||||
return existingCE;
|
||||
}
|
||||
}
|
||||
|
||||
void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
|
||||
void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping,
|
||||
MaxExpansionTable *maxexpansion,
|
||||
MaxJamoExpansionTable *maxjamoexpansion,
|
||||
UBool jamospecial,
|
||||
|
@ -737,7 +771,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
|
|||
uint32_t ce;
|
||||
|
||||
while (v >= VBASE) {
|
||||
ce = ucmp32_get(mapping, v);
|
||||
ce = ucmpe32_get(mapping, v);
|
||||
if (ce < UCOL_SPECIAL_FLAG) {
|
||||
uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status);
|
||||
}
|
||||
|
@ -746,7 +780,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
|
|||
|
||||
while (t >= TBASE)
|
||||
{
|
||||
ce = ucmp32_get(mapping, t);
|
||||
ce = ucmpe32_get(mapping, t);
|
||||
if (ce < UCOL_SPECIAL_FLAG) {
|
||||
uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status);
|
||||
}
|
||||
|
@ -780,7 +814,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping,
|
|||
|
||||
|
||||
UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
|
||||
CompactIntArray *mapping = t->mapping;
|
||||
CompactEIntArray *mapping = t->mapping;
|
||||
ExpansionTable *expansions = t->expansions;
|
||||
CntTable *contractions = t->contractions;
|
||||
MaxExpansionTable *maxexpansion = t->maxExpansions;
|
||||
|
@ -794,9 +828,9 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
|
|||
int32_t contractionsSize = 0;
|
||||
contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status);
|
||||
|
||||
ucmp32_compact(mapping, 1);
|
||||
ucmpe32_compact(mapping);
|
||||
UMemoryStream *ms = uprv_mstrm_openNew(8192);
|
||||
int32_t mappingSize = ucmp32_flattenMem(mapping, ms);
|
||||
int32_t mappingSize = ucmpe32_flattenMem(mapping, ms);
|
||||
const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize);
|
||||
|
||||
/* sets jamo expansions */
|
||||
|
@ -880,7 +914,7 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
|
|||
uint32_t *store = (uint32_t*)(dataStart+tableOffset);
|
||||
int32_t i = 0;
|
||||
for(i = 0; i<=0xFF; i++) {
|
||||
*(store++) = ucmp32_get(mapping,i);
|
||||
*(store++) = ucmpe32_get(mapping,i);
|
||||
tableOffset+=sizeof(uint32_t);
|
||||
}
|
||||
|
||||
|
|
|
@ -69,7 +69,7 @@ typedef struct {
|
|||
} MaxExpansionTable;
|
||||
|
||||
typedef struct {
|
||||
CompactIntArray *mapping;
|
||||
CompactEIntArray *mapping;
|
||||
ExpansionTable *expansions;
|
||||
struct CntTable *contractions;
|
||||
UCATableHeader *image;
|
||||
|
|
|
@ -27,7 +27,7 @@
|
|||
#define UCOL_IMP_H
|
||||
|
||||
#include "unicode/ucol.h"
|
||||
#include "ucmp32.h"
|
||||
#include "ucmpe32.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/udata.h"
|
||||
|
||||
|
@ -216,7 +216,7 @@ struct UCollationElements
|
|||
if(ch <= 0xFF) { \
|
||||
(order) = (coll)->latinOneMapping[ch]; \
|
||||
} else { \
|
||||
(order) = ucmp32_get((coll)->mapping, ch); \
|
||||
(order) = ucmpe32_get((coll)->mapping, ch); \
|
||||
} \
|
||||
if((order) >= UCOL_NOT_FOUND) { \
|
||||
(order) = getSpecialCE((coll), (order), &(collationSource), (status)); \
|
||||
|
@ -583,7 +583,7 @@ struct UCollator {
|
|||
UBool freeOnClose;
|
||||
UResourceBundle *rb;
|
||||
const UCATableHeader *image;
|
||||
CompactIntArray *mapping;
|
||||
CompactEIntArray *mapping;
|
||||
const uint32_t *latinOneMapping;
|
||||
const uint32_t *expansion;
|
||||
const UChar *contractionIndex;
|
||||
|
|
Loading…
Add table
Reference in a new issue