From 20053b3398da5b49a2a5a191f120c69faf8ea5d1 Mon Sep 17 00:00:00 2001 From: Vladimir Weinstein Date: Fri, 10 Aug 2001 20:30:44 +0000 Subject: [PATCH] ICU-1083 Optimization of surrogate operations X-SVN-Rev: 5452 --- icu4c/source/i18n/ucol.cpp | 41 +++--- icu4c/source/i18n/ucol_bld.cpp | 161 +---------------------- icu4c/source/i18n/ucol_cnt.cpp | 8 +- icu4c/source/i18n/ucol_cnt.h | 6 +- icu4c/source/i18n/ucol_elm.cpp | 226 +++++++++++++++++++-------------- icu4c/source/i18n/ucol_elm.h | 2 +- icu4c/source/i18n/ucol_imp.h | 6 +- 7 files changed, 170 insertions(+), 280 deletions(-) diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index ec04f6d87ba..23e2f84dbf7 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -489,7 +489,7 @@ ucol_close(UCollator *coll) } } if(coll->mapping != NULL) { - ucmp32_close(coll->mapping); + ucmpe32_close(coll->mapping); } if(coll->rules != NULL && coll->freeRulesOnClose) { uprv_free((UChar *)coll->rules); @@ -722,7 +722,7 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr result->image = image; const uint8_t *mapping = (uint8_t*)result->image+result->image->mappingPosition; - CompactIntArray *newUCAmapping = ucmp32_openFromData(&mapping, status); + CompactEIntArray *newUCAmapping = ucmpe32_openFromData(&mapping, status); if(U_SUCCESS(*status)) { result->mapping = newUCAmapping; } else { @@ -1106,7 +1106,7 @@ inline uint32_t ucol_IGetNextCE(const UCollator *coll, collIterate *collationSou } else { - order = ucmp32_get(coll->mapping, ch); /* we'll go for slightly slower trie */ + order = ucmpe32_get(coll->mapping, ch); /* we'll go for slightly slower trie */ if(order > UCOL_NOT_FOUND) { /* if a CE is special */ order = getSpecialCE(coll, order, collationSource, status); /* and try to get the special CE */ } @@ -1395,7 +1395,7 @@ inline uint32_t ucol_IGetPrevCE(const UCollator *coll, collIterate *data, result = UCOL_THAI; } else { - result = 
ucmp32_get(coll->mapping, ch); + result = ucmpe32_get(coll->mapping, ch); } if (result > UCOL_NOT_FOUND) { result = getSpecialPrevCE(coll, result, data, status); @@ -1434,7 +1434,7 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta uint32_t order; /* if we got here, the codepoint MUST be over 0xFF - so we look directly in the trie */ - order = ucmp32_get(UCA->mapping, ch); + order = ucmpe32_get(UCA->mapping, ch); if(order > UCOL_NOT_FOUND) { /* UCA also gives us a special CE */ order = getSpecialCE(UCA, order, collationSource, status); @@ -1472,12 +1472,12 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta // return the first CE, but first put the rest into the expansion buffer if (!collationSource->coll->image->jamoSpecial) { // FAST PATH - *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, V); + *(collationSource->CEpos++) = ucmpe32_get(UCA->mapping, V); if (T != TBase) { - *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T); + *(collationSource->CEpos++) = ucmpe32_get(UCA->mapping, T); } - return ucmp32_get(UCA->mapping, L); // return first one + return ucmpe32_get(UCA->mapping, L); // return first one } else { // Jamo is Special collIterate jamos; @@ -1585,7 +1585,7 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource, } else { */ - order = ucmp32_get(UCA->mapping, ch); + order = ucmpe32_get(UCA->mapping, ch); //} } @@ -1635,10 +1635,10 @@ uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource, */ if (!collationSource->coll->image->jamoSpecial) { - *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, L); - *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V); + *(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, L); + *(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, V); if (T != TBase) - *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, T); + *(collationSource->CEpos ++) = ucmpe32_get(UCA->mapping, T); collationSource->toReturn = 
collationSource->CEpos - 1; return *(collationSource->toReturn); @@ -2139,8 +2139,19 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U /* This one is not found, and we'll let somebody else bother about it... no more games */ return CE; case SURROGATE_TAG: - /* pending surrogate discussion with Markus and Mark */ - return UCOL_NOT_FOUND; + /* we encountered a leading surrogate. We shall get the CE by using the following code unit */ + /* two things can happen here: next code point can be a trailing surrogate - we will use it */ + /* to retrieve the CE, or it is not a trailing surrogate (or the string is done). In that case */ + /* we return 0 (completely ignorable - per UCA specification */ + { + UChar trail; + if (collIter_eos(source) || !(UTF16_IS_TRAIL((trail = getNextNormalizedChar(source))))) { + return 0; + } else { + CE = ucmpe32_getSurrogate(coll->mapping, CE, trail); + } + } + break; case THAI_TAG: /* Thai/Lao reordering */ if (((source->flags) & UCOL_ITER_INNORMBUF) || /* Already Swapped || */ @@ -4328,7 +4339,7 @@ U_CAPI UBool isTailored(const UCollator *coll, const UChar u, UErrorCode *status return FALSE; } } else { /* regular */ - CE = ucmp32_get(coll->mapping, u); + CE = ucmpe32_get(coll->mapping, u); } if(isContraction(CE)) { diff --git a/icu4c/source/i18n/ucol_bld.cpp b/icu4c/source/i18n/ucol_bld.cpp index 6d8b46a4094..5891ca7f8a3 100644 --- a/icu4c/source/i18n/ucol_bld.cpp +++ b/icu4c/source/i18n/ucol_bld.cpp @@ -865,6 +865,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st if(U_SUCCESS(*status)) { ucol_initBuffers(&src->lh[i], status); } + } if(src->varTop != NULL) { /* stuff the variable top value */ @@ -911,7 +912,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st /* add latin-1 stuff */ if(U_SUCCESS(*status)) { for(u = 0; u<0x100; u++) { - if((CE = ucmp32_get(t->mapping, u)) == UCOL_NOT_FOUND + if((CE = ucmpe32_get(t->mapping, u)) == 
UCOL_NOT_FOUND /* this test is for contractions that are missing the starting element. Looks like latin-1 should be done before assembling */ /* the table, even if it results in more false closure elements */ || ((isContraction(CE)) && @@ -941,7 +942,7 @@ UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *st UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos); UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status); while(*conts != 0) { - tailoredCE = ucmp32_get(t->mapping, *conts); + tailoredCE = ucmpe32_get(t->mapping, *conts); if(tailoredCE != UCOL_NOT_FOUND) { UBool needToAdd = TRUE; if(isContraction(tailoredCE)) { @@ -1055,159 +1056,3 @@ const InverseTableHeader *ucol_initInverseUCA(UErrorCode *status) { return invUCA; } -#if 0 -/* This function handles the special CEs like contractions, expansions, surrogates, Thai */ -/* It is called by both getNextCE and getNextUCA */ -uint32_t uprv_getSpecialDynamicCE(const tempUCATable *t, uint32_t CE, collIterate *source, UErrorCode *status) { - uint32_t i = 0; /* general counter */ - uint32_t firstCE = UCOL_NOT_FOUND; - UChar *firstUChar = source->pos; - //uint32_t CE = *source->CEpos; - for (;;) { - const uint32_t *CEOffset = NULL; - const UChar *UCharOffset = NULL; - UChar schar, tchar; - uint32_t size = 0; - switch(getCETag(CE)) { - case NOT_FOUND_TAG: - /* This one is not found, and we'll let somebody else bother about it... 
no more games */ - return CE; - case CHARSET_TAG: - case SURROGATE_TAG: - return UCOL_NOT_FOUND; - case CONTRACTION_TAG: - /* This should handle contractions */ - for (;;) { - /* First we position ourselves at the begining of contraction sequence */ - /*const UChar *ContractionStart = UCharOffset = (UChar *)coll->image+getContractOffset(CE);*/ - ContractionTable *ctb = t->contractions->elements[getContractOffset(CE)]; - const UChar *ContractionStart = UCharOffset = ctb->codePoints; - - if (source->pos>=source->endp) { - /* this is the end of string. (Null terminated handled later, - when the null doesn't match the contraction sequence.) */ - { - /*CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));*/ /* So we'll pick whatever we have at the point... */ - CE = *(ctb->CEs+(UCharOffset - ContractionStart)); /* So we'll pick whatever we have at the point... */ - if (CE == UCOL_NOT_FOUND) { - source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */ - if(firstCE != UCOL_NOT_FOUND) { - CE = firstCE; - } - } - } - break; - } - - /* we need to convey the notion of having a backward search - most probably through the context object */ - /* if (backwardsSearch) offset += contractionUChars[(int16_t)offset]; else UCharOffset++; */ - UCharOffset++; /* skip the backward offset, see above */ - - - schar = *source->pos++; - while(schar > (tchar = *UCharOffset)) { /* since the contraction codepoints should be ordered, we skip all that are smaller */ - UCharOffset++; - } - if(schar != tchar) { /* we didn't find the correct codepoint. We can use either the first or the last CE */ - UCharOffset = ContractionStart; /* We're not at the end, bailed out in the middle. 
Better use starting CE */ - /*source->pos = firstUChar; *//* spit all the not found chars, which led us in this contraction */ - source->pos--; /* Spit out the last char of the string, wasn't tasty enough */ - } - /*CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));*/ - CE = *(ctb->CEs + (UCharOffset - ContractionStart)); - - if(CE == UCOL_NOT_FOUND) { - source->pos = firstUChar; /* spit all the not found chars, which led us in this contraction */ - if(firstCE != UCOL_NOT_FOUND) { - CE = firstCE; - } - break; - } else if(isContraction(CE)) { /* fix for the bug. Other places need to be checked */ - /* this is contraction, and we will continue. However, we can fail along the */ - /* th road, which means that we have part of contraction correct */ - /*uint32_t tempCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));*/ - uint32_t tempCE = *(ctb->CEs); - if(tempCE != UCOL_NOT_FOUND) { - firstCE = *(ctb->CEs); - /*firstCE = *(coll->contractionCEs + (ContractionStart - coll->contractionIndex));*/ - firstUChar = source->pos-1; - } - } else { - break; - } - } - break; - case EXPANSION_TAG: - case THAI_TAG: - /* This should handle expansion. */ - /* NOTE: we can encounter both continuations and expansions in an expansion! 
*/ - /* I have to decide where continuations are going to be dealt with */ - CEOffset = t->expansions->CEs+(getExpansionOffset(CE) - (headersize>>2)); /* find the offset to expansion table */ - size = getExpansionCount(CE); - CE = *CEOffset++; - if(size != 0) { /* if there are less than 16 elements in expansion, we don't terminate */ - for(i = 1; i<size; i++) { - *(source->CEpos++) = *CEOffset++; - } - } else { /* else, we do */ - while(*CEOffset != 0) { - *(source->CEpos++) = *CEOffset++; - } - } - return CE; - default: - *status = U_INTERNAL_PROGRAM_ERROR; - CE=0; - break; - } - if (CE <= UCOL_NOT_FOUND) break; - } - return CE; -} - -uint32_t uprv_ucol_getNextDynamicCE(tempUCATable *t, collIterate *collationSource, UErrorCode *status) { - uint32_t order; - if (collationSource->CEpos > collationSource->toReturn) { /* Are there any CEs from previous expansions? */ - order = *(collationSource->toReturn++); /* if so, return them */ - if(collationSource->CEpos == collationSource->toReturn) { - collationSource->CEpos = collationSource->toReturn = collationSource->CEs; - } - return order; - } - - UChar ch; - - if (collationSource->pos >= collationSource->endp) { - // Ran off of the end of the main source string. We're done. 
- return UCOL_NO_MORE_CES; - } - ch = *collationSource->pos++; - - order = ucmp32_get(t->mapping, ch); /* we'll go for slightly slower trie */ - - if(order >= UCOL_NOT_FOUND) { /* if a CE is special */ - order = uprv_getSpecialDynamicCE(t, order, collationSource, status); /* and try to get the special CE */ - - if(order == UCOL_NOT_FOUND) { /* We couldn't find a good CE in the tailoring */ - order = ucol_getNextUCA(ch, collationSource, status); - } - } - - return order; /* return the CE */ -} - -uint32_t ucol_getDynamicCEs(UColTokenParser *src, tempUCATable *t, UChar *decomp, uint32_t noOfDec, uint32_t *result, uint32_t resultSize, UErrorCode *status) { - uint32_t resLen = 0; - collIterate colIt; - - init_collIterate(src->UCA, decomp, noOfDec, &colIt); - - result[resLen] = uprv_ucol_getNextDynamicCE(t, &colIt, status); - while(result[resLen] != UCOL_NO_MORE_CES) { - resLen++; - result[resLen] = uprv_ucol_getNextDynamicCE(t, &colIt, status); - } - - return resLen; -} -#endif diff --git a/icu4c/source/i18n/ucol_cnt.cpp b/icu4c/source/i18n/ucol_cnt.cpp index f7f73a90442..d32c463941a 100644 --- a/icu4c/source/i18n/ucol_cnt.cpp +++ b/icu4c/source/i18n/ucol_cnt.cpp @@ -39,7 +39,7 @@ void uprv_growTable(ContractionTable *tbl, UErrorCode *status) { } } -CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status) { +CntTable *uprv_cnttab_open(CompactEIntArray *mapping, UErrorCode *status) { if(U_FAILURE(*status)) { return 0; } @@ -151,11 +151,11 @@ int32_t uprv_cnttab_constructTable(CntTable *table, uint32_t mainOffset, UErrorC uint32_t CE; - for(i = 0; i<=0xFFFF; i++) { - CE = ucmp32_get(table->mapping, i); + for(i = 0; i<=0x10FFFF; i++) { + CE = ucmpe32_get(table->mapping, i); if(isContraction(CE)) { CE = constructContractCE(table->offsets[getContractOffset(CE)]); - ucmp32_set(table->mapping, (UChar)i, CE); + ucmpe32_set(table->mapping, i, CE); } } diff --git a/icu4c/source/i18n/ucol_cnt.h b/icu4c/source/i18n/ucol_cnt.h index b94d6cfa704..39f24455929 100644 
--- a/icu4c/source/i18n/ucol_cnt.h +++ b/icu4c/source/i18n/ucol_cnt.h @@ -21,7 +21,7 @@ #ifndef UCOL_CNTTABLE_H #define UCOL_CNTTABLE_H -#include "ucmp32.h" +#include "ucmpe32.h" #include "uhash.h" #include "ucol_elm.h" @@ -37,7 +37,7 @@ struct ContractionTable { struct CntTable { ContractionTable **elements; - CompactIntArray *mapping; + CompactEIntArray *mapping; UChar *codePoints; uint32_t *CEs; int32_t *offsets; @@ -46,7 +46,7 @@ struct CntTable { int32_t capacity; }; -CntTable *uprv_cnttab_open(CompactIntArray *mapping, UErrorCode *status); +CntTable *uprv_cnttab_open(CompactEIntArray *mapping, UErrorCode *status); CntTable *uprv_cnttab_clone(CntTable *table); void uprv_cnttab_close(CntTable *table); diff --git a/icu4c/source/i18n/ucol_elm.cpp b/icu4c/source/i18n/ucol_elm.cpp index cbeec7716f9..bfb22b3df15 100644 --- a/icu4c/source/i18n/ucol_elm.cpp +++ b/icu4c/source/i18n/ucol_elm.cpp @@ -128,7 +128,7 @@ tempUCATable * uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts t->UCA = UCA; t->expansions = (ExpansionTable *)uprv_malloc(sizeof(ExpansionTable)); uprv_memset(t->expansions, 0, sizeof(ExpansionTable)); - t->mapping = ucmp32_open(UCOL_NOT_FOUND); + t->mapping = ucmpe32_open(UCOL_NOT_FOUND, UCOL_SPECIAL_FLAG | (SURROGATE_TAG<<24), status); t->contractions = uprv_cnttab_open(t->mapping, status); /* copy UCA's maxexpansion and merge as we go along */ @@ -179,13 +179,7 @@ tempUCATable *uprv_uca_cloneTempTable(tempUCATable *t, UErrorCode *status) { /* mapping */ if(t->mapping != NULL) { - uint16_t *index = (uint16_t *)uprv_malloc(sizeof(uint16_t)*t->mapping->fCount); - int32_t *array = (int32_t *)uprv_malloc(sizeof(int32_t)*t->mapping->fCount); - - uprv_memcpy(array, t->mapping->fArray, t->mapping->fCount*sizeof(int32_t)); - uprv_memcpy(index, t->mapping->fIndex, UCMP32_kIndexCount*sizeof(uint16_t)); - - r->mapping = ucmp32_openAdopt(index, array, t->mapping->fCount); + r->mapping = ucmpe32_clone(t->mapping, status); } /* expansions */ @@ -266,7 
+260,7 @@ void uprv_uca_closeTempTable(tempUCATable *t) { if(t->contractions != NULL) { uprv_cnttab_close(t->contractions); } - ucmp32_close(t->mapping); + ucmpe32_close(t->mapping); uprv_free(t->maxExpansions->endExpansionCE); uprv_free(t->maxExpansions->expansionCESize); @@ -544,34 +538,127 @@ void uprv_uca_unsafeCPAddCCNZ(tempUCATable *t) { } } -uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, UCAElements *element, UErrorCode *status) { - uint32_t i = 0; - for (i=1; i<element->cSize; i++) { /* First add contraction chars to unsafe CP hash table */ - unsafeCPSet(t->unsafeCP, element->cPoints[i]); +// Note regarding surrogate handling: We are interested only in the single +// or leading surrogates in a contraction. If a surrogate is somewhere else +// in the contraction, it is going to be handled as a pair of code units, +// as it doesn't affect the performance AND handling surrogates specially +// would complicate code way too much. +uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, + UCAElements *element, UErrorCode *status) { + CntTable *contractions = t->contractions; + UChar32 cp; + uint32_t cpsize = 0; + + // First we need to check if contractions starts with a surrogate + UTF_NEXT_CHAR(element->cPoints, cpsize, element->cSize, cp); + + if(cpsize<element->cSize) { // This is a real contraction, if there are other characters after the first + uint32_t j = 0; + for (j=1; j<element->cSize; j++) { /* First add contraction chars to unsafe CP hash table */ + unsafeCPSet(t->unsafeCP, element->cPoints[j]); + } + // Add the last char of the contraction to the contraction-end hash table. 
+ ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); + if(UCOL_ISJAMO(element->cPoints[0])) { + t->image->jamoSpecial = TRUE; + } + /* then we need to deal with it */ + /* we could aready have something in table - or we might not */ + /* The fact is that we want to add or modify an existing contraction */ + /* and add it backwards then */ + element->cPoints+=cpsize; + element->cSize-=cpsize; + if(!isContraction(CE)) { + /* if it wasn't contraction, we wouldn't end up here*/ + int32_t firstContractionOffset = 0; + int32_t contractionOffset = 0; + firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, CE, status); + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); + contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status); + contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, CE, status); + CE = constructContractCE(firstContractionOffset); + } else { /* we are adding to existing contraction */ + /* there were already some elements in the table, so we need to add a new contraction */ + /* Two things can happen here: either the codepoint is already in the table, or it is not */ + int32_t position = uprv_cnttab_findCP(contractions, CE, *element->cPoints, status); + if(position > 0) { /* if it is we just continue down the chain */ + uint32_t eCE = uprv_cnttab_getCE(contractions, CE, position, status); + uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); + uprv_cnttab_setContraction(contractions, CE, position, *(element->cPoints), newCE, status); + } else { /* if it isn't, we will have to create a new sequence */ + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); + uprv_cnttab_insertContraction(contractions, CE, *(element->cPoints), newCE, status); + } + } + element->cPoints-=cpsize; + 
element->cSize+=cpsize; + ucmpe32_set(t->mapping, cp, CE); + } else if(!isContraction(CE)) { /* this is just a surrogate, and there is no contraction */ + ucmpe32_set(t->mapping, cp, element->mapCE); + } else { /* fill out the first stage of the contraction with the surrogate CE */ + uprv_cnttab_changeContraction(contractions, CE, 0, element->mapCE, status); + uprv_cnttab_changeContraction(contractions, CE, 0xFFFF, element->mapCE, status); } - // Add the last char of the contraction to the contraction-end hash table. - ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]); + return CE; +} - if(UCOL_ISJAMO(element->cPoints[0])) { - t->image->jamoSpecial = TRUE; + +uint32_t uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) { + int32_t firstContractionOffset = 0; + int32_t contractionOffset = 0; + uint32_t contractionElement = UCOL_NOT_FOUND; + + if(U_FAILURE(*status)) { + return UCOL_NOT_FOUND; } - /* then we need to deal with it */ - /* we could aready have something in table - or we might not */ - /* The fact is that we want to add or modify an existing contraction */ - /* and add it backwards then */ - uint32_t result = uprv_uca_processContraction(t->contractions, element, CE, status); - if(CE == UCOL_NOT_FOUND || !isContraction(CE)) { - ucmp32_set(t->mapping, element->cPoints[0], result); + /* end of recursion */ + if(element->cSize == 1) { + if(isContraction(existingCE)) { + uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status); + uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status); + return existingCE; + } else { + return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */ + } } - return result; + /* this recursion currently feeds on the only element we have... 
We will have to copy it in order to accomodate */ + /* for both backward and forward cycles */ + + /* we encountered either an empty space or a non-contraction element */ + /* this means we are constructing a new contraction sequence */ + element->cPoints++; + element->cSize--; + if(!isContraction(existingCE)) { + /* if it wasn't contraction, we wouldn't end up here*/ + firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status); + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); + contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, *element->cPoints, newCE, status); + contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status); + existingCE = constructContractCE(firstContractionOffset); + } else { /* we are adding to existing contraction */ + /* there were already some elements in the table, so we need to add a new contraction */ + /* Two things can happen here: either the codepoint is already in the table, or it is not */ + int32_t position = uprv_cnttab_findCP(contractions, existingCE, *element->cPoints, status); + if(position > 0) { /* if it is we just continue down the chain */ + uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status); + uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); + uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status); + } else { /* if it isn't, we will have to create a new sequence */ + uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); + uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status); + } + } + element->cPoints--; + element->cSize++; + return existingCE; } /* This adds a read element, while testing for existence */ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements 
*element, UErrorCode *status) { - CompactIntArray *mapping = t->mapping; + CompactEIntArray *mapping = t->mapping; ExpansionTable *expansions = t->expansions; CntTable *contractions = t->contractions; @@ -619,10 +706,14 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode } } - CE = ucmp32_get(mapping, element->cPoints[0]); if(element->cSize > 1) { /* we're adding a contraction */ - /* OR A SURROGATE - HERE IS WHERE THE DISTINCTION HAS TO BE MADE! */ + uint32_t i = 0; + UChar32 cp; + + UTF_NEXT_CHAR(element->cPoints, i, element->cSize, cp); + CE = ucmpe32_get(mapping, cp); + UCAElements *composed = (UCAElements *)uprv_malloc(sizeof(UCAElements)); uprv_memcpy(composed, element, sizeof(UCAElements)); composed->cPoints = composed->uchars; @@ -639,22 +730,23 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode uprv_free(composed); CE = uprv_uca_addContraction(t, CE, element, status); - } else { /* easy case, */ + CE = ucmpe32_get(mapping, element->cPoints[0]); + if( CE != UCOL_NOT_FOUND) { if(isContraction(CE)) { /* adding a non contraction element (thai, expansion, single) to already existing contraction */ uprv_cnttab_setContraction(contractions, CE, 0, 0, element->mapCE, status); /* This loop has to change the CE at the end of contraction REDO!*/ uprv_cnttab_changeLastCE(contractions, CE, element->mapCE, status); } else { - ucmp32_set(mapping, element->cPoints[0], element->mapCE); + ucmpe32_set(mapping, element->cPoints[0], element->mapCE); #ifdef UCOL_DEBUG fprintf(stderr, "Warning - trying to overwrite existing data %08X for cp %04X with %08X\n", CE, element->cPoints[0], element->CEs[0]); //*status = U_ILLEGAL_ARGUMENT_ERROR; #endif } } else { - ucmp32_set(mapping, element->cPoints[0], element->mapCE); + ucmpe32_set(mapping, element->cPoints[0], element->mapCE); } } @@ -662,66 +754,8 @@ uint32_t uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode return CE; } -uint32_t 
uprv_uca_processContraction(CntTable *contractions, UCAElements *element, uint32_t existingCE, UErrorCode *status) { - int32_t firstContractionOffset = 0; - int32_t contractionOffset = 0; - uint32_t contractionElement = UCOL_NOT_FOUND; - if(U_FAILURE(*status)) { - return UCOL_NOT_FOUND; - } - - /* end of recursion */ - if(element->cSize == 1) { - if(isContraction(existingCE)) { - uprv_cnttab_changeContraction(contractions, existingCE, 0, element->mapCE, status); - uprv_cnttab_changeContraction(contractions, existingCE, 0xFFFF, element->mapCE, status); - return existingCE; - } else { - return element->mapCE; /*can't do just that. existingCe might be a contraction, meaning that we need to do another step */ - } - } - - /* this recursion currently feeds on the only element we have... We will have to copy it in order to accomodate */ - /* for both backward and forward cycles */ - - /* we encountered either an empty space or a non-contraction element */ - /* this means we are constructing a new contraction sequence */ - if(existingCE == UCOL_NOT_FOUND || !isContraction(existingCE)) { - /* if it wasn't contraction, we wouldn't end up here*/ - firstContractionOffset = uprv_cnttab_addContraction(contractions, UPRV_CNTTAB_NEWELEMENT, 0, existingCE, status); - - UChar toAdd = element->cPoints[1]; - element->cPoints++; - element->cSize--; - uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); - element->cPoints--; - element->cSize++; - contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, toAdd, newCE, status); - contractionOffset = uprv_cnttab_addContraction(contractions, firstContractionOffset, 0xFFFF, existingCE, status); - contractionElement = constructContractCE(firstContractionOffset); - return contractionElement; - } else { /* we are adding to existing contraction */ - /* there were already some elements in the table, so we need to add a new contraction */ - /* Two things can happen here: either the 
codepoint is already in the table, or it is not */ - int32_t position = uprv_cnttab_findCP(contractions, existingCE, *(element->cPoints+1), status); - element->cPoints++; - element->cSize--; - if(position > 0) { /* if it is we just continue down the chain */ - uint32_t eCE = uprv_cnttab_getCE(contractions, existingCE, position, status); - uint32_t newCE = uprv_uca_processContraction(contractions, element, eCE, status); - uprv_cnttab_setContraction(contractions, existingCE, position, *(element->cPoints), newCE, status); - } else { /* if it isn't, we will have to create a new sequence */ - uint32_t newCE = uprv_uca_processContraction(contractions, element, UCOL_NOT_FOUND, status); - uprv_cnttab_insertContraction(contractions, existingCE, *(element->cPoints), newCE, status); - } - element->cPoints--; - element->cSize++; - return existingCE; - } -} - -void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping, +void uprv_uca_getMaxExpansionJamo(CompactEIntArray *mapping, MaxExpansionTable *maxexpansion, MaxJamoExpansionTable *maxjamoexpansion, UBool jamospecial, @@ -737,7 +771,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping, uint32_t ce; while (v >= VBASE) { - ce = ucmp32_get(mapping, v); + ce = ucmpe32_get(mapping, v); if (ce < UCOL_SPECIAL_FLAG) { uprv_uca_setMaxExpansion(ce, 2, maxexpansion, status); } @@ -746,7 +780,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping, while (t >= TBASE) { - ce = ucmp32_get(mapping, t); + ce = ucmpe32_get(mapping, t); if (ce < UCOL_SPECIAL_FLAG) { uprv_uca_setMaxExpansion(ce, 3, maxexpansion, status); } @@ -780,7 +814,7 @@ void uprv_uca_getMaxExpansionJamo(CompactIntArray *mapping, UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) { - CompactIntArray *mapping = t->mapping; + CompactEIntArray *mapping = t->mapping; ExpansionTable *expansions = t->expansions; CntTable *contractions = t->contractions; MaxExpansionTable *maxexpansion = t->maxExpansions; @@ -794,9 +828,9 @@ 
UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) { int32_t contractionsSize = 0; contractionsSize = uprv_cnttab_constructTable(contractions, beforeContractions, status); - ucmp32_compact(mapping, 1); + ucmpe32_compact(mapping); UMemoryStream *ms = uprv_mstrm_openNew(8192); - int32_t mappingSize = ucmp32_flattenMem(mapping, ms); + int32_t mappingSize = ucmpe32_flattenMem(mapping, ms); const uint8_t *flattened = uprv_mstrm_getBuffer(ms, &mappingSize); /* sets jamo expansions */ @@ -880,7 +914,7 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) { uint32_t *store = (uint32_t*)(dataStart+tableOffset); int32_t i = 0; for(i = 0; i<=0xFF; i++) { - *(store++) = ucmp32_get(mapping,i); + *(store++) = ucmpe32_get(mapping,i); tableOffset+=sizeof(uint32_t); } diff --git a/icu4c/source/i18n/ucol_elm.h b/icu4c/source/i18n/ucol_elm.h index fc0e6581dfa..4052957bf04 100644 --- a/icu4c/source/i18n/ucol_elm.h +++ b/icu4c/source/i18n/ucol_elm.h @@ -69,7 +69,7 @@ typedef struct { } MaxExpansionTable; typedef struct { - CompactIntArray *mapping; + CompactEIntArray *mapping; ExpansionTable *expansions; struct CntTable *contractions; UCATableHeader *image; diff --git a/icu4c/source/i18n/ucol_imp.h b/icu4c/source/i18n/ucol_imp.h index b80776f8ffc..370e7e51c2f 100644 --- a/icu4c/source/i18n/ucol_imp.h +++ b/icu4c/source/i18n/ucol_imp.h @@ -27,7 +27,7 @@ #define UCOL_IMP_H #include "unicode/ucol.h" -#include "ucmp32.h" +#include "ucmpe32.h" #include "unicode/ures.h" #include "unicode/udata.h" @@ -216,7 +216,7 @@ struct UCollationElements if(ch <= 0xFF) { \ (order) = (coll)->latinOneMapping[ch]; \ } else { \ - (order) = ucmp32_get((coll)->mapping, ch); \ + (order) = ucmpe32_get((coll)->mapping, ch); \ } \ if((order) >= UCOL_NOT_FOUND) { \ (order) = getSpecialCE((coll), (order), &(collationSource), (status)); \ @@ -583,7 +583,7 @@ struct UCollator { UBool freeOnClose; UResourceBundle *rb; const UCATableHeader *image; - CompactIntArray 
*mapping; + CompactEIntArray *mapping; const uint32_t *latinOneMapping; const uint32_t *expansion; const UChar *contractionIndex;