From 78a57a7680dcb141c50b140f3bbb715fa6aa905a Mon Sep 17 00:00:00 2001 From: Syn Wee Quek Date: Tue, 20 Feb 2001 00:26:50 +0000 Subject: [PATCH] ICU-861 Implemented backwards iterator for collation X-SVN-Rev: 3679 --- icu4c/source/i18n/coleitr.cpp | 178 ++++++++++------ icu4c/source/i18n/ucol.cpp | 306 +++++++++++++++++++++++++++ icu4c/source/i18n/ucoleitr.cpp | 184 ++++++++++++---- icu4c/source/i18n/ucolimp.h | 81 ++++++- icu4c/source/i18n/unicode/coleitr.h | 50 +++-- icu4c/source/i18n/unicode/ucoleitr.h | 72 +++---- 6 files changed, 704 insertions(+), 167 deletions(-) diff --git a/icu4c/source/i18n/coleitr.cpp b/icu4c/source/i18n/coleitr.cpp index 12ab1e670f3..b4ce9068e0f 100644 --- a/icu4c/source/i18n/coleitr.cpp +++ b/icu4c/source/i18n/coleitr.cpp @@ -14,58 +14,53 @@ * * Modification History: * -* Date Name Description +* Date Name Description * -* 6/23/97 helena Adding comments to make code more readable. -* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java -* 12/10/99 aliu Ported Thai collation support from Java. -* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) +* 6/23/97 helena Adding comments to make code more readable. +* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java +* 12/10/99 aliu Ported Thai collation support from Java. 
+* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h) +* 02/19/01 swquek Removed CollationElementsIterator() since it is +* private constructor and no calls are made to it */ // #include "unicode/sortkey.h" #include "unicode/coleitr.h" +#include "ucolimp.h" +#include "cmemory.h" // #include "unicode/chariter.h" -#include "tables.h" +// #include "tables.h" // #include "unicode/normlzr.h" // #include "unicode/unicode.h" // #include "tcoldata.h" // #include "ucmp32.h" -// Constants ------------------------------------------------------------------ +/* Constants --------------------------------------------------------------- */ +/* synwee : public can't remove */ int32_t const CollationElementIterator::NULLORDER = 0xffffffff; -int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000; +// int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000; -// CollationElementIterator public constructor/destructor --------------------- +/* CollationElementIterator public constructor/destructor ------------------ */ CollationElementIterator::CollationElementIterator( - const CollationElementIterator& other) - : text(0), - ownBuffer(new VectorOfInt(2)), - reorderBuffer(0), - expIndex(other.expIndex) + const CollationElementIterator& other) + : isDataOwned_(TRUE) { *this = other; } CollationElementIterator::~CollationElementIterator() { - delete text; - text = NULL; - bufferAlias = NULL; - orderAlias = NULL; - delete ownBuffer; - delete reorderBuffer; + ucol_closeElements(m_data_); } -// CollationElementIterator public methods ------------------------------------ +/* CollationElementIterator public methods --------------------------------- */ UTextOffset CollationElementIterator::getOffset() const { - // Since the DecompositionIterator is doing the work of iterating through - // the text string, we can just ask it what its offset is. - return (text != NULL) ? 
text->getIndex() : 0; + return ucol_getOffset(m_data_); } /** @@ -75,6 +70,7 @@ UTextOffset CollationElementIterator::getOffset() const */ int32_t CollationElementIterator::next(UErrorCode& status) { + /* if (text == NULL || U_FAILURE(status)) return NULLORDER; @@ -111,9 +107,8 @@ int32_t CollationElementIterator::next(UErrorCode& status) // Ask the collator for this character's ordering. // Used to be RuleBasedCollator.getUnicodeOrder(). // It can't be inlined in tblcoll.h file unfortunately. - /* - synwee : have to modify this part - int32_t value = ucmp32_get(orderAlias->data->mapping, ch); + + int32_t value = ucmp32_get(orderAlias->data->mapping, ch); if (value == RuleBasedCollator::UNMAPPED) { @@ -153,21 +148,22 @@ int32_t CollationElementIterator::next(UErrorCode& status) return strengthOrder(value); */ - return 0; + return ucol_next(m_data_, &status); } UBool CollationElementIterator::operator!=( - const CollationElementIterator& other) const + const CollationElementIterator& other) const { return !(*this == other); } -UBool CollationElementIterator::operator==(const CollationElementIterator& that) - const +UBool CollationElementIterator::operator==( + const CollationElementIterator& that) const { if (this == &that) return TRUE; - + + /* if (*text != *(that.text)) return FALSE; @@ -182,6 +178,9 @@ UBool CollationElementIterator::operator==(const CollationElementIterator& that) return FALSE; return TRUE; + */ + + return m_data_ == that.m_data_; } /** @@ -192,6 +191,7 @@ UBool CollationElementIterator::operator==(const CollationElementIterator& that) */ int32_t CollationElementIterator::previous(UErrorCode& status) { + /* if (text == NULL || U_FAILURE(status)) return NULLORDER; @@ -212,8 +212,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status) // Used to be RuleBasedCollator.getUnicodeOrder(). It can't be inlined in // tblcoll.h file unfortunately. 
- /* - + int32_t value = ucmp32_get(orderAlias->data->mapping, ch); if (value == RuleBasedCollator::UNMAPPED) @@ -252,7 +251,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status) return strengthOrder(value); */ - return 0; + return ucol_previous(m_data_, &status); } /** @@ -260,6 +259,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status) */ void CollationElementIterator::reset() { + /* if (text != NULL) { text->reset(); @@ -268,11 +268,14 @@ void CollationElementIterator::reset() bufferAlias = NULL; expIndex = 0; + */ + ucol_reset(m_data_); } void CollationElementIterator::setOffset(UTextOffset newOffset, UErrorCode& status) { + /* if (U_FAILURE(status)) return; @@ -280,6 +283,8 @@ void CollationElementIterator::setOffset(UTextOffset newOffset, text->setIndex(newOffset); bufferAlias = NULL; + */ + ucol_setOffset(m_data_, newOffset, &status); } /** @@ -290,7 +295,7 @@ void CollationElementIterator::setText(const UnicodeString& source, { if (U_FAILURE(status)) return; - + /* bufferAlias = 0; if (text == NULL) @@ -300,6 +305,17 @@ void CollationElementIterator::setText(const UnicodeString& source, text->setText(source, status); text->setMode(orderAlias->getDecomposition()); } + */ + int32_t length = source.length(); + UChar *string = new UChar[length]; + source.extract(0, length, string); + + m_data_->length_ = length; + + if (m_data_->iteratordata_.isWritable && + m_data_->iteratordata_.string != NULL) + uprv_free(m_data_->iteratordata_.string); + init_collIterate(string, length, &m_data_->iteratordata_, TRUE); } // Sets the source to the new character iterator. 
@@ -309,6 +325,7 @@ void CollationElementIterator::setText(CharacterIterator& source, if (U_FAILURE(status)) return; + /* bufferAlias = 0; if (text == NULL) @@ -318,38 +335,52 @@ void CollationElementIterator::setText(CharacterIterator& source, text->setMode(orderAlias->getDecomposition()); text->setText(source, status); } + */ + int32_t length = source.getLength(); + UChar *buffer = new UChar[length]; + /* + Using this constructor will prevent buffer from being removed when + string gets removed + */ + UnicodeString string(buffer, length, length); + source.getText(string); + string.extract(0, length, buffer); + m_data_->length_ = length; + + if (m_data_->iteratordata_.isWritable && + m_data_->iteratordata_.string != NULL) + uprv_free(m_data_->iteratordata_.string); + init_collIterate(buffer, length, &m_data_->iteratordata_, TRUE); } int32_t CollationElementIterator::strengthOrder(int32_t order) const { - Collator::ECollationStrength s = orderAlias->getStrength(); + UCollationStrength s = ucol_getStrength(m_data_->collator_); // Mask off the unwanted differences. 
- if (s == Collator::PRIMARY) + if (s == UCOL_PRIMARY) order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY; else - if (s == Collator::SECONDARY) + if (s == UCOL_SECONDARY) order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY; return order; } -// CollationElementIterator private constructors/destructors ------------------ +/* CollationElementIterator private constructors/destructors --------------- */ -// This private method will never be called, but it makes the linker happy -CollationElementIterator::CollationElementIterator() : text(0), bufferAlias(0), - ownBuffer(new VectorOfInt(2)), - reorderBuffer(0), expIndex(0), - orderAlias(0) +/* +This private method will never be called, but it makes the linker happy +CollationElementIterator::CollationElementIterator() : m_data_(0) { } +*/ CollationElementIterator::CollationElementIterator( - const RuleBasedCollator* order) - : text(0), bufferAlias(0), - ownBuffer(new VectorOfInt(2)), - reorderBuffer(0), expIndex(0), - orderAlias(order) + const RuleBasedCollator* order) + : isDataOwned_(TRUE) { + UErrorCode status = U_ZERO_ERROR; + m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status); } /** @@ -359,17 +390,12 @@ CollationElementIterator::CollationElementIterator( CollationElementIterator::CollationElementIterator( const UnicodeString& sourceText, const RuleBasedCollator* order, - UErrorCode& status) - : text(NULL), - bufferAlias(NULL), - ownBuffer(new VectorOfInt(2)), - reorderBuffer(0), - expIndex(0), - orderAlias(order) + UErrorCode& status) { if (U_FAILURE(status)) return; - + + /* if ( sourceText.length() != 0 ) { // A CollationElementIterator is really a two-layered beast. 
@@ -386,6 +412,8 @@ CollationElementIterator::CollationElementIterator( if (text == NULL) status = U_MEMORY_ALLOCATION_ERROR; } + */ + m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status); } /** @@ -393,20 +421,16 @@ CollationElementIterator::CollationElementIterator( * the source text using the specified collator */ CollationElementIterator::CollationElementIterator( - const CharacterIterator& sourceText, - const RuleBasedCollator* order, - UErrorCode& status) - : text(NULL), - bufferAlias(NULL), - ownBuffer(new VectorOfInt(2)), - reorderBuffer(0), - expIndex(0), - orderAlias(order) + const CharacterIterator& sourceText, + const RuleBasedCollator* order, + UErrorCode& status) + : isDataOwned_(TRUE) { if (U_FAILURE(status)) return; // **** should I just drop this test? **** + /* if ( sourceText.endIndex() != 0 ) { // A CollationElementIterator is really a two-layered beast. @@ -423,15 +447,29 @@ CollationElementIterator::CollationElementIterator( if (text == NULL) status = U_MEMORY_ALLOCATION_ERROR; } + */ + int32_t length = sourceText.getLength(); + UChar *buffer = new UChar[length]; + /* + Using this constructor will prevent buffer from being removed when + string gets removed + */ + UnicodeString string(buffer, length, length); + // synwee sourceText.getText(string); + string.extract(0, length, buffer); + + m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status); + // synwee ucol_setText(m_data_, buffer, length, TRUE, &status); } -// CollationElementIterator private methods ----------------------------------- +/* CollationElementIterator private methods -------------------------------- */ const CollationElementIterator& CollationElementIterator::operator=( - const CollationElementIterator& other) + const CollationElementIterator& other) { if (this != &other) { + /* expIndex = other.expIndex; delete text; text = (Normalizer*)other.text->clone(); @@ -455,6 +493,8 @@ const CollationElementIterator& CollationElementIterator::operator=( bufferAlias 
= other.bufferAlias; orderAlias = other.orderAlias; + */ + this->m_data_ = other.m_data_; } return *this; diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index ae02ad7797a..f598e6de8fd 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -3,6 +3,9 @@ * Copyright (C) 1996-1999, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* +* Modification history +* Date Name Comments +* 02/16/2001 synwee Added internal method getPrevSpecialCE */ #include "ucolimp.h" @@ -1089,6 +1092,140 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta return order; /* return the CE */ } +/* +* This function tries to get a CE from UCA, which should be always around +* UChar is passed in in order to speed things up here is also the generation +* of implicit CEs +*/ +uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource, + uint32_t length, UErrorCode *status) +{ + uint32_t order; + if (ch < 0xFF) + order = UCA->latinOneMapping[ch]; + else + order = ucmp32_get(UCA->mapping, ch); + + if (order >= UCOL_NOT_FOUND) + order = getSpecialPrevCE(UCA, order, collationSource, length, status); + + if (order == UCOL_NOT_FOUND) + { + /* + This is where we have to resort to algorithmical generation. + We have to check if ch is possibly a first surrogate - then we need to + take the next code unit and make a bigger CE + */ + UChar nextChar; + const int + SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7, + LCount = 19, VCount = 21, TCount = 28, + NCount = VCount * TCount, // 588 + SCount = LCount * NCount, // 11172 + LLimit = LBase + LCount, // 1113 + VLimit = VBase + VCount, // 1176 + TLimit = TBase + TCount, // 11C3 + SLimit = SBase + SCount; // D7A4 + + /* + once we have failed to find a match for codepoint cp, and are in the + implicit code. 
+ */ + + unsigned int L = ch - SBase; + if (L < SCount) + { /* since it is unsigned, catchs zero case too */ + + /* + divide into pieces. + we do it in this order since some compilers can do % and / in one + operation + */ + int T = L % TCount; + L /= TCount; + int V = L % VCount; + L /= VCount; + + /* offset them */ + L += LBase; + V += VBase; + T += TBase; + + /* + return the first CE, but first put the rest into the expansion buffer + */ + if (!collationSource->JamoSpecial) + { + *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V); + if (T != TBase) + *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T); + /* return first one */ + return ucmp32_get(UCA->mapping, L); + } else { + /* + Jamo is Special + do recursive processing of L, V, and T with fetchCE (but T only if not + equal to TBase!!) + Since fetchCE returns a CE, and (potentially) stuffs items into the ce + buffer, + this is how it is done. + */ + /* + int firstCE = fetchCE(L, ...); + // set pointer, leave gap! + int* lastExpansion = expansionBufferEnd++; + *lastExpansion = fetchCE(V,...); + if (T != TBase) { + lastExpansion = expansionBufferEnd++; // set pointer, leave gap! + *lastExpansion = fetchCE(T,...); + } + */ + } + } + + if (UTF_IS_SECOND_SURROGATE(ch)) + { + if ((collationSource->len - collationSource->pos != length) && + (UTF_IS_FIRST_SURROGATE(nextChar = *collationSource->pos))) + { + uint32_t cp = ((ch << 10UL) + nextChar - ((0xd800 << 10UL) + 0xdc00)); + if (collationSource->pos != collationSource->writableBuffer) + collationSource->pos --; + else + { + collationSource->pos = collationSource->string + + (length - (collationSource->len - collationSource->writableBuffer)); + collationSource->len = collationSource->string + length; + collationSource->isThai = TRUE; + } + if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) + return 0; /* illegal code value, use completely ignoreable! 
*/ + + /* + This is a code point minus 0x10000, that's what algorithm requires + */ + order = 0xE0010303 | (cp & 0xFFE00) << 8; + *(collationSource->CEpos ++) = 0x80200080 | (cp & 0x001FF) << 22; + collationSource->toReturn ++; + } + else + return 0; /* completely ignorable */ + } + else + { + /* otherwise */ + if (UTF_IS_FIRST_SURROGATE(ch) || (ch & 0xFFFE) == 0xFFFE) + return 0; /* completely ignorable */ + + /* Make up an artifical CE from code point as per UCA */ + order = 0xD08003C3 | (ch & 0xF000) << 12 | (ch & 0x0FE0) << 11; + *(collationSource->CEpos ++) = 0x04000080 | (ch & 0x001F) << 27; + collationSource->toReturn ++; + } + } + return order; /* return the CE */ +} + /* This function handles the special CEs like contractions, expansions, surrogates, Thai */ /* It is called by both getNextCE and getNextUCA */ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status) { @@ -1201,6 +1338,175 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U return CE; } +/** +* This function handles the special CEs like contractions, expansions, +* surrogates, Thai. +* It is called by both getPrevCE and getPrevUCA +* synwee +*/ +uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE, + collIterate *source, uint32_t length, + UErrorCode *status) +{ + uint32_t count = 0; + const uint32_t *CEOffset = NULL; + const UChar *UCharOffset = NULL; + UChar schar, + tchar; + const UChar *strend = NULL; + const UChar *constart = NULL; + uint32_t size; + while (TRUE) + { + switch (getCETag(CE)) + { + case NOT_FOUND_TAG: + return CE; + case SURROGATE_TAG: + /* pending surrogate discussion with Markus and Mark */ + return UCOL_NOT_FOUND; + case THAI_TAG: + if (source->isThai == TRUE) + { /* if we encountered Thai prevowel & the string is not yet touched */ + source->isThai = FALSE; + /* + sigh... 
to cater for getNextCE, we'll have to modify and store the + whole string instead of a substring as in getSpecialCE + */ + UCharOffset = source->pos; + strend = source->len; + size = strend - source->string; + if (size > UCOL_WRITABLE_BUFFER_SIZE) + { + /* + someone else has already allocated something + */ + if (source->writableBuffer != source->stackWritableBuffer) + uprv_free(source->writableBuffer); + source->writableBuffer = + (UChar *)uprv_malloc(size * sizeof(UChar)); + source->isThai = FALSE; + } + UChar *sourceCopy = source->string; + UChar *targetCopy = source->writableBuffer; + while (sourceCopy < strend) + { + if (UCOL_ISTHAIPREVOWEL(*sourceCopy) && + /* This is the combination that needs to be swapped */ + UCOL_ISTHAIBASECONSONANT(*(sourceCopy + 1))) + { + *(targetCopy) = *(sourceCopy + count + 1); + *(targetCopy+1) = *(sourceCopy + count); + targetCopy+=2; + sourceCopy+=2; + } + else + *(targetCopy++) = *(sourceCopy++); + } + source->pos = source->writableBuffer + + (UCharOffset - source->string); + source->len = targetCopy; + source->CEpos = source->toReturn = source->CEs; + CE = UCOL_IGNORABLE; + } + else + { + /* + we have already played with the string, so treat Thai as a length one + expansion + */ + /* find the offset to expansion table */ + CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); + CE = *CEOffset ++; + } + break; + case CONTRACTION_TAG: + /* This should handle contractions */ + while (TRUE) + { + /* + First we position ourselves at the begining of contraction sequence + */ + constart = UCharOffset = (UChar *)coll->image + getContractOffset(CE); + strend = source->len; + + if (strend - source->pos == length) + { /* this is the start of string */ + CE = *(coll->contractionCEs + + (UCharOffset - coll->contractionIndex)); + break; + } + + /* + Progressing to backwards block + */ + UCharOffset += *UCharOffset; + + schar = *source->pos; + while (schar > (tchar = *UCharOffset)) + UCharOffset ++; + + if (schar != tchar) + { + /* + 
we didn't find the correct codepoint. We can use either the first or + the last CE + */ + if (tchar != 0xFFFF) + UCharOffset = constart; + } + else + { + /* Move up one character */ + if (source->pos != source->writableBuffer) + source->pos --; + else + { + source->pos = source->string + + (length - (source->len - source->writableBuffer)); + source->len = source->string + length; + source->isThai = TRUE; + } + } + CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex)); + if (!isContraction(CE)) + break; + } + break; + case EXPANSION_TAG: + /* + This should handle expansion. + NOTE: we can encounter both continuations and expansions in an expansion! + I have to decide where continuations are going to be dealt with + */ + /* find the offset to expansion table */ + CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); + size = getExpansionCount(CE); + if (size != 0) + /* + if there are less than 16 elements in expansion, we don't terminate + */ + for (count = 0; count < size; count++) + *(source->CEpos ++) = *CEOffset++; + else + /* else, we do */ + while (*CEOffset != 0) + *(source->CEpos ++) = *CEOffset ++; + source->toReturn = source->CEpos - 1; + return *(source->toReturn --); + case CHARSET_TAG: + /* probably after 1.8 */ + return UCOL_NOT_FOUND; + default: + *status = U_INTERNAL_PROGRAM_ERROR; + CE=0; + break; + } + if (CE <= UCOL_NOT_FOUND) break; + } + return CE; +} + /* This should really be a macro */ /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */ /* anyway */ diff --git a/icu4c/source/i18n/ucoleitr.cpp b/icu4c/source/i18n/ucoleitr.cpp index efddee32f76..1e86d9adaf7 100644 --- a/icu4c/source/i18n/ucoleitr.cpp +++ b/icu4c/source/i18n/ucoleitr.cpp @@ -1,18 +1,36 @@ /* -******************************************************************************* +****************************************************************************** * Copyright (C) 2001, International 
Business Machines * Corporation and others. All Rights Reserved. -******************************************************************************* -*/ +****************************************************************************** +* +* File ucoleitr.cpp +* +* Modification History: +* +* Date Name Description +* 02/15/2001 synwee Modified all methods to process its own function +* instead of calling the equivalent c++ api (coleitr.h) +******************************************************************************/ #include "unicode/ucoleitr.h" #include "unicode/ustring.h" -#include "unicode/coleitr.h" +#include "unicode/sortkey.h" +#include "ucolimp.h" +#include "cmemory.h" +#define BUFFER_LENGTH 100 +typedef struct collIterate collIterator; + +/* public methods ---------------------------------------------------- */ + +/** +* Since this is going to be deprecated, I'll leave it as it is +*/ U_CAPI int32_t -ucol_keyHashCode( const uint8_t* key, - int32_t length) +ucol_keyHashCode(const uint8_t *key, + int32_t length) { CollationKey newKey(key, length); return newKey.hashCode(); @@ -20,88 +38,160 @@ ucol_keyHashCode( const uint8_t* key, UCollationElements* -ucol_openElements( const UCollator *coll, - const UChar *text, - int32_t textLength, - UErrorCode *status) +ucol_openElements(const UCollator *coll, + const UChar *text, + int32_t textLength, + UErrorCode *status) { - int32_t len = (textLength == -1 ? 
u_strlen(text) : textLength); - const UnicodeString src((UChar*)text, len, len); + UCollationElements *result; - CollationElementIterator *iter = 0; - iter = ((RuleBasedCollator*)coll)->createCollationElementIterator(src); - if(iter == 0) { - *status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } + if (U_FAILURE(*status)) + return NULL; - return (UCollationElements*) iter; + result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements)); + + result->collator_ = coll; + + /* gets the correct length of the null-terminated string */ + if (textLength == -1) + textLength = u_strlen(text); + + result->length_ = textLength; + init_collIterate(text, textLength, &result->iteratordata_, FALSE); + + return result; } U_CAPI void ucol_closeElements(UCollationElements *elems) { - delete (CollationElementIterator*)elems; + collIterate *ci = &elems->iteratordata_; + if (ci->writableBuffer != ci->stackWritableBuffer) + uprv_free(ci->writableBuffer); + if (elems->iteratordata_.isWritable && elems->iteratordata_.string != NULL) + uprv_free(elems->iteratordata_.string); + uprv_free(elems); } U_CAPI void ucol_reset(UCollationElements *elems) { - ((CollationElementIterator*)elems)->reset(); + collIterate *ci = &(elems->iteratordata_); + ci->pos = ci->string; + ci->len = ci->string + elems->length_; + ci->CEpos = ci->toReturn = ci->CEs; + /* + problem here, that means we'll have to keep calculating the new thai set + whenever we reset. maybe getSpecialCE should just do up the whole string + instead of only a substring of it. 
+ */ + ci->isThai = TRUE; + if (ci->stackWritableBuffer != ci->writableBuffer) + { + uprv_free(ci->writableBuffer); + ci->writableBuffer = ci->stackWritableBuffer; + } } U_CAPI int32_t -ucol_next( UCollationElements *elems, - UErrorCode *status) +ucol_next(UCollationElements *elems, + UErrorCode *status) { - if(U_FAILURE(*status)) return UCOL_NULLORDER; + if (U_FAILURE(*status)) + return UCOL_NULLORDER; - return ((CollationElementIterator*)elems)->next(*status); + int32_t result; + UCOL_GETNEXTCE(result, elems->collator_, elems->iteratordata_, status); + return result; } U_CAPI int32_t -ucol_previous( UCollationElements *elems, - UErrorCode *status) +ucol_previous(UCollationElements *elems, + UErrorCode *status) { - if(U_FAILURE(*status)) return UCOL_NULLORDER; + if(U_FAILURE(*status)) + return UCOL_NULLORDER; - return ((CollationElementIterator*)elems)->previous(*status); + int32_t result; + UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_, + elems->length_, status); + return result; } U_CAPI int32_t -ucol_getMaxExpansion( const UCollationElements *elems, - int32_t order) +ucol_getMaxExpansion(const UCollationElements *elems, + int32_t order) { - return ((CollationElementIterator*)elems)->getMaxExpansion(order); + /* + synwee : requested this implementation from vladimir, need discussion. so + hang on. + */ + /* return ((CollationElementIterator*)elems)->getMaxExpansion(order); */ + return -1; } U_CAPI void -ucol_setText(UCollationElements *elems, - const UChar *text, - int32_t textLength, - UErrorCode *status) +ucol_setText( UCollationElements *elems, + const UChar *text, + int32_t textLength, + UErrorCode *status) { - if(U_FAILURE(*status)) return; + if (U_FAILURE(*status)) + return; + + /* gets the correct length of the null-terminated string */ + if (textLength == -1) + textLength = u_strlen(text); - int32_t len = (textLength == -1 ? 
u_strlen(text) : textLength); - const UnicodeString src((UChar*)text, len, len); + elems->length_ = textLength; - ((CollationElementIterator*)elems)->setText(src, *status); + if (elems->iteratordata_.isWritable && elems->iteratordata_.string != NULL) + uprv_free(elems->iteratordata_.string); + init_collIterate(text, textLength, &elems->iteratordata_, FALSE); } U_CAPI UTextOffset ucol_getOffset(const UCollationElements *elems) { - return ((CollationElementIterator*)elems)->getOffset(); + /* return ((CollationElementIterator*)elems)->getOffset(); */ + const collIterate *ci = &(elems->iteratordata_); + if (ci->isThai == TRUE) + return ci->pos - ci->string; + + /* + if it is a thai string with reversed elements, since getNextCE does not + store only a substring in writeablebuffer, we'll have to do some calculation + to get the offset out. + need discussion to see if it is a better idea to store the whole string + instead. + */ + return elems->length_ - (ci->len - ci->pos); } U_CAPI void -ucol_setOffset( UCollationElements *elems, - UTextOffset offset, - UErrorCode *status) +ucol_setOffset(UCollationElements *elems, + UTextOffset offset, + UErrorCode *status) { - if(U_FAILURE(*status)) return; - - ((CollationElementIterator*)elems)->setOffset(offset, *status); + if (U_FAILURE(*status)) + return; + + collIterate *ci = &(elems->iteratordata_); + ci->pos = ci->string + offset; + ci->CEpos = ci->toReturn = ci->CEs; + /* + problem here, that means we'll have to keep calculating the new thai set + whenever we reset. maybe getSpecialCE should just do up the whole string + instead of only a substring of it. 
+ */ + ci->isThai = TRUE; + if (ci->stackWritableBuffer != ci->writableBuffer) + { + uprv_free(ci->writableBuffer); + ci->writableBuffer = ci->stackWritableBuffer; + } } + + + diff --git a/icu4c/source/i18n/ucolimp.h b/icu4c/source/i18n/ucolimp.h index da0fd1cbb5b..70a22cac371 100644 --- a/icu4c/source/i18n/ucolimp.h +++ b/icu4c/source/i18n/ucolimp.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1998-2000, International Business Machines +* Copyright (C) 1998-2001, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -14,6 +14,11 @@ * * created on: 2000dec11 * created by: Vladimir Weinstein +* +* Modification history +* Date Name Comments +* 02/16/2001 synwee Added UCOL_GETPREVCE for the use in ucoleitr +* */ #ifndef UCOL_IMP_H @@ -62,6 +67,28 @@ struct collIterate { UChar *writableBuffer; }; +struct UCollationElements +{ + /** + * Locale specific collator for generating the collation elements + */ + const UCollator *collator_; + /** + * Normalization mode, not exactly the same as the data in collator_. + * If collation strength requested is UCOL_IDENTICAL, this modes will be + * UNORM_NONE other it follows collator_. + */ + UNormalizationMode normalization_; + /** + * Struct wrapper for source data + */ + collIterate iteratordata_; + /** + * Source text length + */ + int32_t length_; +}; + struct incrementalContext { UCharForwardIterator *source; void *sourceContext; @@ -196,9 +223,61 @@ struct incrementalContext { } \ } +/** +* Macro that gets a simple CE. +* So what it does is that it will first check the expansion buffer. If the +* expansion buffer is not empty, ie the end pointer to the expansion buffer +* is different from the start pointer, we return the collation element at the +* return pointer and decrement it. +* For more complicated CEs it resorts to getComplicatedCE. 
+*/ +#define UCOL_GETPREVCE(order, coll, data, length, status) { \ + if (data.CEpos > data.CEs) { \ + (order) = *(data.toReturn --); \ + if (data.CEs == data.toReturn) { \ + data.CEpos = data.toReturn = data.CEs; \ + } \ + } \ + else { \ + if (data.len - data.pos == length) { \ + (order) = UCOL_NO_MORE_CES; \ + } \ + else { \ + UChar ch = *(data.pos); \ + if (data.pos != data.writableBuffer) { \ + data.pos --; \ + } \ + else { \ + data.pos = data.string + \ + (length - (data.len - data.writableBuffer)); \ + data.len = data.string + length; \ + data.isThai = TRUE; \ + } \ + if (ch <= 0xFF) { \ + (order) = (coll)->latinOneMapping[ch]; \ + } \ + else { \ + (order) = ucmp32_get((coll)->mapping, ch); \ + } \ + if ((order) >= UCOL_NOT_FOUND) { \ + (order) = getSpecialPrevCE((coll), (order), &(data), (length), \ + (status)); \ + if ((order) == UCOL_NOT_FOUND) { \ + (order) = ucol_getPrevUCA(ch, &(data), (length), (status)); \ + } \ + } \ + } \ + } \ +} + uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status); +uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE, + collIterate *source, uint32_t length, + UErrorCode *status); U_CFUNC uint32_t ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status); uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *status); +uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource, + uint32_t length, UErrorCode *status); void incctx_cleanUpContext(incrementalContext *ctx); UChar incctx_appendChar(incrementalContext *ctx, UChar c); diff --git a/icu4c/source/i18n/unicode/coleitr.h b/icu4c/source/i18n/unicode/coleitr.h index 9c3dc9422b4..394acb45f23 100644 --- a/icu4c/source/i18n/unicode/coleitr.h +++ b/icu4c/source/i18n/unicode/coleitr.h @@ -1,8 +1,8 @@ /* -***************************************************************************************** +****************************************************************************** * 
Copyright (C) 1997-1999, International Business Machines * Corporation and others. All Rights Reserved. -***************************************************************************************** +****************************************************************************** */ /** @@ -14,12 +14,14 @@ * * Modification History: * -* Date Name Description +* Date Name Description * -* 8/18/97 helena Added internal API documentation. -* 08/03/98 erm Synched with 1.2 version CollationElementIterator.java -* 12/10/99 aliu Ported Thai collation support from Java. -* 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h) +* 8/18/97 helena Added internal API documentation. +* 08/03/98 erm Synched with 1.2 version CollationElementIterator.java +* 12/10/99 aliu Ported Thai collation support from Java. +* 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h) +* 02/19/01 swquek Removed CollationElementsIterator() since it is +* private constructor and no calls are made to it */ #ifndef COLEITR_H @@ -27,16 +29,22 @@ // #include "unicode/unistr.h" #include "unicode/tblcoll.h" +#include "unicode/ucoleitr.h" + // #include "tables.h" // #include "unicode/chariter.h" // have to do this because the include path in the main project does not have // tables.h. -class VectorOfInt; +// class VectorOfInt; // class Normalizer; // class VectorOfPToContractElement; // class RuleBasedCollator; +// typedef void * UCollationElements; +// struct UCollationElements; +typedef struct UCollationElements UCollationElements; + /** * The CollationElementIterator class is used as an iterator to walk through * each character of an international string. Use the iterator to return the @@ -225,6 +233,8 @@ protected: // CollationElementIterator protected constructors -------------------------- + friend RuleBasedCollator; + /** * CollationElementIterator constructor. This takes the source string and the * collation object. 
The cursor will walk thru the source string based on the @@ -265,15 +275,17 @@ protected: private: - friend class RuleBasedCollator; + // friend class RuleBasedCollator; // CollationElementIterator private data members ---------------------------- - static const int32_t UNMAPPEDCHARVALUE; + // static const int32_t UNMAPPEDCHARVALUE; + /* Normalizer* text; // owning VectorOfInt* bufferAlias; // not owned + */ /** * ownBuffer wants to be a subobject, not a pointer, but that means exposing @@ -282,7 +294,7 @@ private: * is used to handle Thai collation; bufferAlias points to ownBuffer in some * situations. [j159 - aliu] */ - VectorOfInt* ownBuffer; + // VectorOfInt* ownBuffer; /** * reorderBuffer is created on demand, so it doesn't want to be a subobject -- @@ -290,18 +302,30 @@ private: * conditions. Once created, it is reused for the life of this object. Because * of the implementation of VectorOfInt, it grows monotonically. [j159 - aliu] */ + /* VectorOfInt* reorderBuffer; int32_t expIndex; UnicodeString key; const RuleBasedCollator* orderAlias; + */ + + /** + * Data wrapper for collation elements + */ + UCollationElements *m_data_; + + /** + * Indicates if m_data_ belongs to this object. + */ + UBool isDataOwned_; // CollationElementIterator private constructor/destructor ------------------ /** * Default constructor. */ - CollationElementIterator(); + /* CollationElementIterator(); */ /** * Constructor. 
@@ -377,7 +401,7 @@ inline int32_t CollationElementIterator::tertiaryOrder(int32_t order) inline int32_t CollationElementIterator::getMaxExpansion(int32_t order) const { - return orderAlias->getMaxExpansion(order); + return ucol_getMaxExpansion(m_data_, order); } inline UBool CollationElementIterator::isIgnorable(int32_t order) diff --git a/icu4c/source/i18n/unicode/ucoleitr.h b/icu4c/source/i18n/unicode/ucoleitr.h index 4b26ab04f27..13ea9623508 100644 --- a/icu4c/source/i18n/unicode/ucoleitr.h +++ b/icu4c/source/i18n/unicode/ucoleitr.h @@ -3,22 +3,32 @@ * Copyright (C) 2001, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************* -*/ +* +* File ucoleitr.h +* +* Modification History: +* +* Date Name Description +* 02/15/2001 synwee Modified all methods to process its own function +* instead of calling the equivalent c++ api (coleitr.h) +*******************************************************************************/ #ifndef UCOLEITR_H #define UCOLEITR_H -/** This indicates the last element in a UCollationElements has been consumed. - * +/** + * This indicates the last element in a UCollationElements has been consumed. */ #define UCOL_NULLORDER 0xFFFFFFFF #include "unicode/ucol.h" -/** The UCollationElements struct. - * For usage in C programs. +/** + * The UCollationElements struct. + * For usage in C programs. */ -typedef void * UCollationElements; +// typedef void * UCollationElements; +typedef struct UCollationElements UCollationElements; /** * The UCollationElements is used as an iterator to walk through @@ -66,7 +76,7 @@ typedef void * UCollationElements; * a collation order is its primary order; the next 8 bits is the secondary * order and the last 8 bits is the tertiary order. * - * @see Collator + * @see UCollator */ /** @@ -76,13 +86,13 @@ typedef void * UCollationElements; * @param text The text to iterate over.
* @param textLength The number of characters in text, or -1 if null-terminated * @param status A pointer to an UErrorCode to receive any errors. - * @stable + * @return a struct containing collation element information */ U_CAPI UCollationElements* -ucol_openElements( const UCollator *coll, - const UChar *text, - int32_t textLength, - UErrorCode *status); +ucol_openElements(const UCollator *coll, + const UChar *text, + int32_t textLength, + UErrorCode *status); /** * get a hash code for a key... Not very useful! @@ -95,7 +105,6 @@ ucol_keyHashCode(const uint8_t* key, int32_t length); * Close a UCollationElements. * Once closed, a UCollationElements may no longer be used. * @param elems The UCollationElements to close. - * @stable */ U_CAPI void ucol_closeElements(UCollationElements *elems); @@ -106,7 +115,6 @@ ucol_closeElements(UCollationElements *elems); * @param elems The UCollationElements to reset. * @see ucol_next * @see ucol_previous - * @stable */ U_CAPI void ucol_reset(UCollationElements *elems); @@ -116,13 +124,11 @@ ucol_reset(UCollationElements *elems); * A single character may contain more than one collation element. * @param elems The UCollationElements containing the text. * @param status A pointer to an UErrorCode to receive any errors. - * @return The next collation elements ordering, or \Ref{UCOL_NULLORDER} if the - * end of the text is reached. - * @stable + * @return The next collation elements ordering, or \Ref{UCOL_NULLORDER} if + * the end of the text is reached. */ U_CAPI int32_t -ucol_next( UCollationElements *elems, - UErrorCode *status); +ucol_next(UCollationElements *elems, UErrorCode *status); /** * Get the ordering priority of the previous collation element in the text. @@ -131,11 +137,9 @@ ucol_next( UCollationElements *elems, * @param status A pointer to an UErrorCode to receive any errors. * @return The previous collation elements ordering, or \Ref{UCOL_NULLORDER} * if the end of the text is reached. 
- * @stable */ U_CAPI int32_t -ucol_previous( UCollationElements *elems, - UErrorCode *status); +ucol_previous(UCollationElements *elems, UErrorCode *status); /** * Get the maximum length of any expansion sequences that end with the @@ -144,28 +148,24 @@ ucol_previous( UCollationElements *elems, * @param elems The UCollationElements containing the text. * @param order A collation order returned by previous or next. * @return The maximum length of any expansion sequences ending with the - * specified order. - * @stable + * specified order. */ U_CAPI int32_t -ucol_getMaxExpansion( const UCollationElements *elems, - int32_t order); +ucol_getMaxExpansion(const UCollationElements *elems, int32_t order); /** * Set the text containing the collation elements. - * This * @param elems The UCollationElements to set. * @param text The source text containing the collation elements. * @param textLength The length of text, or -1 if null-terminated. * @param status A pointer to an UErrorCode to receive any errors. * @see ucol_getText - * @stable */ U_CAPI void -ucol_setText( UCollationElements *elems, - const UChar *text, - int32_t textLength, - UErrorCode *status); +ucol_setText( UCollationElements *elems, + const UChar *text, + int32_t textLength, + UErrorCode *status); /** * Get the offset of the current source character. @@ -174,7 +174,6 @@ ucol_setText( UCollationElements *elems, * @param elems The UCollationElements to query. * @return The offset of the current source character. * @see ucol_setOffset - * @stable */ U_CAPI UTextOffset ucol_getOffset(const UCollationElements *elems); @@ -186,11 +185,10 @@ ucol_getOffset(const UCollationElements *elems); * @param offset The desired character offset. * @param status A pointer to an UErrorCode to receive any errors. 
* @see ucol_getOffset - * @stable */ U_CAPI void -ucol_setOffset( UCollationElements *elems, - UTextOffset offset, - UErrorCode *status); +ucol_setOffset(UCollationElements *elems, + UTextOffset offset, + UErrorCode *status); #endif