Implemented backwards iterator for collation

X-SVN-Rev: 3679
This commit is contained in:
Syn Wee Quek 2001-02-20 00:26:50 +00:00
parent 2352c3b293
commit 78a57a7680
6 changed files with 704 additions and 167 deletions

View file

@ -14,58 +14,53 @@
*
* Modification History:
*
* Date Name Description
* Date Name Description
*
* 6/23/97 helena Adding comments to make code more readable.
* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
* 12/10/99 aliu Ported Thai collation support from Java.
* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
* 6/23/97 helena Adding comments to make code more readable.
* 08/03/98 erm Synched with 1.2 version of CollationElementIterator.java
* 12/10/99 aliu Ported Thai collation support from Java.
* 01/25/01 swquek Modified to a C++ wrapper calling C APIs (ucoliter.h)
* 02/19/01 swquek Removed CollationElementIterator() since it is a
* private constructor and no calls are made to it
*/
// #include "unicode/sortkey.h"
#include "unicode/coleitr.h"
#include "ucolimp.h"
#include "cmemory.h"
// #include "unicode/chariter.h"
#include "tables.h"
// #include "tables.h"
// #include "unicode/normlzr.h"
// #include "unicode/unicode.h"
// #include "tcoldata.h"
// #include "ucmp32.h"
// Constants ------------------------------------------------------------------
/* Constants --------------------------------------------------------------- */
/* synwee : public can't remove */
int32_t const CollationElementIterator::NULLORDER = 0xffffffff;
int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000;
// int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000;
// CollationElementIterator public constructor/destructor ---------------------
/* CollationElementIterator public constructor/destructor ------------------ */
/**
* Copy constructor. Delegates the actual copy to operator=.
* @param other the iterator to copy from
*/
CollationElementIterator::CollationElementIterator(
                                         const CollationElementIterator& other)
  : m_data_(NULL), isDataOwned_(TRUE)
{
  /*
  The pre-wrapper members (text, ownBuffer, reorderBuffer, expIndex) no longer
  exist, so only the wrapper state is initialised here.
  NOTE(review): operator= copies the m_data_ pointer only, so after this call
  both objects refer to the same UCollationElements and both destructors will
  close it - ownership needs to be settled.
  */
  *this = other;
}
/**
* Destructor. Releases the underlying C collation element iterator.
*/
CollationElementIterator::~CollationElementIterator()
{
  /*
  ucol_openElements returns NULL when it is handed a failing status and
  ucol_closeElements dereferences its argument, so guard against NULL.
  */
  if (m_data_ != NULL)
  {
    ucol_closeElements(m_data_);
    m_data_ = NULL;
  }
}
// CollationElementIterator public methods ------------------------------------
/* CollationElementIterator public methods --------------------------------- */
/**
* Gets the offset of the iterator within the source text.
* @return the offset reported by the underlying C iterator, or 0 when no
*         iterator data is available
*/
UTextOffset CollationElementIterator::getOffset() const
{
  /* keeps the null guard that the pre-wrapper implementation had */
  return (m_data_ != NULL) ? ucol_getOffset(m_data_) : 0;
}
/**
@ -75,6 +70,7 @@ UTextOffset CollationElementIterator::getOffset() const
*/
int32_t CollationElementIterator::next(UErrorCode& status)
{
/*
if (text == NULL || U_FAILURE(status))
return NULLORDER;
@ -111,9 +107,8 @@ int32_t CollationElementIterator::next(UErrorCode& status)
// Ask the collator for this character's ordering.
// Used to be RuleBasedCollator.getUnicodeOrder().
// It can't be inlined in tblcoll.h file unfortunately.
/*
synwee : have to modify this part
int32_t value = ucmp32_get(orderAlias->data->mapping, ch);
int32_t value = ucmp32_get(orderAlias->data->mapping, ch);
if (value == RuleBasedCollator::UNMAPPED)
{
@ -153,21 +148,22 @@ int32_t CollationElementIterator::next(UErrorCode& status)
return strengthOrder(value);
*/
return 0;
return ucol_next(m_data_, &status);
}
/**
* Inequality operator, defined as the negation of operator==.
* @param other the iterator to compare with
* @return TRUE if the two iterators are not equal, FALSE otherwise
*/
UBool CollationElementIterator::operator!=(
                                  const CollationElementIterator& other) const
{
  return !(*this == other);
}
UBool CollationElementIterator::operator==(const CollationElementIterator& that)
const
UBool CollationElementIterator::operator==(
const CollationElementIterator& that) const
{
if (this == &that)
return TRUE;
/*
if (*text != *(that.text))
return FALSE;
@ -182,6 +178,9 @@ UBool CollationElementIterator::operator==(const CollationElementIterator& that)
return FALSE;
return TRUE;
*/
return m_data_ == that.m_data_;
}
/**
@ -192,6 +191,7 @@ UBool CollationElementIterator::operator==(const CollationElementIterator& that)
*/
int32_t CollationElementIterator::previous(UErrorCode& status)
{
/*
if (text == NULL || U_FAILURE(status))
return NULLORDER;
@ -212,8 +212,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status)
// Used to be RuleBasedCollator.getUnicodeOrder(). It can't be inlined in
// tblcoll.h file unfortunately.
/*
int32_t value = ucmp32_get(orderAlias->data->mapping, ch);
if (value == RuleBasedCollator::UNMAPPED)
@ -252,7 +251,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status)
return strengthOrder(value);
*/
return 0;
return ucol_previous(m_data_, &status);
}
/**
@ -260,6 +259,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status)
*/
void CollationElementIterator::reset()
{
/*
if (text != NULL)
{
text->reset();
@ -268,11 +268,14 @@ void CollationElementIterator::reset()
bufferAlias = NULL;
expIndex = 0;
*/
ucol_reset(m_data_);
}
void CollationElementIterator::setOffset(UTextOffset newOffset,
UErrorCode& status)
{
/*
if (U_FAILURE(status))
return;
@ -280,6 +283,8 @@ void CollationElementIterator::setOffset(UTextOffset newOffset,
text->setIndex(newOffset);
bufferAlias = NULL;
*/
ucol_setOffset(m_data_, newOffset, &status);
}
/**
@ -290,7 +295,7 @@ void CollationElementIterator::setText(const UnicodeString& source,
{
if (U_FAILURE(status))
return;
/*
bufferAlias = 0;
if (text == NULL)
@ -300,6 +305,17 @@ void CollationElementIterator::setText(const UnicodeString& source,
text->setText(source, status);
text->setMode(orderAlias->getDecomposition());
}
*/
int32_t length = source.length();
UChar *string = new UChar[length];
source.extract(0, length, string);
m_data_->length_ = length;
if (m_data_->iteratordata_.isWritable &&
m_data_->iteratordata_.string != NULL)
uprv_free(m_data_->iteratordata_.string);
init_collIterate(string, length, &m_data_->iteratordata_, TRUE);
}
// Sets the source to the new character iterator.
@ -309,6 +325,7 @@ void CollationElementIterator::setText(CharacterIterator& source,
if (U_FAILURE(status))
return;
/*
bufferAlias = 0;
if (text == NULL)
@ -318,38 +335,52 @@ void CollationElementIterator::setText(CharacterIterator& source,
text->setMode(orderAlias->getDecomposition());
text->setText(source, status);
}
*/
int32_t length = source.getLength();
UChar *buffer = new UChar[length];
/*
Using this constructor will prevent buffer from being removed when
string gets removed
*/
UnicodeString string(buffer, length, length);
source.getText(string);
string.extract(0, length, buffer);
m_data_->length_ = length;
if (m_data_->iteratordata_.isWritable &&
m_data_->iteratordata_.string != NULL)
uprv_free(m_data_->iteratordata_.string);
init_collIterate(buffer, length, &m_data_->iteratordata_, TRUE);
}
/**
* Masks a collation order down to the differences that matter at the
* strength of the collator attached to this iterator.
* @param order the collation order to mask
* @return the masked order; unchanged for strengths other than primary and
*         secondary
*/
int32_t CollationElementIterator::strengthOrder(int32_t order) const
{
  /* the strength now comes from the C collator held in the wrapper data */
  switch (ucol_getStrength(m_data_->collator_))
  {
  case UCOL_PRIMARY:
    order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY;
    break;
  case UCOL_SECONDARY:
    order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY;
    break;
  default:
    /* nothing to mask off */
    break;
  }
  return order;
}
// CollationElementIterator private constructors/destructors ------------------
/* CollationElementIterator private constructors/destructors --------------- */
// This private method will never be called, but it makes the linker happy
CollationElementIterator::CollationElementIterator() : text(0), bufferAlias(0),
ownBuffer(new VectorOfInt(2)),
reorderBuffer(0), expIndex(0),
orderAlias(0)
/*
This private method will never be called, but it makes the linker happy
CollationElementIterator::CollationElementIterator() : m_data_(0)
{
}
*/
/**
* Constructs an iterator over an empty text for the given collator.
* @param order the collator whose C collator (order->ucollator) drives the
*              iteration
*/
CollationElementIterator::CollationElementIterator(
                                               const RuleBasedCollator* order)
  : m_data_(NULL), isDataOwned_(TRUE)
{
  /*
  This signature has no error code to report through, so the status is local.
  On failure ucol_openElements returns NULL and m_data_ stays NULL.
  */
  UErrorCode status = U_ZERO_ERROR;
  m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status);
}
/**
@ -359,17 +390,12 @@ CollationElementIterator::CollationElementIterator(
CollationElementIterator::CollationElementIterator(
const UnicodeString& sourceText,
const RuleBasedCollator* order,
UErrorCode& status)
: text(NULL),
bufferAlias(NULL),
ownBuffer(new VectorOfInt(2)),
reorderBuffer(0),
expIndex(0),
orderAlias(order)
UErrorCode& status)
{
if (U_FAILURE(status))
return;
/*
if ( sourceText.length() != 0 )
{
// A CollationElementIterator is really a two-layered beast.
@ -386,6 +412,8 @@ CollationElementIterator::CollationElementIterator(
if (text == NULL)
status = U_MEMORY_ALLOCATION_ERROR;
}
*/
m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status);
}
/**
@ -393,20 +421,16 @@ CollationElementIterator::CollationElementIterator(
* the source text using the specified collator
*/
CollationElementIterator::CollationElementIterator(
const CharacterIterator& sourceText,
const RuleBasedCollator* order,
UErrorCode& status)
: text(NULL),
bufferAlias(NULL),
ownBuffer(new VectorOfInt(2)),
reorderBuffer(0),
expIndex(0),
orderAlias(order)
const CharacterIterator& sourceText,
const RuleBasedCollator* order,
UErrorCode& status)
: isDataOwned_(TRUE)
{
if (U_FAILURE(status))
return;
// **** should I just drop this test? ****
/*
if ( sourceText.endIndex() != 0 )
{
// A CollationElementIterator is really a two-layered beast.
@ -423,15 +447,29 @@ CollationElementIterator::CollationElementIterator(
if (text == NULL)
status = U_MEMORY_ALLOCATION_ERROR;
}
*/
int32_t length = sourceText.getLength();
UChar *buffer = new UChar[length];
/*
Using this constructor will prevent buffer from being removed when
string gets removed
*/
UnicodeString string(buffer, length, length);
// synwee sourceText.getText(string);
string.extract(0, length, buffer);
m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status);
// synwee ucol_setText(m_data_, buffer, length, TRUE, &status);
}
// CollationElementIterator private methods -----------------------------------
/* CollationElementIterator private methods -------------------------------- */
const CollationElementIterator& CollationElementIterator::operator=(
const CollationElementIterator& other)
const CollationElementIterator& other)
{
if (this != &other)
{
/*
expIndex = other.expIndex;
delete text;
text = (Normalizer*)other.text->clone();
@ -455,6 +493,8 @@ const CollationElementIterator& CollationElementIterator::operator=(
bufferAlias = other.bufferAlias;
orderAlias = other.orderAlias;
*/
this->m_data_ = other.m_data_;
}
return *this;

View file

@ -3,6 +3,9 @@
* Copyright (C) 1996-1999, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* Modification history
* Date Name Comments
* 02/16/2001 synwee Added internal method getPrevSpecialCE
*/
#include "ucolimp.h"
@ -1089,6 +1092,140 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
return order; /* return the CE */
}
/*
* Backward-iteration counterpart of ucol_getNextUCA.
* Looks up the collation element for ch in the UCA tables; special CEs are
* resolved through getSpecialPrevCE. If nothing is found, an implicit CE is
* generated algorithmically (Hangul syllables, surrogate pairs, everything
* else).
* @param ch               the code unit that was just read
* @param collationSource  iteration state; its position and CE buffer may be
*                         updated
* @param length           length of the original source text
* @param status           error code, handed on to getSpecialPrevCE
* @return the collation element, 0 for completely ignorable input
*/
uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
                         uint32_t length, UErrorCode *status)
{
  uint32_t order;
  if (ch < 0xFF)
    order = UCA->latinOneMapping[ch];
  else
    order = ucmp32_get(UCA->mapping, ch);

  if (order >= UCOL_NOT_FOUND)
    order = getSpecialPrevCE(UCA, order, collationSource, length, status);

  if (order == UCOL_NOT_FOUND)
  {
    /*
    This is where we have to resort to algorithmic generation.
    Iterating backwards, ch may be the second (trail) surrogate of a pair, in
    which case the preceding code unit is needed to build a bigger CE.
    */
    UChar nextChar;
    const int
      SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
      LCount = 19, VCount = 21, TCount = 28,
      NCount = VCount * TCount, /* 588 */
      SCount = LCount * NCount; /* 11172 */

    /*
    once we have failed to find a match for the code point, and are in the
    implicit code.
    */
    unsigned int L = ch - SBase;
    if (L < SCount)
    { /* since it is unsigned, catches the below-SBase case too */
      /*
      divide into pieces.
      we do it in this order since some compilers can do % and / in one
      operation
      */
      int T = L % TCount;
      L /= TCount;
      int V = L % VCount;
      L /= VCount;

      /* offset them */
      L += LBase;
      V += VBase;
      T += TBase;

      /*
      return the first CE, but first put the rest into the expansion buffer
      NOTE(review): this mirrors the forward iterator - the L element is
      returned first and toReturn is not advanced - confirm that this is the
      intended order for backward iteration.
      */
      if (!collationSource->JamoSpecial)
      {
        *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V);
        if (T != TBase)
          *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, T);
        /* return first one */
        return ucmp32_get(UCA->mapping, L);
      }
      else
      {
        /*
        Jamo is Special
        do recursive processing of L, V, and T with fetchCE (but T only if not
        equal to TBase!!)
        Since fetchCE returns a CE, and (potentially) stuffs items into the ce
        buffer, this is how it is done.
        Not implemented yet; control falls through to the code below.
        */
        /*
        int firstCE = fetchCE(L, ...);
        // set pointer, leave gap!
        int* lastExpansion = expansionBufferEnd++;
        *lastExpansion = fetchCE(V,...);
        if (T != TBase) {
          lastExpansion = expansionBufferEnd++; // set pointer, leave gap!
          *lastExpansion = fetchCE(T,...);
        }
        */
      }
    }

    if (UTF_IS_SECOND_SURROGATE(ch))
    {
      if ((collationSource->len - collationSource->pos != length) &&
          (UTF_IS_FIRST_SURROGATE(nextChar = *collationSource->pos)))
      {
        /*
        ch is the trail surrogate and nextChar, the code unit before it, is
        the lead surrogate, so it is nextChar that supplies the high bits.
        */
        uint32_t cp = (((uint32_t)nextChar << 10UL) + ch -
                       ((0xd800 << 10UL) + 0xdc00));

        /* step back over the lead surrogate that was just consumed */
        if (collationSource->pos != collationSource->writableBuffer)
          collationSource->pos --;
        else
        {
          /* reached the start of the writable buffer, return to the text */
          collationSource->pos = collationSource->string +
                  (length - (collationSource->len - collationSource->writableBuffer));
          collationSource->len = collationSource->string + length;
          collationSource->isThai = TRUE;
        }

        if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00))
          return 0; /* illegal code value, use completely ignorable! */

        /*
        This is a code point minus 0x10000, that's what algorithm requires
        */
        order = 0xE0010303 | (cp & 0xFFE00) << 8;
        *(collationSource->CEpos ++) = 0x80200080 | (cp & 0x001FF) << 22;
        collationSource->toReturn ++;
      }
      else
        return 0; /* completely ignorable */
    }
    else
    {
      /* otherwise */
      if (UTF_IS_FIRST_SURROGATE(ch) || (ch & 0xFFFE) == 0xFFFE)
        return 0; /* completely ignorable */

      /* Make up an artificial CE from code point as per UCA */
      order = 0xD08003C3 | (ch & 0xF000) << 12 | (ch & 0x0FE0) << 11;
      *(collationSource->CEpos ++) = 0x04000080 | (ch & 0x001F) << 27;
      collationSource->toReturn ++;
    }
  }
  return order; /* return the CE */
}
/* This function handles the special CEs like contractions, expansions, surrogates, Thai */
/* It is called by both getNextCE and getNextUCA */
uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status) {
@ -1201,6 +1338,175 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
return CE;
}
/**
* This function handles the special CEs like contractions, expansions,
* surrogates, Thai while iterating backwards.
* It is called by both UCOL_GETPREVCE and ucol_getPrevUCA.
* synwee
* @param coll    collator whose image/contraction tables are consulted
* @param CE      the special collation element to resolve
* @param source  iteration state; position, buffers and CE buffer may change
* @param length  length of the original source text
* @param status  set to U_INTERNAL_PROGRAM_ERROR for an unknown tag and to
*                U_MEMORY_ALLOCATION_ERROR if the Thai buffer cannot be
*                allocated
* @return the resolved CE, or UCOL_NOT_FOUND when the caller has to fall back
*/
uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
                          collIterate *source, uint32_t length,
                          UErrorCode *status)
{
  uint32_t        count       = 0;
  const uint32_t *CEOffset    = NULL;
  const UChar    *UCharOffset = NULL;
  UChar           schar,
                  tchar;
  const UChar    *strend      = NULL;
  const UChar    *constart    = NULL;
  uint32_t        size;

  while (TRUE)
  {
    switch (getCETag(CE))
    {
    case NOT_FOUND_TAG:
      return CE;
    case SURROGATE_TAG:
      /* pending surrogate discussion with Markus and Mark */
      return UCOL_NOT_FOUND;
    case THAI_TAG:
      if (source->isThai == TRUE)
      { /* if we encountered Thai prevowel & the string is not yet touched */
        source->isThai = FALSE;
        /*
        sigh... to cater for getNextCE, we'll have to modify and store the
        whole string instead of a substring as in getSpecialCE
        */
        UCharOffset = source->pos;
        strend      = source->len;
        size        = strend - source->string;
        if (size > UCOL_WRITABLE_BUFFER_SIZE)
        {
          /*
          someone else has already allocated something
          */
          if (source->writableBuffer != source->stackWritableBuffer)
            uprv_free(source->writableBuffer);
          source->writableBuffer =
                                (UChar *)uprv_malloc(size * sizeof(UChar));
          if (source->writableBuffer == NULL)
          {
            /* leave the iterator usable and report the failure */
            source->writableBuffer = source->stackWritableBuffer;
            source->isThai         = TRUE;
            *status = U_MEMORY_ALLOCATION_ERROR;
            return UCOL_NO_MORE_CES;
          }
        }
        UChar *sourceCopy = source->string;
        UChar *targetCopy = source->writableBuffer;
        while (sourceCopy < strend)
        {
          /*
          A prevowel only needs swapping when a base consonant follows it
          inside the text; never look at the code unit past the end.
          */
          if (UCOL_ISTHAIPREVOWEL(*sourceCopy) &&
              (sourceCopy + 1 < strend) &&
              UCOL_ISTHAIBASECONSONANT(*(sourceCopy + 1)))
          {
            /* This is the combination that needs to be swapped */
            *(targetCopy)     = *(sourceCopy + 1);
            *(targetCopy + 1) = *(sourceCopy);
            targetCopy += 2;
            sourceCopy += 2;
          }
          else
            *(targetCopy ++) = *(sourceCopy ++);
        }
        /* continue from the same offset, but inside the rewritten copy */
        source->pos   = source->writableBuffer +
                        (UCharOffset - source->string);
        source->len   = targetCopy;
        source->CEpos = source->toReturn = source->CEs;
        CE = UCOL_IGNORABLE;
      }
      else
      {
        /*
        we have already played with the string, so treat Thai as a length one
        expansion
        */
        /* find the offset to expansion table */
        CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
        CE = *CEOffset ++;
      }
      break;
    case CONTRACTION_TAG:
      /* This should handle contractions */
      while (TRUE)
      {
        /*
        First we position ourselves at the beginning of contraction sequence
        */
        constart = UCharOffset = (UChar *)coll->image + getContractOffset(CE);
        strend = source->len;

        if (strend - source->pos == length)
        { /* this is the start of string */
          CE = *(coll->contractionCEs +
                 (UCharOffset - coll->contractionIndex));
          break;
        }

        /*
        Progressing to backwards block
        */
        UCharOffset += *UCharOffset;
        schar = *source->pos;
        while (schar > (tchar = *UCharOffset))
          UCharOffset ++;

        if (schar != tchar)
        {
          /*
          we didn't find the correct codepoint. We can use either the first or
          the last CE
          */
          if (tchar != 0xFFFF)
            UCharOffset = constart;
        }
        else
        {
          /* Move up one character */
          if (source->pos != source->writableBuffer)
            source->pos --;
          else
          {
            source->pos = source->string +
                          (length - (source->len - source->writableBuffer));
            source->len = source->string + length;
            source->isThai = TRUE;
          }
        }

        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
        if (!isContraction(CE))
          break;
      }
      break;
    case EXPANSION_TAG:
      /*
      This should handle expansion.
      NOTE: we can encounter both continuations and expansions in an expansion!
      I have to decide where continuations are going to be dealt with
      */
      /* find the offset to expansion table */
      CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
      size     = getExpansionCount(CE);
      if (size != 0)
        /*
        if there are less than 16 elements in expansion, we don't terminate
        */
        for (count = 0; count < size; count ++)
          *(source->CEpos ++) = *CEOffset ++;
      else
        /* else, we do */
        while (*CEOffset != 0)
          *(source->CEpos ++) = *CEOffset ++;

      /* going backwards, the last CE of the expansion is handed out first */
      source->toReturn = source->CEpos - 1;
      return *(source->toReturn --);
    case CHARSET_TAG:
      /* probably after 1.8 */
      return UCOL_NOT_FOUND;
    default:
      *status = U_INTERNAL_PROGRAM_ERROR;
      CE = 0;
      break;
    }

    /* stop as soon as the CE is no longer a special one */
    if (CE <= UCOL_NOT_FOUND)
      break;
  }
  return CE;
}
/* This should really be a macro */
/* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
/* anyway */

View file

@ -1,18 +1,36 @@
/*
*******************************************************************************
******************************************************************************
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
******************************************************************************
*
* File ucoleitr.cpp
*
* Modification History:
*
* Date Name Description
* 02/15/2001 synwee Modified all methods to process its own function
* instead of calling the equivalent c++ api (coleitr.h)
******************************************************************************/
#include "unicode/ucoleitr.h"
#include "unicode/ustring.h"
#include "unicode/coleitr.h"
#include "unicode/sortkey.h"
#include "ucolimp.h"
#include "cmemory.h"
#define BUFFER_LENGTH 100
typedef struct collIterate collIterator;
/* public methods ---------------------------------------------------- */
/**
* Since this is going to be deprecated, I'll leave it as it is
*/
U_CAPI int32_t
ucol_keyHashCode( const uint8_t* key,
int32_t length)
ucol_keyHashCode(const uint8_t *key,
int32_t length)
{
CollationKey newKey(key, length);
return newKey.hashCode();
@ -20,88 +38,160 @@ ucol_keyHashCode( const uint8_t* key,
/**
* Opens a collation element iterator over the given text.
* The text is not copied; the iterator data is initialised as not writable.
* @param coll        collator that generates the collation elements
* @param text        text to iterate over
* @param textLength  number of code units in text, or -1 if text is
*                    null-terminated
* @param status      error code; set to U_MEMORY_ALLOCATION_ERROR if the
*                    iterator cannot be allocated
* @return the new iterator, or NULL on failure
*/
UCollationElements*
ucol_openElements(const UCollator  *coll,
                  const UChar      *text,
                        int32_t    textLength,
                        UErrorCode *status)
{
  UCollationElements *result;

  if (U_FAILURE(*status))
    return NULL;

  result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
  if (result == NULL)
  {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }

  result->collator_ = coll;

  /* gets the correct length of the null-terminated string */
  if (textLength == -1)
    textLength = (text != NULL) ? u_strlen(text) : 0;

  result->length_ = textLength;
  /* NOTE(review): normalization_ is not assigned anywhere in this function */
  init_collIterate(text, textLength, &result->iteratordata_, FALSE);

  return result;
}
/**
* Closes a collation element iterator and releases everything it owns:
* a heap-allocated writable buffer, the text if it is marked writable, and
* the iterator itself.
* @param elems the iterator to close; NULL is ignored
*/
U_CAPI void
ucol_closeElements(UCollationElements *elems)
{
  /* ucol_openElements hands out NULL on failure, so tolerate it here */
  if (elems == NULL)
    return;

  collIterate *ci = &elems->iteratordata_;

  if (ci->writableBuffer != ci->stackWritableBuffer)
    uprv_free(ci->writableBuffer);
  if (ci->isWritable && ci->string != NULL)
    uprv_free(ci->string);
  uprv_free(elems);
}
/**
* Resets the iterator to the beginning of its text and empties the CE buffer.
* @param elems the iterator to reset
*/
U_CAPI void
ucol_reset(UCollationElements *elems)
{
  collIterate *ci = &(elems->iteratordata_);

  ci->pos   = ci->string;
  ci->len   = ci->string + elems->length_;
  ci->CEpos = ci->toReturn = ci->CEs;
  /*
  problem here, that means we'll have to keep calculating the new thai set
  whenever we reset. maybe getSpecialCE should just do up the whole string
  instead of only a substring of it.
  */
  ci->isThai = TRUE;
  /* drop a heap-allocated rewrite buffer and fall back to the stack one */
  if (ci->stackWritableBuffer != ci->writableBuffer)
  {
    uprv_free(ci->writableBuffer);
    ci->writableBuffer = ci->stackWritableBuffer;
  }
}
/**
* Gets the next collation element of the text.
* @param elems  the iterator
* @param status error code; nothing is done if it already indicates a failure
* @return the next collation element as produced by UCOL_GETNEXTCE, or
*         UCOL_NULLORDER if status indicates a failure on entry
*/
U_CAPI int32_t
ucol_next(UCollationElements *elems,
          UErrorCode         *status)
{
  if (U_FAILURE(*status))
    return UCOL_NULLORDER;

  int32_t result;
  UCOL_GETNEXTCE(result, elems->collator_, elems->iteratordata_, status);
  return result;
}
/**
* Gets the previous collation element of the text.
* @param elems  the iterator
* @param status error code; nothing is done if it already indicates a failure
* @return the previous collation element as produced by UCOL_GETPREVCE, or
*         UCOL_NULLORDER if status indicates a failure on entry
*/
U_CAPI int32_t
ucol_previous(UCollationElements *elems,
              UErrorCode         *status)
{
  if (U_FAILURE(*status))
    return UCOL_NULLORDER;

  int32_t result;
  UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_,
                 elems->length_, status);
  return result;
}
/**
* Gets the maximum length of any expansion sequence that ends with the given
* collation order.
* Not implemented yet: always returns -1.
* @param elems the iterator (currently unused)
* @param order the collation order (currently unused)
* @return -1
*/
U_CAPI int32_t
ucol_getMaxExpansion(const UCollationElements *elems,
                           int32_t            order)
{
  /*
  synwee : requested this implementation from vladimir, need discussion. so
  hang on.
  The C++ getMaxExpansion now forwards to this function, so it must not call
  back into the C++ class.
  */
  return -1;
}
/**
* Sets the text the iterator walks over.
* The new text is not copied. A previous text that was marked writable is
* released first.
* @param elems       the iterator
* @param text        the new text
* @param textLength  number of code units in text, or -1 if text is
*                    null-terminated
* @param status      error code; nothing is done if it already indicates a
*                    failure
*/
U_CAPI void
ucol_setText(      UCollationElements *elems,
             const UChar              *text,
                   int32_t            textLength,
                   UErrorCode         *status)
{
  if (U_FAILURE(*status))
    return;

  /* gets the correct length of the null-terminated string */
  if (textLength == -1)
    textLength = (text != NULL) ? u_strlen(text) : 0;

  elems->length_ = textLength;

  if (elems->iteratordata_.isWritable && elems->iteratordata_.string != NULL)
    uprv_free(elems->iteratordata_.string);
  init_collIterate(text, textLength, &elems->iteratordata_, FALSE);
}
/**
* Gets the offset of the iterator within its text.
* @param elems the iterator
* @return the current offset
*/
U_CAPI UTextOffset
ucol_getOffset(const UCollationElements *elems)
{
  const collIterate *ci = &(elems->iteratordata_);

  /* the text has not been rewritten, pos still points into it */
  if (ci->isThai == TRUE)
    return ci->pos - ci->string;

  /*
  if it is a thai string with reversed elements, since getNextCE does not
  store only a substring in writeablebuffer, we'll have to do some calculation
  to get the offset out.
  need discussion to see if it is a better idea to store the whole string
  instead.
  */
  return elems->length_ - (ci->len - ci->pos);
}
/**
* Moves the iterator to the given offset in its text and empties the CE
* buffer.
* @param elems   the iterator
* @param offset  the new offset into the text (not range-checked here)
* @param status  error code; nothing is done if it already indicates a failure
*/
U_CAPI void
ucol_setOffset(UCollationElements *elems,
               UTextOffset        offset,
               UErrorCode         *status)
{
  if (U_FAILURE(*status))
    return;

  collIterate *ci = &(elems->iteratordata_);

  ci->pos   = ci->string + offset;
  /*
  pos is back in the original text, so the end pointer has to be as well;
  otherwise it may still point into the writable buffer released below
  (ucol_reset restores it in the same way).
  */
  ci->len   = ci->string + elems->length_;
  ci->CEpos = ci->toReturn = ci->CEs;
  /*
  problem here, that means we'll have to keep calculating the new thai set
  whenever we reset. maybe getSpecialCE should just do up the whole string
  instead of only a substring of it.
  */
  ci->isThai = TRUE;
  if (ci->stackWritableBuffer != ci->writableBuffer)
  {
    uprv_free(ci->writableBuffer);
    ci->writableBuffer = ci->stackWritableBuffer;
  }
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1998-2000, International Business Machines
* Copyright (C) 1998-2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -14,6 +14,11 @@
*
* created on: 2000dec11
* created by: Vladimir Weinstein
*
* Modification history
* Date Name Comments
* 02/16/2001 synwee Added UCOL_GETPREVCE for the use in ucoleitr
*
*/
#ifndef UCOL_IMP_H
@ -62,6 +67,28 @@ struct collIterate {
UChar *writableBuffer;
};
/**
* State of a C collation element iterator (see ucoleitr.cpp).
*/
struct UCollationElements
{
/**
* Locale specific collator for generating the collation elements.
* Not owned by this struct.
*/
const UCollator *collator_;
/**
* Normalization mode, not exactly the same as the data in collator_.
* If the collation strength requested is UCOL_IDENTICAL, this mode will be
* UNORM_NONE; otherwise it follows collator_.
* NOTE(review): ucol_openElements does not assign this field - confirm where
* it is meant to be initialised.
*/
UNormalizationMode normalization_;
/**
* Struct wrapper for source data: the text, the current position in it and
* the buffered collation elements. Set up through init_collIterate.
*/
collIterate iteratordata_;
/**
* Source text length, in code units.
*/
int32_t length_;
};
struct incrementalContext {
UCharForwardIterator *source;
void *sourceContext;
@ -196,9 +223,61 @@ struct incrementalContext {
} \
}
/**
* Macro that gets the previous simple CE (backward iteration).
* It first checks the expansion buffer. If the buffer is not empty, ie the
* end pointer of the expansion buffer (CEpos) is beyond its start (CEs), the
* collation element at the return pointer (toReturn) is handed out and the
* return pointer is decremented; once the return pointer reaches the start,
* the buffer is marked empty again.
* Otherwise, if the position is at the start of the text, order is set to
* UCOL_NO_MORE_CES. If not, the code unit at the current position is read,
* the position is moved back by one (switching from the writable buffer back
* to the original text when the start of the buffer is reached), and the
* code unit is looked up in the collator's tables.
* For more complicated CEs it resorts to getSpecialPrevCE and, if that
* yields UCOL_NOT_FOUND, to ucol_getPrevUCA.
*
* order   receives the collation element
* coll    the collator (const UCollator *)
* data    the collIterate, passed as an object, not as a pointer
* length  length of the original source text
* status  UErrorCode pointer handed on to the helper functions
*
* All arguments are evaluated more than once, so they must not have side
* effects.
* NOTE(review): the code unit is read at data.pos before the position is
* decremented, and the buffer is reset when toReturn equals CEs after the
* decrement - confirm both boundaries against the callers' conventions.
*/
#define UCOL_GETPREVCE(order, coll, data, length, status) { \
if (data.CEpos > data.CEs) { \
(order) = *(data.toReturn --); \
if (data.CEs == data.toReturn) { \
data.CEpos = data.toReturn = data.CEs; \
} \
} \
else { \
if (data.len - data.pos == length) { \
(order) = UCOL_NO_MORE_CES; \
} \
else { \
UChar ch = *(data.pos); \
if (data.pos != data.writableBuffer) { \
data.pos --; \
} \
else { \
data.pos = data.string + \
(length - (data.len - data.writableBuffer)); \
data.len = data.string + length; \
data.isThai = TRUE; \
} \
if (ch <= 0xFF) { \
(order) = (coll)->latinOneMapping[ch]; \
} \
else { \
(order) = ucmp32_get((coll)->mapping, ch); \
} \
if ((order) >= UCOL_NOT_FOUND) { \
(order) = getSpecialPrevCE((coll), (order), &(data), (length), \
(status)); \
if ((order) == UCOL_NOT_FOUND) { \
(order) = ucol_getPrevUCA(ch, &(data), (length), (status)); \
} \
} \
} \
} \
}
uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status);
uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE,
collIterate *source, uint32_t length,
UErrorCode *status);
U_CFUNC uint32_t ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status);
uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *status);
uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource,
uint32_t length, UErrorCode *status);
void incctx_cleanUpContext(incrementalContext *ctx);
UChar incctx_appendChar(incrementalContext *ctx, UChar c);

View file

@ -1,8 +1,8 @@
/*
*****************************************************************************************
******************************************************************************
* Copyright (C) 1997-1999, International Business Machines
* Corporation and others. All Rights Reserved.
*****************************************************************************************
******************************************************************************
*/
/**
@ -14,12 +14,14 @@
*
* Modification History:
*
* Date Name Description
* Date Name Description
*
* 8/18/97 helena Added internal API documentation.
* 08/03/98 erm Synched with 1.2 version CollationElementIterator.java
* 12/10/99 aliu Ported Thai collation support from Java.
* 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h)
* 8/18/97 helena Added internal API documentation.
* 08/03/98 erm Synched with 1.2 version CollationElementIterator.java
* 12/10/99 aliu Ported Thai collation support from Java.
* 01/25/01 swquek Modified into a C++ wrapper calling C APIs (ucoliter.h)
* 02/19/01 swquek Removed CollationElementIterator() since it is a
* private constructor and no calls are made to it
*/
#ifndef COLEITR_H
@ -27,16 +29,22 @@
// #include "unicode/unistr.h"
#include "unicode/tblcoll.h"
#include "unicode/ucoleitr.h"
// #include "tables.h"
// #include "unicode/chariter.h"
// have to do this because the include path in the main project does not have
// tables.h.
class VectorOfInt;
// class VectorOfInt;
// class Normalizer;
// class VectorOfPToContractElement;
// class RuleBasedCollator;
// typedef void * UCollationElements;
// struct UCollationElements;
typedef struct UCollationElements UCollationElements;
/**
* The CollationElementIterator class is used as an iterator to walk through
* each character of an international string. Use the iterator to return the
@ -225,6 +233,8 @@ protected:
// CollationElementIterator protected constructors --------------------------
friend RuleBasedCollator;
/**
* CollationElementIterator constructor. This takes the source string and the
* collation object. The cursor will walk thru the source string based on the
@ -265,15 +275,17 @@ protected:
private:
friend class RuleBasedCollator;
// friend class RuleBasedCollator;
// CollationElementIterator private data members ----------------------------
static const int32_t UNMAPPEDCHARVALUE;
// static const int32_t UNMAPPEDCHARVALUE;
/*
Normalizer* text; // owning
VectorOfInt* bufferAlias; // not owned
*/
/**
* ownBuffer wants to be a subobject, not a pointer, but that means exposing
@ -282,7 +294,7 @@ private:
* is used to handle Thai collation; bufferAlias points to ownBuffer in some
* situations. [j159 - aliu]
*/
VectorOfInt* ownBuffer;
// VectorOfInt* ownBuffer;
/**
* reorderBuffer is created on demand, so it doesn't want to be a subobject --
@ -290,18 +302,30 @@ private:
* conditions. Once created, it is reused for the life of this object. Because
* of the implementation of VectorOfInt, it grows monotonically. [j159 - aliu]
*/
/*
VectorOfInt* reorderBuffer;
int32_t expIndex;
UnicodeString key;
const RuleBasedCollator* orderAlias;
*/
/**
* Data wrapper for collation elements
*/
UCollationElements *m_data_;
/**
* Indicates if m_data_ belongs to this object.
*/
UBool isDataOwned_;
// CollationElementIterator private constructor/destructor ------------------
/**
* Default constructor.
*/
CollationElementIterator();
/* CollationElementIterator(); */
/**
* Constructor.
@ -377,7 +401,7 @@ inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)
/**
 * Gets the maximum length of any expansion sequence that ends with the
 * specified collation order.
 * @param order a collation order returned by previous or next
 * @return the result of ucol_getMaxExpansion() for the wrapped C iterator
 */
inline int32_t CollationElementIterator::getMaxExpansion(int32_t order) const
{
  // Delegate to the C API through m_data_. The earlier
  // orderAlias->getMaxExpansion(order) call is dropped: orderAlias is no
  // longer a live data member, and leaving that return in place made this
  // statement unreachable.
  return ucol_getMaxExpansion(m_data_, order);
}
inline UBool CollationElementIterator::isIgnorable(int32_t order)

View file

@ -3,22 +3,32 @@
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
*
* File ucoleitr.h
*
* Modification History:
*
* Date Name Description
* 02/15/2001 synwee Modified all methods to process its own function
* instead of calling the equivalent c++ api (coleitr.h)
*******************************************************************************/
#ifndef UCOLEITR_H
#define UCOLEITR_H
/** This indicates the last element in a UCollationElements has been consumed.
*
/**
* This indicates the last element in a UCollationElements has been consumed.
*/
#define UCOL_NULLORDER 0xFFFFFFFF
#include "unicode/ucol.h"
/** The UCollationElements struct.
* For usage in C programs.
/**
* The UCollationElements struct.
* For usage in C programs.
*/
/* Previously: typedef void * UCollationElements;
 * That definition must not coexist with the struct typedef below (the two
 * declarations conflict). A C-style comment is used so that C compilers
 * accept this header. */
typedef struct UCollationElements UCollationElements;
/**
* The UCollationElements is used as an iterator to walk through
@ -66,7 +76,7 @@ typedef void * UCollationElements;
* a collation order is its primary order; the next 8 bits is the secondary
* order and the last 8 bits is the tertiary order.
*
* @see Collator
* @see UCollator
*/
/**
@ -76,13 +86,13 @@ typedef void * UCollationElements;
* @param text The text to iterate over.
* @param textLength The number of characters in text, or -1 if null-terminated
* @param status A pointer to an UErrorCode to receive any errors.
* @stable
* @return a struct containing collation element information
*/
U_CAPI UCollationElements*
ucol_openElements( const UCollator *coll,
const UChar *text,
int32_t textLength,
UErrorCode *status);
ucol_openElements(const UCollator *coll,
const UChar *text,
int32_t textLength,
UErrorCode *status);
/**
* get a hash code for a key... Not very useful!
@ -95,7 +105,6 @@ ucol_keyHashCode(const uint8_t* key, int32_t length);
* Close a UCollationElements.
* Once closed, a UCollationElements may no longer be used.
* @param elems The UCollationElements to close.
* @stable
*/
U_CAPI void
ucol_closeElements(UCollationElements *elems);
@ -106,7 +115,6 @@ ucol_closeElements(UCollationElements *elems);
* @param elems The UCollationElements to reset.
* @see ucol_next
* @see ucol_previous
* @stable
*/
U_CAPI void
ucol_reset(UCollationElements *elems);
@ -116,13 +124,11 @@ ucol_reset(UCollationElements *elems);
* A single character may contain more than one collation element.
* @param elems The UCollationElements containing the text.
* @param status A pointer to an UErrorCode to receive any errors.
* @return The next collation elements ordering, or \Ref{UCOL_NULLORDER} if the
* end of the text is reached.
* @stable
* @return The next collation elements ordering, or \Ref{UCOL_NULLORDER} if
* the end of the text is reached.
*/
U_CAPI int32_t
ucol_next( UCollationElements *elems,
UErrorCode *status);
ucol_next(UCollationElements *elems, UErrorCode *status);
/**
* Get the ordering priority of the previous collation element in the text.
@ -131,11 +137,9 @@ ucol_next( UCollationElements *elems,
* @param status A pointer to an UErrorCode to receive any errors.
* @return The previous collation elements ordering, or \Ref{UCOL_NULLORDER}
* if the start of the text is reached.
* @stable
*/
U_CAPI int32_t
ucol_previous( UCollationElements *elems,
UErrorCode *status);
ucol_previous(UCollationElements *elems, UErrorCode *status);
/**
* Get the maximum length of any expansion sequences that end with the
@ -144,28 +148,24 @@ ucol_previous( UCollationElements *elems,
* @param elems The UCollationElements containing the text.
* @param order A collation order returned by previous or next.
* @return The maximum length of any expansion sequences ending with the
* specified order.
* @stable
* specified order.
*/
U_CAPI int32_t
ucol_getMaxExpansion( const UCollationElements *elems,
int32_t order);
ucol_getMaxExpansion(const UCollationElements *elems, int32_t order);
/**
* Set the text containing the collation elements.
*
* @param elems The UCollationElements to set.
* @param text The source text containing the collation elements.
* @param textLength The length of text, or -1 if null-terminated.
* @param status A pointer to an UErrorCode to receive any errors.
* @see ucol_getText
* @stable
*/
U_CAPI void
ucol_setText( UCollationElements *elems,
const UChar *text,
int32_t textLength,
UErrorCode *status);
ucol_setText( UCollationElements *elems,
const UChar *text,
int32_t textLength,
UErrorCode *status);
/**
* Get the offset of the current source character.
@ -174,7 +174,6 @@ ucol_setText( UCollationElements *elems,
* @param elems The UCollationElements to query.
* @return The offset of the current source character.
* @see ucol_setOffset
* @stable
*/
U_CAPI UTextOffset
ucol_getOffset(const UCollationElements *elems);
@ -186,11 +185,10 @@ ucol_getOffset(const UCollationElements *elems);
* @param offset The desired character offset.
* @param status A pointer to an UErrorCode to receive any errors.
* @see ucol_getOffset
* @stable
*/
U_CAPI void
ucol_setOffset( UCollationElements *elems,
UTextOffset offset,
UErrorCode *status);
ucol_setOffset(UCollationElements *elems,
UTextOffset offset,
UErrorCode *status);
#endif