ICU-861

Implemented backwards iterator for collation X-SVN-Rev: 3679
2025-04-07 22:44:49 +00:00 · 2001-02-20 00:26:50 +00:00 · 2001-02-20 00:26:50 +00:00 · 78a57a7680
commit 78a57a7680
parent 2352c3b293
6 changed files with 704 additions and 167 deletions
--- a/icu4c/source/i18n/coleitr.cpp
+++ b/icu4c/source/i18n/coleitr.cpp
@ -14,58 +14,53 @@
 *
 * Modification History:
 *
-*  Date         Name          Description
+*  Date      Name        Description
 *
-*  6/23/97     helena      Adding comments to make code more readable.
-* 08/03/98     erm         Synched with 1.2 version of CollationElementIterator.java
-* 12/10/99      aliu          Ported Thai collation support from Java.
-* 01/25/01     swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
+*  6/23/97   helena      Adding comments to make code more readable.
+* 08/03/98   erm         Synched with 1.2 version of CollationElementIterator.java
+* 12/10/99   aliu        Ported Thai collation support from Java.
+* 01/25/01   swquek      Modified to a C++ wrapper calling C APIs (ucoliter.h)
+* 02/19/01   swquek      Removed CollationElementIterator() since it is a
+*                        private constructor and no calls are made to it
 */

 // #include "unicode/sortkey.h"
 #include "unicode/coleitr.h"
+#include "ucolimp.h"
+#include "cmemory.h"

 // #include "unicode/chariter.h"
-#include "tables.h"
+// #include "tables.h"
 // #include "unicode/normlzr.h"
 // #include "unicode/unicode.h"
 // #include "tcoldata.h"
 // #include "ucmp32.h"

-// Constants ------------------------------------------------------------------
+/* Constants --------------------------------------------------------------- */

+/* synwee : public can't remove */
 int32_t const CollationElementIterator::NULLORDER = 0xffffffff;
-int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000;
+// int32_t const CollationElementIterator::UNMAPPEDCHARVALUE = 0x7fff0000;

-// CollationElementIterator public constructor/destructor ---------------------
+/* CollationElementIterator public constructor/destructor ------------------ */

 CollationElementIterator::CollationElementIterator(
-                                          const CollationElementIterator& other)
-                                          : text(0), 
-                                          ownBuffer(new VectorOfInt(2)),
-                                          reorderBuffer(0), 
-                                          expIndex(other.expIndex)
+                                         const CollationElementIterator& other) 
+                                         : isDataOwned_(TRUE)
 {
  *this = other;
 }

 CollationElementIterator::~CollationElementIterator()
 {
-  delete text;
-  text = NULL;
-  bufferAlias = NULL;
-  orderAlias = NULL;
-  delete ownBuffer;
-  delete reorderBuffer;
+  ucol_closeElements(m_data_);
 }

-// CollationElementIterator public methods ------------------------------------
+/* CollationElementIterator public methods --------------------------------- */

 UTextOffset CollationElementIterator::getOffset() const
 {
-  // Since the DecompositionIterator is doing the work of iterating through
-  // the text string, we can just ask it what its offset is.
-  return (text != NULL) ? text->getIndex() : 0;
+  return ucol_getOffset(m_data_);
 }

 /**
@ -75,6 +70,7 @@ UTextOffset CollationElementIterator::getOffset() const
 */
 int32_t CollationElementIterator::next(UErrorCode& status)
 {
+  /*
  if (text == NULL || U_FAILURE(status))
    return NULLORDER;
    
@ -111,9 +107,8 @@ int32_t CollationElementIterator::next(UErrorCode& status)
  // Ask the collator for this character's ordering.
  // Used to be RuleBasedCollator.getUnicodeOrder(). 
  // It can't be inlined in tblcoll.h file unfortunately.
-  /*
-  synwee : have to modify this part
-  int32_t value = ucmp32_get(orderAlias->data->mapping, ch);
+  
+    int32_t value = ucmp32_get(orderAlias->data->mapping, ch);

  if (value == RuleBasedCollator::UNMAPPED)
  {
@ -153,21 +148,22 @@ int32_t CollationElementIterator::next(UErrorCode& status)

  return strengthOrder(value);
  */
-  return 0;
+  return ucol_next(m_data_, &status);
 }

 UBool CollationElementIterator::operator!=(
-                                   const CollationElementIterator& other) const
+                                  const CollationElementIterator& other) const
 {
  return !(*this == other);
 }

-UBool CollationElementIterator::operator==(const CollationElementIterator& that) 
-                                                                           const
+UBool CollationElementIterator::operator==(
+                                    const CollationElementIterator& that) const
 {
  if (this == &that)
    return TRUE;
-    
+  
+  /*
  if (*text != *(that.text))
    return FALSE;
    
@ -182,6 +178,9 @@ UBool CollationElementIterator::operator==(const CollationElementIterator& that)
    return FALSE;
    
  return TRUE;
+  */
+  
+  return m_data_ == that.m_data_;
 }

 /**
@ -192,6 +191,7 @@ UBool CollationElementIterator::operator==(const CollationElementIterator& that)
 */
 int32_t CollationElementIterator::previous(UErrorCode& status)
 {
+  /*
  if (text == NULL || U_FAILURE(status))
    return NULLORDER;
    
@ -212,8 +212,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status)
    
  // Used to be RuleBasedCollator.getUnicodeOrder(). It can't be inlined in 
  // tblcoll.h file unfortunately.
-  /*
-
+  
  int32_t value = ucmp32_get(orderAlias->data->mapping, ch);

  if (value == RuleBasedCollator::UNMAPPED)
@ -252,7 +251,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status)

  return strengthOrder(value);
  */
-  return 0;
+  return ucol_previous(m_data_, &status);
 }

 /**
@ -260,6 +259,7 @@ int32_t CollationElementIterator::previous(UErrorCode& status)
 */
 void CollationElementIterator::reset()
 {
+  /*
  if (text != NULL)
  {
    text->reset();
@ -268,11 +268,14 @@ void CollationElementIterator::reset()

  bufferAlias = NULL;
  expIndex = 0;
+  */
+  ucol_reset(m_data_);
 }

 void CollationElementIterator::setOffset(UTextOffset newOffset, 
                                         UErrorCode& status)
 {
+  /*
  if (U_FAILURE(status))
    return;
    
@ -280,6 +283,8 @@ void CollationElementIterator::setOffset(UTextOffset newOffset,
    text->setIndex(newOffset);
    
  bufferAlias = NULL;
+  */
+  ucol_setOffset(m_data_, newOffset, &status);
 }

 /**
@ -290,7 +295,7 @@ void CollationElementIterator::setText(const UnicodeString& source,
 {
  if (U_FAILURE(status))
    return;
-    
+  /*
  bufferAlias = 0;

  if (text == NULL)
@ -300,6 +305,17 @@ void CollationElementIterator::setText(const UnicodeString& source,
    text->setText(source, status);
    text->setMode(orderAlias->getDecomposition());
  }
+  */
+  int32_t length = source.length();
+  UChar *string = new UChar[length];
+  source.extract(0, length, string);
+	
+  m_data_->length_ = length;
+
+  if (m_data_->iteratordata_.isWritable && 
+      m_data_->iteratordata_.string != NULL)
+    uprv_free(m_data_->iteratordata_.string);
+  init_collIterate(string, length, &m_data_->iteratordata_, TRUE);
 }

 // Sets the source to the new character iterator.
@ -309,6 +325,7 @@ void CollationElementIterator::setText(CharacterIterator& source,
  if (U_FAILURE(status)) 
    return;
    
+  /*
  bufferAlias = 0;

  if (text == NULL)
@ -318,38 +335,52 @@ void CollationElementIterator::setText(CharacterIterator& source,
    text->setMode(orderAlias->getDecomposition());
    text->setText(source, status);
  }
+  */
+  int32_t length = source.getLength();
+  UChar *buffer = new UChar[length];
+  /* 
+  Using this constructor will prevent buffer from being removed when
+  string gets removed
+  */
+  UnicodeString string(buffer, length, length);
+  source.getText(string);
+  string.extract(0, length, buffer);
+  m_data_->length_ = length;
+
+  if (m_data_->iteratordata_.isWritable && 
+      m_data_->iteratordata_.string != NULL)
+    uprv_free(m_data_->iteratordata_.string);
+  init_collIterate(buffer, length, &m_data_->iteratordata_, TRUE);
 }

 int32_t CollationElementIterator::strengthOrder(int32_t order) const
 {
-  Collator::ECollationStrength s = orderAlias->getStrength();
+  UCollationStrength s = ucol_getStrength(m_data_->collator_);
  // Mask off the unwanted differences.
-  if (s == Collator::PRIMARY)
+  if (s == UCOL_PRIMARY)
    order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY;
  else 
-    if (s == Collator::SECONDARY)
+    if (s == UCOL_SECONDARY)
      order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY;
    
  return order;
 }

-// CollationElementIterator private constructors/destructors ------------------
+/* CollationElementIterator private constructors/destructors --------------- */

-// This private method will never be called, but it makes the linker happy
-CollationElementIterator::CollationElementIterator() : text(0), bufferAlias(0),
-                                                  ownBuffer(new VectorOfInt(2)), 
-                                                  reorderBuffer(0), expIndex(0),
-                                                  orderAlias(0)
+/* 
+This private method will never be called, but it makes the linker happy
+CollationElementIterator::CollationElementIterator() : m_data_(0)
 {
 }
+*/

 CollationElementIterator::CollationElementIterator(
-                                                 const RuleBasedCollator* order)
-                                               : text(0), bufferAlias(0),
-                                                 ownBuffer(new VectorOfInt(2)),
-                                                 reorderBuffer(0), expIndex(0),
-                                                 orderAlias(order)
+                                              const RuleBasedCollator* order)
+                                              : isDataOwned_(TRUE)
 {
+  UErrorCode status = U_ZERO_ERROR;
+  m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status);
 }

 /** 
@ -359,17 +390,12 @@ CollationElementIterator::CollationElementIterator(
 CollationElementIterator::CollationElementIterator(
                                                const UnicodeString& sourceText,
                                                const RuleBasedCollator* order,
-                                                UErrorCode& status) 
-                                                : text(NULL),
-                                                  bufferAlias(NULL),
-                                                  ownBuffer(new VectorOfInt(2)),
-                                                  reorderBuffer(0),
-                                                  expIndex(0), 
-                                                  orderAlias(order)
+                                                UErrorCode& status)
 {
  if (U_FAILURE(status))
    return;
-    
+ 
+  /*
  if ( sourceText.length() != 0 ) 
  {
    // A CollationElementIterator is really a two-layered beast.
@ -386,6 +412,8 @@ CollationElementIterator::CollationElementIterator(
    if (text == NULL)
      status = U_MEMORY_ALLOCATION_ERROR;
  }
+  */
+  m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status);
 }

 /** 
@ -393,20 +421,16 @@ CollationElementIterator::CollationElementIterator(
 * the source text using the specified collator
 */
 CollationElementIterator::CollationElementIterator(
-                                            const CharacterIterator& sourceText,
-                                            const RuleBasedCollator* order,
-                                            UErrorCode& status) 
-                                            : text(NULL),
-                                              bufferAlias(NULL),
-                                              ownBuffer(new VectorOfInt(2)),
-                                              reorderBuffer(0),
-                                              expIndex(0), 
-                                              orderAlias(order)
+                                           const CharacterIterator& sourceText,
+                                           const RuleBasedCollator* order,
+                                           UErrorCode& status)
+                                           : isDataOwned_(TRUE)
 {
  if (U_FAILURE(status))
    return;
    
  // **** should I just drop this test? ****
+  /*
  if ( sourceText.endIndex() != 0 )
  {
    // A CollationElementIterator is really a two-layered beast.
@ -423,15 +447,29 @@ CollationElementIterator::CollationElementIterator(
    if (text == NULL)
      status = U_MEMORY_ALLOCATION_ERROR;    
  }
+  */
+  int32_t length = sourceText.getLength();
+  UChar *buffer = new UChar[length];
+  /* 
+  Using this constructor will prevent buffer from being removed when
+  string gets removed
+  */
+  UnicodeString string(buffer, length, length);
+  // synwee sourceText.getText(string);
+  string.extract(0, length, buffer);
+  
+  m_data_ = ucol_openElements(order->ucollator, NULL, 0, &status);
+  // synwee ucol_setText(m_data_, buffer, length, TRUE, &status);
 }

-// CollationElementIterator private methods -----------------------------------
+/* CollationElementIterator private methods -------------------------------- */

 const CollationElementIterator& CollationElementIterator::operator=(
-                                          const CollationElementIterator& other)
+                                         const CollationElementIterator& other)
 {
  if (this != &other)
  {
+    /*
    expIndex = other.expIndex;
    delete text;
    text = (Normalizer*)other.text->clone();
@ -455,6 +493,8 @@ const CollationElementIterator& CollationElementIterator::operator=(
        bufferAlias = other.bufferAlias;
        
      orderAlias = other.orderAlias;
+    */
+    this->m_data_ = other.m_data_;
  }

  return *this;
--- a/icu4c/source/i18n/ucol.cpp
+++ b/icu4c/source/i18n/ucol.cpp
@ -3,6 +3,9 @@
 *   Copyright (C) 1996-1999, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
+* Modification history
+* Date        Name      Comments
+* 02/16/2001  synwee    Added internal method getPrevSpecialCE 
 */

 #include "ucolimp.h"
@ -1089,6 +1092,140 @@ uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *sta
    return order; /* return the CE */
 }

+/* 
+* This function tries to get a CE from the UCA, which should always be around.
+* The UChar is passed in to speed things up. This is also where implicit CEs
+* are generated.
+*/
+uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource, 
+                         uint32_t length, UErrorCode *status) 
+{
+  uint32_t order;
+  if (ch < 0xFF) 
+    order = UCA->latinOneMapping[ch];
+  else
+    order = ucmp32_get(UCA->mapping, ch);
+  
+  if (order >= UCOL_NOT_FOUND)
+    order = getSpecialPrevCE(UCA, order, collationSource, length, status); 
+  
+  if (order == UCOL_NOT_FOUND) 
+  { 
+    /* 
+    This is where we have to resort to algorithmic generation.
+    We have to check if ch is possibly a first surrogate - then we need to 
+    take the next code unit and make a bigger CE 
+    */
+    UChar nextChar;
+    const int 
+      SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7,
+      LCount = 19, VCount = 21, TCount = 28,
+      NCount = VCount * TCount,   // 588
+      SCount = LCount * NCount,   // 11172
+      LLimit = LBase + LCount,    // 1113
+      VLimit = VBase + VCount,    // 1176
+      TLimit = TBase + TCount,    // 11C3
+      SLimit = SBase + SCount;    // D7A4
+
+    /* 
+    once we have failed to find a match for codepoint cp, and are in the 
+    implicit code.
+    */
+ 
+    unsigned int L = ch - SBase;
+    if (L < SCount) 
+    { /* since L is unsigned, this also rejects ch < SBase (it wraps around) */
+
+      /* 
+      divide into pieces.
+      we do it in this order since some compilers can do % and / in one 
+      operation
+      */
+      int T = L % TCount; 
+      L /= TCount;
+      int V = L % VCount;
+      L /= VCount;
+
+      /* offset them */
+      L += LBase;
+      V += VBase;
+      T += TBase;
+
+      /* 
+      return the first CE, but first put the rest into the expansion buffer
+      */
+      if (!collationSource->JamoSpecial) 
+      { 
+        *(collationSource->CEpos ++) = ucmp32_get(UCA->mapping, V);
+        if (T != TBase)
+          *(collationSource->CEpos++) = ucmp32_get(UCA->mapping, T);
+        /* return first one */
+        return ucmp32_get(UCA->mapping, L); 
+      } else { 
+        /* 
+        Jamo is Special
+        do recursive processing of L, V, and T with fetchCE (but T only if not 
+        equal to TBase!!)
+        Since fetchCE returns a CE, and (potentially) stuffs items into the ce 
+        buffer,
+        this is how it is done.
+        */
+        /*
+          int firstCE = fetchCE(L, ...);
+          // set pointer, leave gap!
+          int* lastExpansion = expansionBufferEnd++; 
+          *lastExpansion = fetchCE(V,...);
+          if (T != TBase) {
+            lastExpansion = expansionBufferEnd++; // set pointer, leave gap!
+            *lastExpansion = fetchCE(T,...);
+          }
+        */
+        }
+    }
+
+    if (UTF_IS_SECOND_SURROGATE(ch)) 
+    {
+      if ((collationSource->len - collationSource->pos != length) &&
+                  (UTF_IS_FIRST_SURROGATE(nextChar = *collationSource->pos))) 
+      {
+        uint32_t cp = ((ch << 10UL) + nextChar - ((0xd800 << 10UL) + 0xdc00));
+        if (collationSource->pos != collationSource->writableBuffer)
+          collationSource->pos --;
+        else
+        {
+          collationSource->pos = collationSource->string + 
+           (length - (collationSource->len - collationSource->writableBuffer));
+          collationSource->len = collationSource->string + length;
+          collationSource->isThai = TRUE;
+        }
+        if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00))
+          return 0;  /* illegal code value, use completely ignorable! */
+        
+        /* 
+        This is a code point minus 0x10000, that's what algorithm requires 
+        */
+        order = 0xE0010303 | (cp & 0xFFE00) << 8;
+        *(collationSource->CEpos ++) = 0x80200080 | (cp & 0x001FF) << 22;
+        collationSource->toReturn ++;
+      } 
+      else
+        return 0; /* completely ignorable */
+    } 
+    else 
+    {
+      /* otherwise */
+      if (UTF_IS_FIRST_SURROGATE(ch) || (ch & 0xFFFE) == 0xFFFE)
+        return 0; /* completely ignorable */
+      
+      /* Make up an artificial CE from code point as per UCA */
+      order = 0xD08003C3 | (ch & 0xF000) << 12 | (ch & 0x0FE0) << 11;
+      *(collationSource->CEpos ++) = 0x04000080 | (ch & 0x001F) << 27;
+      collationSource->toReturn ++;
+    }
+  }
+  return order; /* return the CE */
+}
+
 /* This function handles the special CEs like contractions, expansions, surrogates, Thai */
 /* It is called by both getNextCE and getNextUCA                                         */
 uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status) {
@ -1201,6 +1338,175 @@ uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, U
  return CE;
 }

+/** 
+* This function handles the special CEs like contractions, expansions, 
+* surrogates, Thai.
+* It is called by both getPrevCE and getPrevUCA                        
+* synwee 
+*/
+uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE, 
+                          collIterate *source, uint32_t length, 
+                          UErrorCode *status) 
+{
+        uint32_t count        = 0;
+  const uint32_t *CEOffset    = NULL;
+  const UChar    *UCharOffset = NULL;
+        UChar    schar, 
+                 tchar;
+  const UChar    *strend      = NULL;
+  const UChar    *constart    = NULL;
+        uint32_t size;
+  while (TRUE)
+  {
+    switch (getCETag(CE)) 
+    {
+    case NOT_FOUND_TAG:
+      return CE;
+    case SURROGATE_TAG:
+      /* pending surrogate discussion with Markus and Mark */
+      return UCOL_NOT_FOUND;
+    case THAI_TAG:
+      if (source->isThai == TRUE) 
+      { /* if we encountered Thai prevowel & the string is not yet touched */
+        source->isThai = FALSE;
+        /*
+        sigh... to cater for getNextCE, we'll have to modify and store the 
+        whole string instead of a substring as in getSpecialCE
+        */
+        UCharOffset = source->pos;
+        strend =  source->len;
+        size = strend - source->string;
+        if (size > UCOL_WRITABLE_BUFFER_SIZE) 
+        {
+          /*
+          someone else has already allocated something
+          */
+          if (source->writableBuffer != source->stackWritableBuffer)
+            uprv_free(source->writableBuffer);
+          source->writableBuffer = 
+            (UChar *)uprv_malloc(size * sizeof(UChar));
+          source->isThai = FALSE;
+        } 
+        UChar *sourceCopy = source->string;
+        UChar *targetCopy = source->writableBuffer;
+        while (sourceCopy < strend)
+        {
+	        if (UCOL_ISTHAIPREVOWEL(*sourceCopy) &&      
+            /* This is the combination that needs to be swapped */
+		        UCOL_ISTHAIBASECONSONANT(*(sourceCopy + 1))) 
+          {
+		        *(targetCopy) = *(sourceCopy + count + 1);
+		        *(targetCopy+1) = *(sourceCopy + count);
+		        targetCopy+=2;
+		        sourceCopy+=2;
+	        } 
+          else
+		        *(targetCopy++) = *(sourceCopy++);
+        }
+        source->pos   = source->writableBuffer + 
+                                               (UCharOffset - source->string);
+        source->len   = targetCopy;
+        source->CEpos = source->toReturn = source->CEs;
+        CE = UCOL_IGNORABLE;
+      } 
+      else 
+      { 
+        /* 
+        we have already played with the string, so treat Thai as a length one 
+        expansion 
+        */
+        /* find the offset to expansion table */
+        CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 
+        CE = *CEOffset ++;
+      }
+      break;
+    case CONTRACTION_TAG:
+      /* This should handle contractions */
+      while (TRUE)
+      {
+        /* 
+        First we position ourselves at the beginning of the contraction sequence 
+        */
+        constart = UCharOffset = (UChar *)coll->image + getContractOffset(CE);
+        strend = source->len;
+
+        if (strend - source->pos == length) 
+        { /* this is the start of string */
+          CE = *(coll->contractionCEs + 
+                 (UCharOffset - coll->contractionIndex)); 
+          break;
+        }
+
+        /*
+        Progressing to backwards block
+        */
+        UCharOffset += *UCharOffset; 
+
+        schar = *source->pos;
+        while (schar > (tchar = *UCharOffset)) 
+          UCharOffset ++;
+        
+        if (schar != tchar) 
+        { 
+          /* 
+          we didn't find the correct codepoint. We can use either the first or 
+          the last CE 
+          */
+          if (tchar != 0xFFFF)
+            UCharOffset = constart; 
+        } 
+        else
+        {
+          /* Move up one character */
+          if (source->pos != source->writableBuffer)
+            source->pos --;
+          else
+          {
+            source->pos = source->string + 
+                          (length - (source->len - source->writableBuffer));
+            source->len = source->string + length;
+            source->isThai = TRUE;
+          }
+        }
+        CE = *(coll->contractionCEs + (UCharOffset - coll->contractionIndex));
+        if (!isContraction(CE))
+          break;  
+      }
+      break;
+    case EXPANSION_TAG:
+      /* 
+      This should handle expansion.
+      NOTE: we can encounter both continuations and expansions in an expansion! 
+      I have to decide where continuations are going to be dealt with 
+      */
+      /* find the offset to expansion table */
+      CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE); 
+      size     = getExpansionCount(CE);
+      if (size != 0) 
+        /* 
+        if there are fewer than 16 elements in expansion, the list is not terminated
+        */
+        for (count = 0; count < size; count++) 
+          *(source->CEpos ++) = *CEOffset++;
+      else  
+        /* else, we do */
+        while (*CEOffset != 0) 
+          *(source->CEpos ++) = *CEOffset ++;
+      source->toReturn = source->CEpos - 1;
+      return *(source->toReturn --);
+    case CHARSET_TAG:
+      /* probably after 1.8 */
+      return UCOL_NOT_FOUND;
+    default:
+      *status = U_INTERNAL_PROGRAM_ERROR;
+      CE=0;
+      break;
+    }
+    if (CE <= UCOL_NOT_FOUND) break;
+  }
+  return CE;
+}
+
 /* This should really be a macro        */
 /* However, it is used only when stack buffers are not sufficiently big, and then we're messed up performance wise */
 /* anyway */
--- a/icu4c/source/i18n/ucoleitr.cpp
+++ b/icu4c/source/i18n/ucoleitr.cpp
@ -1,18 +1,36 @@
 /*
-*******************************************************************************
+******************************************************************************
 *   Copyright (C) 2001, International Business Machines
 *   Corporation and others.  All Rights Reserved.
-*******************************************************************************
-*/
+******************************************************************************
+*
+* File ucoleitr.cpp
+*
+* Modification History:
+*
+* Date        Name        Description
+* 02/15/2001  synwee      Modified all methods to do their own processing 
+*                         instead of calling the equivalent C++ API (coleitr.h)
+******************************************************************************/

 #include "unicode/ucoleitr.h"
 #include "unicode/ustring.h"
-#include "unicode/coleitr.h"
+#include "unicode/sortkey.h"
+#include "ucolimp.h"
+#include "cmemory.h"

+#define BUFFER_LENGTH 100

+typedef struct collIterate collIterator;
+
+/* public methods ---------------------------------------------------- */
+
+/**
+* Since this is going to be deprecated, I'll leave it as it is
+*/
 U_CAPI int32_t
-ucol_keyHashCode(    const    uint8_t*    key, 
-            int32_t        length)
+ucol_keyHashCode(const uint8_t *key, 
+                       int32_t  length)
 {
  CollationKey newKey(key, length);
  return newKey.hashCode();
@ -20,88 +38,160 @@ ucol_keyHashCode(    const    uint8_t*    key,


 UCollationElements*
-ucol_openElements(    const    UCollator            *coll,
-            const    UChar                *text,
-            int32_t              textLength,
-            UErrorCode *status)
+ucol_openElements(const UCollator  *coll,
+                  const UChar      *text,
+                        int32_t    textLength,
+                        UErrorCode *status)
 {
-  int32_t len = (textLength == -1 ? u_strlen(text) : textLength);
-  const UnicodeString src((UChar*)text, len, len);
+  UCollationElements *result;

-  CollationElementIterator *iter = 0;
-  iter = ((RuleBasedCollator*)coll)->createCollationElementIterator(src);
-  if(iter == 0) {
-    *status = U_MEMORY_ALLOCATION_ERROR;
-    return 0;
-  }
+  if (U_FAILURE(*status))
+    return NULL;

-  return (UCollationElements*) iter;
+  result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
+
+  result->collator_ = coll;
+  
+  /* gets the correct length of the null-terminated string */
+  if (textLength == -1)
+    textLength = u_strlen(text);
+
+  result->length_ = textLength;
+  init_collIterate(text, textLength, &result->iteratordata_, FALSE);
+
+  return result;
 }

 U_CAPI void
 ucol_closeElements(UCollationElements *elems)
 {
-  delete (CollationElementIterator*)elems;
+  collIterate *ci = &elems->iteratordata_;
+  if (ci->writableBuffer != ci->stackWritableBuffer)
+    uprv_free(ci->writableBuffer);
+  if (elems->iteratordata_.isWritable && elems->iteratordata_.string != NULL)
+    uprv_free(elems->iteratordata_.string);
+  uprv_free(elems);
 }

 U_CAPI void
 ucol_reset(UCollationElements *elems)
 {
-  ((CollationElementIterator*)elems)->reset();
+  collIterate *ci = &(elems->iteratordata_);
+  ci->pos         = ci->string;
+  ci->len         = ci->string + elems->length_;
+  ci->CEpos       = ci->toReturn = ci->CEs;
+  /*
+  problem here, that means we'll have to keep calculating the new thai set
+  whenever we reset. maybe getSpecialCE should just do up the whole string
+  instead of only a substring of it.
+  */
+  ci->isThai      = TRUE;
+  if (ci->stackWritableBuffer != ci->writableBuffer)
+  {
+    uprv_free(ci->writableBuffer);
+    ci->writableBuffer = ci->stackWritableBuffer;
+  }
 }

 U_CAPI int32_t
-ucol_next(    UCollationElements    *elems,
-        UErrorCode            *status)
+ucol_next(UCollationElements *elems,
+          UErrorCode         *status)
 {
-  if(U_FAILURE(*status)) return UCOL_NULLORDER;
+  if (U_FAILURE(*status)) 
+    return UCOL_NULLORDER;

-  return ((CollationElementIterator*)elems)->next(*status);
+  int32_t result;
+  UCOL_GETNEXTCE(result, elems->collator_, elems->iteratordata_, status);
+  return result;
 }

 U_CAPI int32_t
-ucol_previous(    UCollationElements    *elems,
-        UErrorCode            *status)
+ucol_previous(UCollationElements *elems,
+              UErrorCode         *status)
 {
-  if(U_FAILURE(*status)) return UCOL_NULLORDER;
+  if(U_FAILURE(*status)) 
+    return UCOL_NULLORDER;

-  return ((CollationElementIterator*)elems)->previous(*status);
+  int32_t result;
+  UCOL_GETPREVCE(result, elems->collator_, elems->iteratordata_, 
+                 elems->length_, status);
+  return result;
 }

 U_CAPI int32_t
-ucol_getMaxExpansion(    const    UCollationElements    *elems,
-            int32_t                order)
+ucol_getMaxExpansion(const UCollationElements *elems,
+                           int32_t            order)
 {
-  return ((CollationElementIterator*)elems)->getMaxExpansion(order);
+  /* 
+  synwee : requested this implementation from vladimir, need discussion. so 
+  hang on.
+  */
+  /* return ((CollationElementIterator*)elems)->getMaxExpansion(order); */
+  return -1;
 }

 U_CAPI void
-ucol_setText(UCollationElements        *elems,
-         const    UChar                    *text,
-         int32_t                    textLength,
-         UErrorCode                *status)
+ucol_setText(      UCollationElements *elems,
+             const UChar              *text,
+                   int32_t            textLength,
+                   UErrorCode         *status)
 {
-  if(U_FAILURE(*status)) return;
+  if (U_FAILURE(*status)) 
+    return;
+  
+  /* gets the correct length of the null-terminated string */
+  if (textLength == -1)
+    textLength = u_strlen(text);

-  int32_t len = (textLength == -1 ? u_strlen(text) : textLength);
-  const UnicodeString src((UChar*)text, len, len);
+  elems->length_ = textLength;

-  ((CollationElementIterator*)elems)->setText(src, *status);
+  if (elems->iteratordata_.isWritable && elems->iteratordata_.string != NULL)
+    uprv_free(elems->iteratordata_.string);
+  init_collIterate(text, textLength, &elems->iteratordata_, FALSE);
 }

 U_CAPI UTextOffset
 ucol_getOffset(const UCollationElements *elems)
 {
-  return ((CollationElementIterator*)elems)->getOffset();
+  /* return ((CollationElementIterator*)elems)->getOffset(); */
+  const collIterate *ci = &(elems->iteratordata_);
+  if (ci->isThai == TRUE)
+    return ci->pos - ci->string;
+
+  /* 
+  if it is a thai string with reversed elements, since getNextCE does not 
+  store only a substring in writeablebuffer, we'll have to do some calculation
+  to get the offset out.
+  need discussion to see if it is a better idea to store the whole string 
+  instead.
+  */
+  return elems->length_ - (ci->len - ci->pos);
 }

 U_CAPI void
-ucol_setOffset(    UCollationElements    *elems,
-        UTextOffset            offset,
-        UErrorCode            *status)
+ucol_setOffset(UCollationElements    *elems,
+               UTextOffset           offset,
+               UErrorCode            *status)
 {
-  if(U_FAILURE(*status)) return;
-  
-  ((CollationElementIterator*)elems)->setOffset(offset, *status);
+  if (U_FAILURE(*status)) 
+    return;
+
+  collIterate *ci = &(elems->iteratordata_);
+  ci->pos         = ci->string + offset;
+  ci->CEpos       = ci->toReturn = ci->CEs;
+  /*
+  problem here, that means we'll have to keep calculating the new thai set
+  whenever we reset. maybe getSpecialCE should just do up the whole string
+  instead of only a substring of it.
+  */
+  ci->isThai      = TRUE;
+  if (ci->stackWritableBuffer != ci->writableBuffer)
+  {
+    uprv_free(ci->writableBuffer);
+    ci->writableBuffer = ci->stackWritableBuffer;
+  }
 }

+
+
+
--- a/icu4c/source/i18n/ucolimp.h
+++ b/icu4c/source/i18n/ucolimp.h
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1998-2000, International Business Machines
+*   Copyright (C) 1998-2001, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -14,6 +14,11 @@
 *
 *   created on: 2000dec11
 *   created by: Vladimir Weinstein
+* 
+* Modification history
+* Date        Name      Comments
+* 02/16/2001  synwee    Added UCOL_GETPREVCE for the use in ucoleitr
+*                       
 */

 #ifndef UCOL_IMP_H
@ -62,6 +67,28 @@ struct collIterate {
  UChar *writableBuffer;
 };

+struct UCollationElements
+{
+  /**
+  * Locale specific collator for generating the collation elements
+  */
+  const UCollator          *collator_;
+  /**
+  * Normalization mode, not exactly the same as the data in collator_.
+  * If the collation strength requested is UCOL_IDENTICAL, this mode will be
+  * UNORM_NONE, otherwise it follows collator_.
+  */
+        UNormalizationMode normalization_;
+  /**
+  * Struct wrapper for source data
+  */
+        collIterate        iteratordata_;
+  /**
+  * Source text length
+  */
+        int32_t            length_;
+};
+
 struct incrementalContext {
    UCharForwardIterator *source; 
    void *sourceContext;
@ -196,9 +223,61 @@ struct incrementalContext {
    }                                                                                 \
 }

+/**
+* Macro that gets the previous collation element (CE) for backwards iteration.
+* It first checks the expansion buffer. If the expansion buffer is not empty,
+* ie the end pointer (CEpos) is different from the start pointer (CEs), it
+* returns the CE at the return pointer (toReturn) and decrements that pointer.
+* Otherwise it looks up the CE of the character at the current position; for
+* special CEs it resorts to getSpecialPrevCE and then ucol_getPrevUCA.
+*/
+#define UCOL_GETPREVCE(order, coll, data, length, status) {                  \
+  if (data.CEpos > data.CEs) {                                               \
+    (order) = *(data.toReturn --);                                           \
+    if (data.CEs == data.toReturn) {                                         \
+      data.CEpos = data.toReturn = data.CEs;                                 \
+    }                                                                        \
+  }                                                                          \
+  else {                                                                     \
+    if (data.len - data.pos == length) {                                     \
+      (order) = UCOL_NO_MORE_CES;                                            \
+    }                                                                        \
+    else {                                                                   \
+      UChar ch = *(data.pos);                                                \
+      if (data.pos != data.writableBuffer) {                                 \
+        data.pos --;                                                         \
+      }                                                                      \
+      else {                                                                 \
+        data.pos = data.string +                                             \
+                            (length - (data.len - data.writableBuffer));     \
+        data.len = data.string + length;                                     \
+        data.isThai = TRUE;                                                  \
+      }                                                                      \
+      if (ch <= 0xFF) {                                                      \
+        (order) = (coll)->latinOneMapping[ch];                               \
+      }                                                                      \
+      else {                                                                 \
+        (order) = ucmp32_get((coll)->mapping, ch);                           \
+      }                                                                      \
+      if ((order) >= UCOL_NOT_FOUND) {                                       \
+        (order) = getSpecialPrevCE((coll), (order), &(data), (length),       \
+                                                             (status));      \
+        if ((order) == UCOL_NOT_FOUND) {                                     \
+          (order) = ucol_getPrevUCA(ch, &(data), (length), (status));                  \
+        }                                                                    \
+      }                                                                      \
+    }                                                                        \
+  }                                                                          \
+}
+
 uint32_t getSpecialCE(const UCollator *coll, uint32_t CE, collIterate *source, UErrorCode *status);
+uint32_t getSpecialPrevCE(const UCollator *coll, uint32_t CE, 
+                          collIterate *source, uint32_t length, 
+                          UErrorCode *status);
 U_CFUNC uint32_t ucol_getNextCE(const UCollator *coll, collIterate *collationSource, UErrorCode *status);
 uint32_t ucol_getNextUCA(UChar ch, collIterate *collationSource, UErrorCode *status);
+uint32_t ucol_getPrevUCA(UChar ch, collIterate *collationSource, 
+                         uint32_t length, UErrorCode *status);
 void incctx_cleanUpContext(incrementalContext *ctx);
 UChar incctx_appendChar(incrementalContext *ctx, UChar c);

--- a/icu4c/source/i18n/unicode/coleitr.h
+++ b/icu4c/source/i18n/unicode/coleitr.h
@ -1,8 +1,8 @@
 /*
-*****************************************************************************************
+******************************************************************************
 *   Copyright (C) 1997-1999, International Business Machines
 *   Corporation and others.  All Rights Reserved.
-*****************************************************************************************
+******************************************************************************
 */

 /**
@ -14,12 +14,14 @@
 *
 * Modification History:
 *
-*  Date         Name          Description
+*  Date       Name        Description
 *
-*  8/18/97     helena      Added internal API documentation.
-* 08/03/98        erm            Synched with 1.2 version CollationElementIterator.java
-* 12/10/99      aliu          Ported Thai collation support from Java.
-* 01/25/01     swquek      Modified into a C++ wrapper calling C APIs (ucoliter.h)
+*  8/18/97    helena      Added internal API documentation.
+* 08/03/98    erm         Synched with 1.2 version CollationElementIterator.java
+* 12/10/99    aliu        Ported Thai collation support from Java.
+* 01/25/01    swquek      Modified into a C++ wrapper calling C APIs (ucoliter.h)
+* 02/19/01    swquek      Removed CollationElementIterator() since it is a
+*                         private constructor and no calls are made to it
 */

 #ifndef COLEITR_H
@ -27,16 +29,22 @@

 // #include "unicode/unistr.h"
 #include "unicode/tblcoll.h"
+#include "unicode/ucoleitr.h"
+
 // #include "tables.h"
 // #include "unicode/chariter.h"

 // have to do this because the include path in the main project does not have 
 // tables.h.
-class VectorOfInt;
+// class VectorOfInt;
 // class Normalizer;
 // class VectorOfPToContractElement;
 // class RuleBasedCollator;

+// typedef void * UCollationElements;
+// struct UCollationElements;
+typedef struct UCollationElements UCollationElements;
+
 /**
 * The CollationElementIterator class is used as an iterator to walk through     
 * each character of an international string. Use the iterator to return the
@ -225,6 +233,8 @@ protected:
  
  // CollationElementIterator protected constructors --------------------------

+  friend class RuleBasedCollator;
+
  /**
  * CollationElementIterator constructor. This takes the source string and the 
  * collation object. The cursor will walk thru the source string based on the 
@ -265,15 +275,17 @@ protected:

 private:

-  friend  class   RuleBasedCollator;
+  // friend  class   RuleBasedCollator;

  // CollationElementIterator private data members ----------------------------

-  static const int32_t UNMAPPEDCHARVALUE;
+  // static const int32_t UNMAPPEDCHARVALUE;

+  /* 
  Normalizer* text;       // owning 

  VectorOfInt* bufferAlias; // not owned
+  */

  /**
  * ownBuffer wants to be a subobject, not a pointer, but that means exposing 
@ -282,7 +294,7 @@ private:
  * is used to handle Thai collation; bufferAlias points to ownBuffer in some 
  * situations. [j159 - aliu]
  */
-  VectorOfInt* ownBuffer;
+  // VectorOfInt* ownBuffer;

  /**
  * reorderBuffer is created on demand, so it doesn't want to be a subobject -- 
@ -290,18 +302,30 @@ private:
  * conditions. Once created, it is reused for the life of this object. Because 
  * of the implementation of VectorOfInt, it grows monotonically. [j159 - aliu]
  */
+  /*
  VectorOfInt* reorderBuffer;

  int32_t expIndex;
  UnicodeString key;
  const RuleBasedCollator* orderAlias;
+  */
+
+  /**
+  * Data wrapper for collation elements
+  */
+  UCollationElements *m_data_;
+
+  /**
+  * Indicates if m_data_ belongs to this object.
+  */
+  UBool isDataOwned_;
  
  // CollationElementIterator private constructor/destructor ------------------

  /**
  * Default constructor.
  */
-  CollationElementIterator();
+  /* CollationElementIterator(); */
  
  /**
  * Constructor.
@ -377,7 +401,7 @@ inline int32_t CollationElementIterator::tertiaryOrder(int32_t order)

 inline int32_t CollationElementIterator::getMaxExpansion(int32_t order) const
 {
-  return orderAlias->getMaxExpansion(order);
+  return ucol_getMaxExpansion(m_data_, order);
 }

 inline UBool CollationElementIterator::isIgnorable(int32_t order)
--- a/icu4c/source/i18n/unicode/ucoleitr.h
+++ b/icu4c/source/i18n/unicode/ucoleitr.h
@ -3,22 +3,32 @@
 *   Copyright (C) 2001, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *******************************************************************************
-*/
+*
+* File ucoleitr.h
+*
+* Modification History:
+*
+* Date        Name        Description
+* 02/15/2001  synwee      Modified all methods to process its own function 
+*                         instead of calling the equivalent c++ api (coleitr.h)
+*******************************************************************************/

 #ifndef UCOLEITR_H
 #define UCOLEITR_H

-/** This indicates the last element in a UCollationElements has been consumed. 
- *
+/**  
+ * This indicates the last element in a UCollationElements has been consumed.
 */
 #define UCOL_NULLORDER        0xFFFFFFFF

 #include "unicode/ucol.h"

-/** The UCollationElements struct.
- *  For usage in C programs.
+/** 
+ * The UCollationElements struct.
+ * For usage in C programs.
 */
-typedef void * UCollationElements;
+/* typedef void * UCollationElements; */
+typedef struct UCollationElements UCollationElements;

 /**
 * The UCollationElements  is used as an iterator to walk through
@ -66,7 +76,7 @@ typedef void * UCollationElements;
 * a collation order is its primary order; the next 8 bits is the secondary 
 * order and the last 8 bits is the tertiary order.
 *
- * @see                Collator
+ * @see UCollator
 */

 /**
@ -76,13 +86,13 @@ typedef void * UCollationElements;
 * @param text The text to iterate over.
 * @param textLength The number of characters in text, or -1 if null-terminated
 * @param status A pointer to an UErrorCode to receive any errors.
- * @stable
+ * @return a struct containing collation element information
 */
 U_CAPI UCollationElements*
-ucol_openElements(    const    UCollator       *coll,
-            const    UChar           *text,
-            int32_t                  textLength,
-            UErrorCode         *status);
+ucol_openElements(const UCollator  *coll,
+                  const UChar      *text,
+                        int32_t    textLength,
+                        UErrorCode *status);

 /**
 * get a hash code for a key... Not very useful!
@ -95,7 +105,6 @@ ucol_keyHashCode(const uint8_t* key, int32_t length);
 * Close a UCollationElements.
 * Once closed, a UCollationElements may no longer be used.
 * @param elems The UCollationElements to close.
- * @stable
 */
 U_CAPI void
 ucol_closeElements(UCollationElements *elems);
@ -106,7 +115,6 @@ ucol_closeElements(UCollationElements *elems);
 * @param elems The UCollationElements to reset.
 * @see ucol_next
 * @see ucol_previous
- * @stable
 */
 U_CAPI void
 ucol_reset(UCollationElements *elems);
@ -116,13 +124,11 @@ ucol_reset(UCollationElements *elems);
 * A single character may contain more than one collation element.
 * @param elems The UCollationElements containing the text.
 * @param status A pointer to an UErrorCode to receive any errors.
- * @return The next collation elements ordering, or \Ref{UCOL_NULLORDER} if the
- * end of the text is reached.
- * @stable
+ * @return The next collation elements ordering, or \Ref{UCOL_NULLORDER} if 
+ *         the end of the text is reached.
 */
 U_CAPI int32_t
-ucol_next(    UCollationElements    *elems,
-        UErrorCode        *status);
+ucol_next(UCollationElements *elems, UErrorCode *status);

 /**
 * Get the ordering priority of the previous collation element in the text.
@ -131,11 +137,9 @@ ucol_next(    UCollationElements    *elems,
 * @param status A pointer to an UErrorCode to receive any errors.
 * @return The previous collation elements ordering, or \Ref{UCOL_NULLORDER}
 * if the end of the text is reached.
- * @stable
 */
 U_CAPI int32_t
-ucol_previous(    UCollationElements    *elems,
-        UErrorCode        *status);
+ucol_previous(UCollationElements *elems, UErrorCode *status);

 /**
 * Get the maximum length of any expansion sequences that end with the 
@ -144,28 +148,24 @@ ucol_previous(    UCollationElements    *elems,
 * @param elems The UCollationElements containing the text.
 * @param order A collation order returned by previous or next.
 * @return The maximum length of any expansion sequences ending with the 
- * specified order.
- * @stable
+ *         specified order.
 */
 U_CAPI int32_t
-ucol_getMaxExpansion(    const    UCollationElements    *elems,
-            int32_t                order);
+ucol_getMaxExpansion(const UCollationElements *elems, int32_t order);

 /**
 * Set the text containing the collation elements.
- * This 
 * @param elems The UCollationElements to set.
 * @param text The source text containing the collation elements.
 * @param textLength The length of text, or -1 if null-terminated.
 * @param status A pointer to an UErrorCode to receive any errors.
 * @see ucol_getText
- * @stable
 */
 U_CAPI void
-ucol_setText(    UCollationElements    *elems,
-        const    UChar        *text,
-        int32_t            textLength,
-        UErrorCode        *status);
+ucol_setText(      UCollationElements *elems, 
+             const UChar              *text,
+                   int32_t            textLength,
+                   UErrorCode         *status);

 /**
 * Get the offset of the current source character.
@ -174,7 +174,6 @@ ucol_setText(    UCollationElements    *elems,
 * @param elems The UCollationElements to query.
 * @return The offset of the current source character.
 * @see ucol_setOffset
- * @stable
 */
 U_CAPI UTextOffset
 ucol_getOffset(const UCollationElements *elems);
@ -186,11 +185,10 @@ ucol_getOffset(const UCollationElements *elems);
 * @param offset The desired character offset.
 * @param status A pointer to an UErrorCode to receive any errors.
 * @see ucol_getOffset
- * @stable
 */
 U_CAPI void
-ucol_setOffset(    UCollationElements    *elems,
-        UTextOffset        offset,
-        UErrorCode        *status);
+ucol_setOffset(UCollationElements *elems,
+               UTextOffset        offset,
+               UErrorCode         *status);

 #endif