ICU-3944 text access, work in progress

X-SVN-Rev: 17988
2025-04-06 22:15:31 +00:00 · 2005-06-23 05:51:28 +00:00 · 2005-06-23 05:51:28 +00:00 · 32b19f04b2
commit 32b19f04b2
parent 9fc80fe9b2
9 changed files with 303 additions and 187 deletions
--- a/icu4c/source/common/dbbi.cpp
+++ b/icu4c/source/common/dbbi.cpp
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1999-2004 IBM Corp. All rights reserved.
+*   Copyright (C) 1999-2005 IBM Corp. All rights reserved.
 **********************************************************************
 *   Date        Name        Description
 *   12/1/99    rgillam     Complete port from Java.
@ -37,7 +37,7 @@ RuleBasedBreakIterator() {


 DictionaryBasedBreakIterator::DictionaryBasedBreakIterator(UDataMemory* rbbiData,
-                                                           const char* dictionaryFilename, 
+                                                           const char* dictionaryFilename,
                                                           UErrorCode& status)
 : RuleBasedBreakIterator(rbbiData, status)
 {
@ -143,7 +143,7 @@ DictionaryBasedBreakIterator::previous()
        reset();
        int32_t result = RuleBasedBreakIterator::previous();
        if (cachedBreakPositions != NULL) {
-            for (positionInCache=0; 
+            for (positionInCache=0;
                cachedBreakPositions[positionInCache] != result;
                positionInCache++);
            U_ASSERT(positionInCache < numCachedBreakPositions);
@ -334,7 +334,7 @@ BreakIterator *  DictionaryBasedBreakIterator::createBufferClone(void *stackBuff
    }

    //
-    //  If user buffer size is zero this is a preflight operation to 
+    //  If user buffer size is zero this is a preflight operation to
    //    obtain the needed buffer size, allowing for worst case misalignment.
    //
    if (bufferSize == 0) {
@ -367,7 +367,7 @@ BreakIterator *  DictionaryBasedBreakIterator::createBufferClone(void *stackBuff
    }

    //
-    //  Initialize the clone object.  
+    //  Initialize the clone object.
    //    TODO:  using an overloaded C++ "operator new" to directly initialize the
    //           copy in the user's buffer would be better, but it doesn't seem
    //           to get along with namespaces.  Investigate why.
@ -383,7 +383,7 @@ BreakIterator *  DictionaryBasedBreakIterator::createBufferClone(void *stackBuff
    if (status != U_SAFECLONE_ALLOCATED_WARNING) {
        clone->fBufferClone = TRUE;
    }
-    return clone;    
+    return clone;
 }


@ -405,15 +405,15 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
    // that needs to be kept with the word).  Seek from the beginning of the
    // range to the first dictionary character
    fText->setIndex(startPos);
-    UChar c = fText->current();
+    UChar32 c = fText->current32();
    while (isDictionaryChar(c) == FALSE) {
-        c = fText->next();
+        c = fText->next32();
    }

    if (U_FAILURE(status)) {
        return; // UStack below overwrites the status error codes
    }
-    
+
    // initialize.  We maintain two stacks: currentBreakPositions contains
    // the list of break positions that will be returned if we successfully
    // finish traversing the whole range now.  possibleBreakPositions lists
@ -429,9 +429,9 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
    // further, this saves us from having to follow each possible path
    // through the text all the way to the error (hopefully avoiding many
    // future recursive calls as well).
-    // there can be only one kind of error in UStack and UVector, so we'll 
+    // there can be only one kind of error in UStack and UVector, so we'll
    // just let the error fall through
-    UStack currentBreakPositions(status); 
+    UStack currentBreakPositions(status);
    UStack possibleBreakPositions(status);
    UVector wrongBreakPositions(status);

@ -456,8 +456,15 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
        return;
    }
    // initialize (we always exit the loop with a break statement)
-    c = fText->current();
+    c = fText->current32();
    for (;;) {
+        // The dictionary implementation doesn't do supplementary chars
+        // (U+10000 and up; the lookup below casts c down to a UChar).  Put them
+        // through as an unpaired surrogate, which will end any dictionary match
+        // in progress.  With any luck, this implementation will be retired soon.
+        if (c>=0x10000) {
+            c = 0xd800;
+        }

        // if we can transition to state "-1" from our current state, we're
        // on the last character of a legal word.  Push that position onto
@ -470,7 +477,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
        }

        // look up the new state to transition to in the dictionary
-        state = fTables->fDictionary->at(state, c);
+        state = fTables->fDictionary->at(state, (UChar)c);

        // if the character we're sitting on causes us to transition to
        // the "end of word" state, then it was a non-dictionary character
@ -515,7 +522,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
                        possibleBreakPositions.peeki())) {
                possibleBreakPositions.popi();
            }
-            
+
            // if we've used up all possible break-position combinations, there's
            // an error or an unknown word in the text.  In this case, we start
            // over, treating the farthest character we've reached as the beginning
@ -532,7 +539,8 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
                    }
                    bestBreakPositions.removeAllElements();
                    if (farthestEndPoint < endPos) {
-                        fText->setIndex(farthestEndPoint + 1);
+                        fText->setIndex(farthestEndPoint);
+                        fText->next32();
                    }
                    else {
                        break;
@ -547,7 +555,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
                            return;
                        }
                    }
-                    fText->next();
+                    fText->next32();
                    currentBreakPositions.push(fText->getIndex(), status);
                    if (U_FAILURE(status)) {
                        return;
@ -574,7 +582,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t

            // re-sync "c" for the next go-round, and drop out of the loop if
            // we've made it off the end of the range
-            c = fText->current();
+            c = fText->current32();
            if (fText->getIndex() >= endPos) {
                break;
            }
@ -583,7 +591,7 @@ DictionaryBasedBreakIterator::divideUpDictionaryRange(int32_t startPos, int32_t
        // if we didn't hit any exceptional conditions on this last iteration,
        // just advance to the next character and loop
        else {
-            c = fText->next();
+            c = fText->next32();
        }
    }

--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -1348,6 +1348,21 @@ UBool RuleBasedBreakIterator::isDictionaryChar(UChar32   c) {
 }


+//-------------------------------------------------------------------------------
+//
+//  UText functions
+//
+//-------------------------------------------------------------------------------
+void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
+    // TODO: implement this.
+}
+
+
+UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
+    // TODO: implement this.
+    return fillIn;
+}
+

 U_NAMESPACE_END

--- a/icu4c/source/common/ubrk.cpp
+++ b/icu4c/source/common/ubrk.cpp
@ -1,6 +1,6 @@
 /*
 *****************************************************************************************
-*   Copyright (C) 1996-2004, International Business Machines
+*   Copyright (C) 1996-2005, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *****************************************************************************************
 */
@ -70,14 +70,16 @@ ubrk_open(UBreakIteratorType type,
    return 0;
  }

-  UCharCharacterIterator *iter = 0;
-  iter = new UCharCharacterIterator(text, textLength);
-  if(iter == 0) {
-    *status = U_MEMORY_ALLOCATION_ERROR;
-    delete result;
-    return 0;
+  if (text != NULL) {
+      UCharCharacterIterator *iter = 0;
+      iter = new UCharCharacterIterator(text, textLength);
+      if(iter == 0) {
+          *status = U_MEMORY_ALLOCATION_ERROR;
+          delete result;
+          return 0;
+      }
+      result->adoptText(iter);
  }
-  result->adoptText(iter);

  return (UBreakIterator*)result;
 }
@ -186,6 +188,19 @@ ubrk_setText(UBreakIterator* bi,
  }
 }

+U_DRAFT void U_EXPORT2
+ubrk_setUText(UBreakIterator *bi,
+             UText          *text,
+             UErrorCode     *status)
+{
+    BreakIterator *brit = (BreakIterator *)bi;
+    brit->setText(text, *status);
+}
+
+
+
+
+
 U_CAPI int32_t U_EXPORT2
 ubrk_current(const UBreakIterator *bi)
 {
@ -273,8 +288,8 @@ ubrk_getRuleStatusVec(UBreakIterator *bi, int32_t *fillInVec, int32_t capacity,


 U_CAPI const char* U_EXPORT2
-ubrk_getLocaleByType(const UBreakIterator *bi, 
-                     ULocDataLocaleType type, 
+ubrk_getLocaleByType(const UBreakIterator *bi,
+                     ULocDataLocaleType type,
                     UErrorCode* status)
 {
    if (bi == NULL) {
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@ -261,6 +261,23 @@ public:
     */
    virtual const CharacterIterator& getText(void) const = 0;

+
+    /**
+      *  Get a UText for the text being analyzed.
+      *  The returned UText is a shallow clone of the UText used internally
+      *  by the break iterator implementation.  It can safely be used to
+      *  access the text without impacting any break iterator operations,
+      *  but the underlying text itself must not be altered.
+      *
+      * @param fillIn A UText to be filled in.  If NULL, a new UText will be
+      *           allocated to hold the result.
+      * @param status  Receives any error codes.
+      * @return   The current UText for this break iterator.  If an input
+      *           UText was provided, it will always be returned.
+      * @draft ICU 3.4
+      */
+     virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
+
    /**
     * Change the text over which this operates. The text boundary is
     * reset to the start.
@ -270,12 +287,19 @@ public:
    virtual void  setText(const UnicodeString &text) = 0;

    /**
-     * Change the text over which this operates. The boundary iteration position is
-     * reset to the start.
+     * Reset the break iterator to operate over the text represented by 
+     * the UText.  The iterator position is reset to the start.
+     *
+     * This function makes a shallow clone of the supplied UText.  This means
+     * that the caller is free to immediately close or otherwise reuse the
+     * UText that was passed as a parameter, but that the underlying text itself
+     * must not be altered while being referenced by the break iterator.
+     *
     * @param text The UText used to change the text.
-     * @stable ICU 2.0
+     * @param status  Receives any error codes.
+     * @draft ICU 3.4
     */
-    //virtual void  setText(UText &text) = 0;
+    virtual void  setText(UText *text, UErrorCode &status) = 0;

    /**
     * Change the text over which this operates. The text boundary is
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -17,10 +17,10 @@
 #include "unicode/utypes.h"

 /**
- * \file 
+ * \file
 * \brief C++ API: Rule Based Break Iterator
 */
- 
+
 #if !UCONFIG_NO_BREAK_ITERATION

 #include "unicode/brkiter.h"
@ -242,6 +242,22 @@ public:
    virtual const CharacterIterator& getText(void) const;


+    /**
+      *  Get a UText for the text being analyzed.
+      *  The returned UText is a shallow clone of the UText used internally
+      *  by the break iterator implementation.  It can safely be used to
+      *  access the text without impacting any break iterator operations,
+      *  but the underlying text itself must not be altered.
+      *
+      * @param fillIn A UText to be filled in.  If NULL, a new UText will be
+      *           allocated to hold the result.
+      * @param status  Receives any error codes.
+      * @return   The current UText for this break iterator.  If an input
+      *           UText was provided, it will always be returned.
+      * @draft ICU 3.4
+      */
+     virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
+
    /**
     * Set the iterator to analyze a new piece of text.  This function resets
     * the current iteration position to the beginning of the text.
@ -259,6 +275,21 @@ public:
     */
    virtual void setText(const UnicodeString& newText);

+    /**
+     * Reset the break iterator to operate over the text represented by
+     * the UText.  The iterator position is reset to the start.
+     *
+     * This function makes a shallow clone of the supplied UText.  This means
+     * that the caller is free to immediately close or otherwise reuse the
+     * UText that was passed as a parameter, but that the underlying text itself
+     * must not be altered while being referenced by the break iterator.
+     *
+     * @param text    The UText used to change the text.
+     * @param status  Receives any error codes.
+     * @draft ICU 3.4
+     */
+    virtual void  setText(UText *text, UErrorCode &status);
+
    /**
     * Sets the current iteration position to the beginning of the text.
     * (i.e., the CharacterIterator's starting offset).
--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@ -8,6 +8,7 @@

 #include "unicode/utypes.h"
 #include "unicode/uloc.h"
+#include "unicode/utext.h"

 /**
 * A text-break iterator.
@ -392,6 +393,21 @@ ubrk_setText(UBreakIterator* bi,
             int32_t         textLength,
             UErrorCode*     status);

+
+/**
+ * Sets an existing iterator to point to a new piece of text
+ * @param bi The iterator to use
+ * @param text The text to be set
+ * @param status The error code
+ * @draft ICU 3.4
+ */
+U_DRAFT void U_EXPORT2
+ubrk_setUText(UBreakIterator* bi,
+             UText*          text,
+             UErrorCode*     status);
+
+
+
 /**
 * Determine the most recently-returned text boundary.
 *
--- a/icu4c/source/common/unicode/utext.h
+++ b/icu4c/source/common/unicode/utext.h
@ -329,7 +329,7 @@ utext_isLengthExpensive(const UText *ut);
 *
 * This function is roughly equivalent to the the sequence
 *    utext_setIndex(index);
- *    utext_current();
+ *    utext_current32();
 * (There is a difference if the index is out of bounds by being less than zero)
 * 
 * @param ut the text to be accessed
@ -354,7 +354,7 @@ utext_char32At(UText *ut, int32_t nativeIndex);
 * @draft ICU 3.4
 */
 U_DRAFT UChar32 U_EXPORT2
-utext_current(UText *ut);
+utext_current32(UText *ut);


 /**
@ -750,32 +750,32 @@ enum {
     * For example, byte indexes into UTF-8 text or UTF-32 indexes into UTF-32 text.
     * @draft ICU 3.4
     */
-    UTEXT_PROVIDER_NON_UTF16_INDEXES,
+    UTEXT_PROVIDER_NON_UTF16_INDEXES = 0,
    /**
     * It is potentially time consuming for the provider to determine the length of the text.
     * @draft ICU 3.4
     */
-    UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE,
+    UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1,
    /**
     * Text chunks remain valid and usable until the text object is modified or
     * deleted, not just until the next time the access() function is called
     * (which is the default).
     * @draft ICU 3.4
     */
-    UTEXT_PROVIDER_STABLE_CHUNKS,
+    UTEXT_PROVIDER_STABLE_CHUNKS = 2,
    /**
     * The provider supports modifying the text via the replace() and copy()
     * functions.
     * @see Replaceable
     * @draft ICU 3.4
     */
-    UTEXT_PROVIDER_WRITABLE,
+    UTEXT_PROVIDER_WRITABLE = 3,
    /**
     * There is meta data associated with the text.
     * @see Replaceable::hasMetaData()
     * @draft ICU 3.4
     */
-    UTEXT_PROVIDER_HAS_META_DATA
+    UTEXT_PROVIDER_HAS_META_DATA = 4
 };

 /**
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@ -101,7 +101,7 @@ utext_setIndex(UText *ut, int32_t index) {
        if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
            UChar c = ut->chunk.contents[ut->chunk.offset];
            if (U16_TRAIL(c)) {
-                utext_current(ut);  // force index to the start of the curent code point.
+                utext_current32(ut);  // force index to the start of the current code point.
            }
        }
    }
@ -111,7 +111,7 @@ utext_setIndex(UText *ut, int32_t index) {

  
 U_DRAFT UChar32 U_EXPORT2
-utext_current(UText *ut) {
+utext_current32(UText *ut) {
    UChar32  c = U_SENTINEL;
    if (ut->chunk.offset < ut->chunk.length) {
        c = ut->chunk.contents[ut->chunk.offset];
@ -161,7 +161,7 @@ utext_next32(UText *ut) {
    if (U16_IS_SURROGATE(c)) {
        // looking at a surrogate.  Could be unpaired, need to be careful.
        // Speed doesn't matter, will be very rare.
-        c =  utext_current(ut);
+        c =  utext_current32(ut);
        if (U_IS_SUPPLEMENTARY(c)) {
            offset++;
        }
@ -192,7 +192,7 @@ utext_previous32(UText *ut) {
    if (U16_IS_SURROGATE(c)) {
        // Note that utext_current() will move the chunk offset to the lead surrogate
        // if we come in referring to trail half of a surrogate pair.
-        c =  utext_current(ut);
+        c =  utext_current32(ut);
    } 

 prev32_return:
@ -224,7 +224,7 @@ utext_next32From(UText *ut, int32_t index) {
        // Surrogate code unit.  Could be pointing at either half of a pair, or at
        //   an unpaired surrogate.  Let utext_current() do the work.  Speed doesn't matter.
        chunk->offset = offset;
-        c = utext_current(ut);  
+        c = utext_current32(ut);  
        if (U_IS_SUPPLEMENTARY(c)) {
            offset++;
        }
@ -257,8 +257,8 @@ utext_previous32From(UText *ut, int32_t index) {
    c = chunk->contents[offset];
    chunk->offset = offset;
    if (U16_IS_SURROGATE(c)) {
-        c = utext_current(ut);  // get supplementary char if not unpaired surrogate,
-                                //  and adjust offset to start.
+        c = utext_current32(ut);  // get supplementary char if not unpaired surrogate,
+                                  //  and adjust offset to start.
    }
 prev32return:
    return c;
@ -911,7 +911,6 @@ U_CDECL_END
 //
 //------------------------------------------------------------------------------

-#if 0 // initially commented out to reduce testing

 /*
 * TODO: use a flag in RepText to support readonly strings?
@ -922,124 +921,159 @@ U_CDECL_END
 // to allow for possible trimming for code point boundaries
 enum { REP_TEXT_CHUNK_SIZE=10 };

-struct RepText : public UText {
-    /* chunk UChars */
-    UChar s[REP_TEXT_CHUNK_SIZE];
+struct ReplExtra {
+    /*
+     * Chunk UChars.
+     * +1 to simplify filling with surrogate pair at the end.
+     */
+    UChar s[REP_TEXT_CHUNK_SIZE+1];
 };

+
 U_CDECL_BEGIN

 static UText * U_CALLCONV
-repTextClone(const UText *t) {
-    RepText *t2=(RepText *)uprv_malloc(sizeof(RepText));
-    if(t2!=NULL) {
-        *t2=*(const RepText *)t;
-        t2->context=((const Replaceable *)t->context)->clone();
-        if(t2->context==NULL) {
-            uprv_free(t2);
-            t2=NULL;
-        }
+repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
+    // First do a generic shallow clone.  Does everything needed for the UText struct itself.
+    dest = noopTextClone(dest, src, deep, status);
+
+    if (deep && U_SUCCESS(*status)) {
+        const Replaceable *replSrc = (const Replaceable *)src->context;
+        dest->context = replSrc->clone();
    }
-    return t2;
+    return dest;
 }

-static int32_t U_CALLCONV
-repTextGetProperties(UText *t) {
-    int32_t props=I32_FLAG(UTEXT_PROVIDER_WRITABLE);
-    if(((const Replaceable *)((const RepText *)t)->context)->hasMetaData()) {
-        props|=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
-    }
-    return props;
-}
+

 static int32_t U_CALLCONV
-repTextLength(UText *t) {
-    return ((const Replaceable *)((const RepText *)t)->context)->length();
+repTextLength(UText *ut) {
+    const Replaceable *replSrc = (const Replaceable *)ut->context;
+    int32_t  len = replSrc->length();
+    return len;
 }

-static int32_t U_CALLCONV
-repTextAccess(UText *t, int32_t index, UBool forward, UTextChunk *chunk) {
-    RepText *rt=(RepText *)t;
-    const Replaceable *rep=(const Replaceable *)rt->context;
-    int32_t start, limit, length=rep->length();
-    int32_t chunkStart, chunkLength, chunkOffset;
+
+static UBool U_CALLCONV
+repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
+    const Replaceable *rep=(const Replaceable *)ut->context;
+    int32_t start;          // index of the start of the chunk to be loaded
+    int32_t limit;          // index of the end+1 of the chunk to be loaded.
+    int32_t length=rep->length();   // Full length of the input text (bigger than a chunk)
+

    /*
     * Compute start/limit boundaries around index, for a segment of text
     * to be extracted.
-     * The segment will be trimmed to not include halves of surrogate pairs.
+     * To allow for the possibility that our user gave an index to the trailing
+     * half of a surrogate pair, we must request one extra preceding UChar when
+     * going in the forward direction.  This will ensure that the buffer has the
+     * entire code point at the specified index.
     */
    if(forward) {
-        if(length<=index) {
-            return -1;
+
+        if (index>=ut->chunk.nativeStart && index<ut->chunk.nativeLimit) {
+            // Buffer already contains the requested position.
+            ut->chunk.offset = index - ut->chunk.nativeStart;
+            return TRUE;
        }
-        limit=index+REP_TEXT_CHUNK_SIZE-1;
-        if(limit>length) {
-            limit=length;
+        if (index>=length && ut->chunk.nativeLimit==length) {
+            // Request for end of string, and buffer already extends up to it.
+            // Can't get the data, but don't change the buffer.
+            ut->chunk.offset = length - ut->chunk.nativeStart;
+            return FALSE;
+        }
+
+        if (index<0) {
+            index = 0;
+        }
+        ut->chunk.nativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
+        // Going forward, so we want to have the buffer with stuff at and beyond
+        //   the requested index.  The -1 gets us one UChar (code unit) before the
+        //   requested index also, to handle the case of the index being on
+        //   a trail surrogate of a surrogate pair.
+        if(ut->chunk.nativeLimit > length) {
+            ut->chunk.nativeLimit = length;
+        }
+        // unless buffer ran off end, start is index-1.
+        ut->chunk.nativeStart = ut->chunk.nativeLimit - REP_TEXT_CHUNK_SIZE;   
+        if(ut->chunk.nativeStart < 0) {
+            ut->chunk.nativeStart = 0;
+        }
+    } else {
+        // Reverse iteration.  Fill buffer with data preceding the requested index.
+        if(index<0) {
+            index = 0;
+        }
+        if (index>ut->chunk.nativeStart && index<=ut->chunk.nativeLimit) {
+            // Requested position already in buffer.
+            ut->chunk.offset = index - ut->chunk.nativeStart;
+            return TRUE;
+        }
+        if (index==0 && ut->chunk.nativeStart==0) {
+            // Request for start, buffer already begins at start.
+            //  No data, but keep the buffer as is.
+            ut->chunk.offset = 0;
+            return FALSE;
+        }
+        limit = index;
+        if (limit>length) {
+            limit = length;
        }
        start=limit-REP_TEXT_CHUNK_SIZE;
        if(start<0) {
            start=0;
        }
-    } else {
-        if(index<0) {
-            return -1;
-        }
-        start=index-REP_TEXT_CHUNK_SIZE+1;
-        if(start<0) {
-            start=0;
-        }
-        limit=start+REP_TEXT_CHUNK_SIZE;
-        if(length<limit) {
-            limit=length;
-        }
    }
-    UnicodeString buffer(rt->s, 0, REP_TEXT_CHUNK_SIZE); // writable alias
-    rep->extractBetween(start, limit, buffer);
+    if (!forward) { ut->chunk.nativeStart = start; ut->chunk.nativeLimit = limit; }  // reverse branch filled only the locals
+    ReplExtra *ex = (ReplExtra *)ut->pExtra;
+    UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);  // writable alias to the chunk buffer
+    rep->extractBetween(ut->chunk.nativeStart, ut->chunk.nativeLimit, buffer);

-    chunkStart=0;
-    chunkLength=limit-start;
-    chunkOffset=index-start;
+    ut->chunk.contents = ex->s;
+    ut->chunk.length    = ut->chunk.nativeLimit - ut->chunk.nativeStart;
+    ut->chunk.offset    = index - ut->chunk.nativeStart;

-    // trim contents for code point boundaries
-    if(0<start && U16_IS_TRAIL(rt->s[chunkStart])) {
-        ++chunkStart;
-        --chunkLength;
-        ++start;
-    }
-    if(limit<length && U16_IS_LEAD(rt->s[chunkStart+chunkLength-1])) {
-        --chunkLength;
-        --limit;
+    // Surrogate pairs from the input text must not span chunk boundaries.
+    // If end of chunk could be the start of a surrogate, trim it off.
+    if (ut->chunk.nativeLimit < length &&
+        U16_IS_LEAD(ex->s[ut->chunk.length-1])) {
+        ut->chunk.length--;
+        ut->chunk.nativeLimit--;    // keep nativeLimit-nativeStart equal to length
+    }
+    // if the first UChar in the chunk could be the trailing half of a surrogate pair,
+    // trim it off.
+    if(ut->chunk.nativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
+        ++(ut->chunk.contents);
+        ++(ut->chunk.nativeStart);  // contents[0] must stay at native index nativeStart
+        --(ut->chunk.length);
+        --(ut->chunk.offset);
    }

    // adjust the index/chunkOffset to a code point boundary
-    U16_SET_CP_START(rt->s, chunkStart, chunkOffset);
+    U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset);

-    chunk->contents=rt->s+chunkStart;
-    chunk->length=chunkLength;
-    chunk->start=start;
-    chunk->limit=limit;
-    chunk->nonUTF16Indexes=FALSE;
-    return chunkOffset; // chunkOffset corresponding to index
+    return TRUE; 
 }

+
+
 static int32_t U_CALLCONV
-repTextExtract(UText *t,
+repTextExtract(UText *ut,
               int32_t start, int32_t limit,
               UChar *dest, int32_t destCapacity,
-               UErrorCode *pErrorCode) {
-    RepText *rt=(RepText *)t;
-    const Replaceable *rep=(const Replaceable *)rt->context;
+               UErrorCode *status) {
+    const Replaceable *rep=(const Replaceable *)ut->context;
    int32_t length=rep->length();

-    if(U_FAILURE(*pErrorCode)) {
+    if(U_FAILURE(*status)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        *status=U_ILLEGAL_ARGUMENT_ERROR;
    }
    if(start<0 || start>limit || length<limit) {
-        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    length=limit-start;
@ -1048,28 +1082,27 @@ repTextExtract(UText *t,
    }
    UnicodeString buffer(dest, 0, destCapacity); // writable alias
    rep->extractBetween(start, limit, buffer);
-    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
+    return u_terminateUChars(dest, destCapacity, length, status);
 }

 static int32_t U_CALLCONV
-repTextReplace(UText *t,
+repTextReplace(UText *ut,
               int32_t start, int32_t limit,
               const UChar *src, int32_t length,
-               UTextChunk *chunk,
-               UErrorCode *pErrorCode) {
-    RepText *rt=(RepText *)t;
-    Replaceable *rep=(Replaceable *)rt->context;
+               UErrorCode *status) {
+    Replaceable *rep=(Replaceable *)ut->context;
    int32_t oldLength;

-    if(U_FAILURE(*pErrorCode)) {
+    if(U_FAILURE(*status)) {
        return 0;
    }
    if(src==NULL && length!=0) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        *status=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
    }
    oldLength=rep->length(); // will subtract from new length
    if(start<0 || start>limit || oldLength<limit) {
-        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
    // prepare
@ -1082,24 +1115,22 @@ repTextReplace(UText *t,
 }

 static void U_CALLCONV
-repTextCopy(UText *t,
+repTextCopy(UText *ut,
            int32_t start, int32_t limit,
            int32_t destIndex,
            UBool move,
-            UTextChunk *chunk,
-            UErrorCode *pErrorCode) {
-    RepText *rt=(RepText *)t;
-    Replaceable *rep=(Replaceable *)rt->context;
+            UErrorCode *status) {
+    Replaceable *rep=(Replaceable *)ut->context;
    int32_t length=rep->length();

-    if(U_FAILURE(*pErrorCode)) {
+    if(U_FAILURE(*status)) {
        return;
    }
    if( start<0 || start>limit || length<limit ||
        destIndex<0 || length<destIndex ||
        (start<destIndex && destIndex<limit)
    ) {
-        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }
    if(move) {
@ -1118,61 +1149,37 @@ repTextCopy(UText *t,
    // never invalidate the chunk because we have a copy of the characters
 }

-static const UText repText={
-    NULL, NULL, NULL, NULL,
-    (int32_t)sizeof(UText), 0, 0, 0,
-    repTextClone,
-    repTextGetProperties,
-    repTextLength,
-    repTextAccess,
-    repTextExtract,
-    repTextReplace,
-    repTextCopy,
-    NULL, // mapOffsetToNative
-    NULL  // mapIndexToUTF16
-};
+

 U_DRAFT UText * U_EXPORT2
-utext_openReplaceable(Replaceable *rep, UErrorCode *pErrorCode) {
-    if(U_FAILURE(*pErrorCode)) {
+utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status) {
+    if(U_FAILURE(*status)) {
        return NULL;
    }
    if(rep==NULL) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
-    RepText *rt=(RepText *)uprv_malloc(sizeof(RepText));
-    if(rt==NULL) {
-        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
-        return NULL;
+    ut = utext_setup(ut, sizeof(ReplExtra), status);
+    if(U_FAILURE(*status)) { return ut; }   // setup reported failure: do not fill in ut (it may be NULL)
+    ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
+    if(rep->hasMetaData()) {
+        ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
    }
-    *((UText *)rt)=repText;
-    rt->context=rep;
-    return rt;
+
+    ut->clone      = repTextClone;   // deep clones must also clone the Replaceable
+    ut->length     = repTextLength;
+    ut->access     = repTextAccess;
+    ut->extract    = repTextExtract;
+    ut->replace    = repTextReplace;
+    ut->copy       = repTextCopy;
+
+    ut->context=rep;
+    return ut;
 }

-U_DRAFT void U_EXPORT2
-utext_closeReplaceable(UText *t) {
-    if(t!=NULL) {
-        uprv_free((RepText *)t);
-    }
-}
-
-U_DRAFT void U_EXPORT2
-utext_resetReplaceable(UText *t, Replaceable *rep, UErrorCode *pErrorCode) {
-    if(U_FAILURE(*pErrorCode)) {
-        return;
-    }
-    if(rep==NULL) {
-        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
-        return;
-    }
-    RepText *rt=(RepText *)t;
-    rt->context=rep;
-}
 U_CDECL_END

-#endif



--- a/icu4c/source/test/cintltst/utexttst.c
+++ b/icu4c/source/test/cintltst/utexttst.c
@ -128,17 +128,17 @@ static void TestAPI(void) {
        c = utext_char32At(uta, 0);
        TEST_ASSERT(c==uString[0]);
        
-        c = utext_current(uta);
+        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32(uta);
        TEST_ASSERT(c==uString[0]);
-        c = utext_current(uta);
+        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = utext_previous32(uta);
        TEST_ASSERT(c==uString[0]);
-        c = utext_current(uta);
+        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32From(uta, 1);
@ -170,7 +170,7 @@ static void TestAPI(void) {
        utext_setIndex(uta, 0);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c==uString[0]);
-        c = utext_current(uta);
+        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = UTEXT_PREVIOUS32(uta);