ICU-3944 text access, tests and fixes

X-SVN-Rev: 18190
2025-04-13 08:53:20 +00:00 · 2005-07-11 07:40:18 +00:00 · 2005-07-11 07:40:18 +00:00 · 6edb932e50
commit 6edb932e50
parent 6bf5e3f373
2 changed files with 106 additions and 16 deletions
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@@ -239,7 +239,6 @@ U_DRAFT UChar32 U_EXPORT2
 utext_previous32From(UText *ut, int32_t index) {
    UTextChunk *chunk = &ut->chunk;
    UChar32     c     = U_SENTINEL;
-    UChar32     startingChar;

    if(index<=chunk->nativeStart || index>chunk->nativeLimit) {
        // Requested native index is outside of the current chunk.
@@ -250,7 +249,15 @@ utext_previous32From(UText *ut, int32_t index) {
    } else if(chunk->nonUTF16Indexes) {
        chunk->offset=ut->mapNativeIndexToUTF16(ut, index);
    } else {
+        // This chunk uses UTF-16 indexing.  Index into it.
        chunk->offset = index - chunk->nativeStart;
+        // put offset onto a code point boundary if it isn't there already.
+        if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
+            c = chunk->contents[chunk->offset];
+            if (U16_TRAIL(c)) {
+                utext_current32(ut);  // force index to the start of the current code point.
+            }
+        }
    }

    if (chunk->offset<=0) {
@@ -258,16 +265,13 @@ utext_previous32From(UText *ut, int32_t index) {
        goto prev32return;
    }

-    // Do the operation assuming that there are no surrogates involved, either
-    // at the starting position or at  the previous position.  Fast, common case.
-    startingChar = chunk->contents[chunk->offset];
-    (chunk->offset)--;
+    // Do the operation assuming that there are no surrogates involved.  Fast, common case.
+    chunk->offset--;
    c = chunk->contents[chunk->offset];

-    // Check for surrogates, do the operation over if there are any.
-    if (U16_IS_SURROGATE(startingChar) || U16_IS_SURROGATE(c)) {
-        utext_setNativeIndex(ut, index);  // setIndex() handles case of initial index on a trail surrogate
-        c = utext_previous32(ut);         // previous32() handles case of previous char being a supplementary.
+    // Check for the char being a surrogate, get the whole char if it is.
+    if (U16_IS_SURROGATE(c)) {
+        c =  utext_current32(ut);
    }

 prev32return:
@@ -1104,6 +1108,14 @@ repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk* /* chunk*/ )
    const Replaceable *rep=(const Replaceable *)ut->context;
    int32_t length=rep->length();   // Full length of the input text (bigger than a chunk)

+    // clip the requested index to the limits of the text.
+    if (index<0) {
+        index = 0;
+    }
+    if (index>length) {
+        index = length;
+    }
+

    /*
     * Compute start/limit boundaries around index, for a segment of text
@@ -1127,9 +1139,6 @@ repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk* /* chunk*/ )
            return FALSE;
        }

-        if (index<0) {
-            index = 0;
-        }
        ut->chunk.nativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
        // Going forward, so we want to have the buffer with stuff at and beyond
        //   the requested index.  The -1 gets us one code point before the
@@ -1145,9 +1154,6 @@ repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk* /* chunk*/ )
        }
    } else {
        // Reverse iteration.  Fill buffer with data preceding the requested index.
-        if(index<0) {
-            index = 0;
-        }
        if (index>ut->chunk.nativeStart && index<=ut->chunk.nativeLimit) {
            // Requested position already in buffer.
            ut->chunk.offset = index - ut->chunk.nativeStart;
@@ -1229,10 +1235,27 @@ repTextExtract(UText *ut,
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
    }
-    if(start<0 || start>limit || length<limit) {
+    if(start<0 || start>limit) {
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
+    if (start>length) {
+        start=length;
+    }
+    if (limit>length) {
+        limit=length;
+    }
+
+    // Adjust start and limit if they point to the trail half of a surrogate pair.
+    if (start<length && U16_IS_TRAIL(rep->charAt(start)) &&
+        U_IS_SUPPLEMENTARY(rep->char32At(start))){
+            start--;
+    }
+    if (limit<length && U16_IS_TRAIL(rep->charAt(limit)) &&
+        U_IS_SUPPLEMENTARY(rep->char32At(limit))){
+            limit--;
+    }
+
    length=limit-start;
    if(length>destCapacity) {
        limit = start + destCapacity;
--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@@ -1024,6 +1024,73 @@ void UTextTest::ErrorTest()
        utext_close(ut);
    }

+    {    //  Similar test, with UText over Replaceable
+         //  TODO:  merge the common parts of these tests.
+        
+        UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000");
+        int32_t startMap[]  ={ 0,     1,   1,    3,     4,  4,     6,  6};
+        int32_t nextMap[]  = { 1,     3,   3,    4,     6,  6,     6,  6};
+        int32_t prevMap[]  = { 0,     0,   0,    1,     3,  3,     4,  4};
+        UChar32  c32Map[] =  {0x1000, 0x11000, 0x11000, 0x2000,  0x22000, 0x22000, -1, -1}; 
+        UChar32  pr32Map[] = {    -1, 0x1000,  0x1000,  0x11000, 0x2000,  0x2000,   0x22000,   0x22000}; 
+        int32_t  exLen[] =   {   1,  0,   2,  1,  0,  2,  0,  0,};
+
+        u16str = u16str.unescape();
+        UErrorCode status = U_ZERO_ERROR;
+        UText *ut = utext_openReplaceable(NULL, &u16str, &status);
+        TEST_SUCCESS(status);
+
+        int32_t startMapLimit = sizeof(startMap) / sizeof(int32_t);
+        int i;
+        for (i=0; i<startMapLimit; i++) {
+            utext_setNativeIndex(ut, i);
+            int32_t cpIndex = utext_getNativeIndex(ut);
+            TEST_ASSERT(cpIndex == startMap[i]);
+        }
+
+        // Check char32At
+        for (i=0; i<startMapLimit; i++) {
+            UChar32 c32 = utext_char32At(ut, i);
+            TEST_ASSERT(c32 == c32Map[i]);
+            int32_t cpIndex = utext_getNativeIndex(ut);
+            TEST_ASSERT(cpIndex == startMap[i]);
+        }
+
+        // Check utext_next32From
+        for (i=0; i<startMapLimit; i++) {
+            UChar32 c32 = utext_next32From(ut, i);
+            TEST_ASSERT(c32 == c32Map[i]);
+            int32_t cpIndex = utext_getNativeIndex(ut);
+            TEST_ASSERT(cpIndex == nextMap[i]);
+        }
+        
+        // check utext_previous32From
+        for (i=0; i<startMapLimit; i++) {
+            UChar32 c32 = utext_previous32From(ut, i);
+            TEST_ASSERT(c32 == pr32Map[i]);
+            int32_t cpIndex = utext_getNativeIndex(ut);
+            TEST_ASSERT(cpIndex == prevMap[i]);
+        }
+
+        // check Extract
+        //   Extract from i to i+1, which may be zero or one code points,
+        //     depending on whether the indices straddle a cp boundary.
+        for (i=0; i<startMapLimit; i++) {
+            UChar buf[3];
+            status = U_ZERO_ERROR;
+            int32_t  extractedLen = utext_extract(ut, i, i+1, buf, 3, &status);
+            TEST_SUCCESS(status);
+            TEST_ASSERT(extractedLen == exLen[i]);
+            if (extractedLen > 0) {
+                UChar32  c32;
+                U16_GET(buf, 0, 0, extractedLen, c32);
+                TEST_ASSERT(c32 == c32Map[i]);
+            }
+        }
+
+        utext_close(ut);
+    }
+

 }