ICU-3944 Text Access, new tests, bugs fixed

X-SVN-Rev: 18106
2025-04-09 15:27:38 +00:00 · 2005-07-01 00:39:24 +00:00 · 2005-07-01 00:39:24 +00:00 · 091627dceb
commit 091627dceb
parent c4d57fd411
3 changed files with 312 additions and 22 deletions
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@ -436,6 +436,8 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
            if (spaceRequired>0) {
                ut->extraSize = extraSpace;
                ut->pExtra    = &((ExtendedUText *)ut)->extension;
+                uprv_memset(ut->pExtra, 0, extraSpace);  // Purify whines about copying untouched extra [buffer]
+                                                         //  space when cloning, so init it now.
            }
        }
    } else {
@ -467,6 +469,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
            } else {
                ut->extraSize = extraSpace;
                ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
+                uprv_memset(ut->pExtra, 0, extraSpace);
            }
        }
    }
@ -613,7 +616,10 @@ U_CDECL_END
 //     UText implementation for UTF-8 strings (read-only) 
 //
 //         Use of UText data members:
-//            context    pointer to UTF-8 string
+//              context    pointer to UTF-8 string
+//              utext.b  is the input string length (bytes).
+//              utext.p  pointer to allocated utf-8 string if owned by this utext (after a clone)
+//              utext.q  pointer to the filled part of the Map array.
 //
 //      TODO:  make creation of the index mapping array lazy.
 //             Create it for a chunk the first time the user asks for an index.
@ -638,9 +644,6 @@ struct UTF8Extra {
    int32_t map[UTF8_TEXT_CHUNK_SIZE+2];
 };

-//  utext.b  is the input string length (bytes).
-//  utext.q  pointer to the filled part of the Map array.
-//
 //     because backwards iteration fills the buffers starting at the end and
 //     working towards the front, the filled part of the buffers may not begin
 //     at the start of the available storage for the buffers.
@ -679,12 +682,12 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
            return FALSE;
        }

-        chunk->nativeStart=index;
        c=s8[index];
        if(c<=0x7f) {
            // get a run of ASCII characters.
            // Even if we don't fill the buffer, we will stop with the first
            //   non-ascii char, so that the buffer can use utf-16 indexing.
+            chunk->nativeStart=index;
            u16buf[0]=(UChar)c;
            for(i=1, ++index;
                i<UTF8_TEXT_CHUNK_SIZE && index<length && (c=s8[index])<=0x7f;
@ -696,6 +699,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
        } else {
            // get a chunk of characters starting with a non-ASCII one
            U8_SET_CP_START(s8, 0, index);  // put utf-8 index at first byte of char, if not there already.
+            chunk->nativeStart=index;
            for(i=0;  i<UTF8_TEXT_CHUNK_SIZE && index<length;  ) {
                //  i     is utf-16 index into chunk buffer.
                //  index is utf-8 index into original string
@ -724,10 +728,10 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
            return FALSE;
        }

-        chunk->nativeLimit=index;
        c=s8[index-1];
        if(c<=0x7f) {
            // get a chunk of ASCII characters.  Don't build the index map
+            chunk->nativeLimit=index;
            i=UTF8_TEXT_CHUNK_SIZE;
            do {
                u16buf[--i]=(UChar)c;
@ -739,6 +743,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
            if(index<length) {
                U8_SET_CP_START(s8, 0, index);
            }
+            chunk->nativeLimit=index;
            i=UTF8_TEXT_CHUNK_SIZE;
            map[i]=index;    // map position for char following the last one in the buffer.
            do {
@ -781,6 +786,80 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
    }
 }

+
+//
+//  This is a slightly modified copy of u_strFromUTF8,
+//     Inserts a Replacement Char rather than failing on invalid UTF-8
+//     Removes unnecessary features.
+//
+static UChar* 
+utext_strFromUTF8(UChar *dest,             
+              int32_t destCapacity,
+              int32_t *pDestLength,
+              const char* src, 
+              int32_t srcLength,        // required.  NUL terminated not supported.
+              UErrorCode *pErrorCode
+              )
+{
+
+    UChar *pDest = dest;
+    UChar *pDestLimit = dest+destCapacity;
+    UChar32 ch=0;
+    int32_t index = 0;
+    int32_t reqLength = 0;
+    uint8_t* pSrc = (uint8_t*) src;
+
+         
+    while((index < srcLength)&&(pDest<pDestLimit)){
+        ch = pSrc[index++];
+        if(ch <=0x7f){
+            *pDest++=(UChar)ch;
+        }else{
+            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
+            if(ch<0){
+                ch = 0xfffd;
+            }
+            if(ch<=0xFFFF){
+                *(pDest++)=(UChar)ch;
+            }else{
+                *(pDest++)=UTF16_LEAD(ch);
+                if(pDest<pDestLimit){
+                    *(pDest++)=UTF16_TRAIL(ch);
+                }else{
+                    reqLength++;
+                    break;
+                }
+            }
+        }
+    }
+    /* do not fill the dest buffer, just count the UChars needed */
+    while(index < srcLength){
+        ch = pSrc[index++];
+        if(ch <= 0x7f){
+            reqLength++;
+        }else{
+            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
+            if(ch<0){
+                ch = 0xfffd;
+            }
+            reqLength+=UTF_CHAR_LENGTH(ch);
+        }
+    }
+
+    reqLength+=(int32_t)(pDest - dest);
+
+    if(pDestLength){
+        *pDestLength = reqLength;
+    }
+
+    /* Terminate the buffer */
+    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
+
+    return dest;
+}
+
+
+
 static int32_t U_CALLCONV
 utf8TextExtract(UText *ut,
                int32_t start, int32_t limit,
@ -791,17 +870,23 @@ utf8TextExtract(UText *ut,
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
    }
-    if(start<0 || start>limit || ut->b<limit) {
+    if(start<0 || start>limit) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }
+    if (limit>ut->b) {
+        limit = ut->b;
+    }
+    if (start>ut->b) {
+        start = ut->b;
+    }
    int32_t destLength=0;
-    u_strFromUTF8(dest, destCapacity, &destLength,
+    utext_strFromUTF8(dest, destCapacity, &destLength,
                    (const char *)ut->context+start, limit-start,
                    pErrorCode);
    return destLength;
-    // TODO: if U_INVALID|ILLEGAL_CHAR_FOUND, extract text anyway and use SUB for illegal sequences?
 }

 // Assume nonUTF16Indexes and 0<=offset<=chunk->length
@ -823,6 +908,19 @@ utf8TextMapIndexToUTF16(UText *ut, int32_t index) {
    while(index>map[offset]) {
        ++offset;
    }
+    if (index<map[offset]) {
+        // index was to a trail byte of a multi-byte utf-8 char.
+        // The loop above advanced offset to the start of the following char, now
+        //  offset must be backed up to the start of the utf-16 char into which
+        //  the utf-8 index pointed.
+        offset--;
+        if (offset>0 && map[offset] == map[offset-1]) {
+            // index was to a utf-8 trail byte of a supplementary char.
+            //   Offset now points to the trail surrogate (one in back of the following char)
+            //   Back offset up one more time to get to the utf-16 lead surrogate.
+            offset--;
+        }
+    }
    return offset;
 }

--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@ -43,11 +43,13 @@ UTextTest::~UTextTest() {

 void
 UTextTest::runIndexedTest(int32_t index, UBool exec,
-                                      const char* &name, char* /*par*/) {
+                          const char* &name, char* /*par*/) {
    switch (index) {
        case 0: name = "TextTest";
-            if(exec) TextTest();                         break;
-        default: name = ""; break;
+            if (exec) TextTest();    break;
+        case 1: name = "ErrorTest";
+            if (exec) ErrorTest();   break;
+        default: name = "";          break;
    }
 }

@ -62,10 +64,23 @@ static uint32_t m_rand()
 }


+//
+//   TextTest()
+//
+//       Top Level function for UText testing.
+//       Specifies the strings to be tested, with the actual testing itself
+//       being carried out in another function, TestString().
+//
 void  UTextTest::TextTest() {
    int32_t i, j;

    TestString("abcd\\U00010001xyz");
+    TestString("");
+
+    // Supplementary chars at start or end
+    TestString("\\U00010001");
+    TestString("abc\\U00010001");
+    TestString("\\U00010001abc");

    // Test simple strings of lengths 1 to 60, looking for glitches at buffer boundaries
    UnicodeString s;
@ -126,14 +141,11 @@ void  UTextTest::TextTest() {
    TestString(s);
 }

-//
-//  mapping between native indexes and code points.
-//     native indexes could be utf-8, utf-16, utf32, or some code page.
-//     The general purpose UText test funciton takes an array of these as
-//     expected contents of the text being accessed.
-//
-

+//
+//  TestString()     Run a suite of UText tests on a string.
+//                   The test string is unescaped before use.
+//
 void UTextTest::TestString(const UnicodeString &s) {
    int32_t       i;
    int32_t       j;
@ -147,7 +159,7 @@ void UTextTest::TestString(const UnicodeString &s) {
    saLen = sa.length();

    //
-    // Build up the mapping between code points and UTF-16 code unit indexes.
+    // Build up a mapping between code points and UTF-16 code unit indexes.
    //
    m *cpMap = new m[sa.length() + 1];
    j = 0;
@ -161,7 +173,7 @@ void UTextTest::TestString(const UnicodeString &s) {
    cpMap[j].nativeIdx = i;   // position following the last char in utf-16 string.    


-    // UChar * test, null term
+    // UChar * test, null terminated
    status = U_ZERO_ERROR;
    UChar *buf = new UChar[saLen+1];
    sa.extract(buf, saLen+1, status);
@ -502,6 +514,11 @@ cleanupAndReturn:
    utext_close(targetUT);
 }

+//
+//  TestAccess()    Test the read only access functions on a UText.
+//                  The text is accessed in a variety of ways, and compared with
+//                  the reference UnicodeString.
+//
 void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *cpMap) {
    UErrorCode  status = U_ZERO_ERROR;
    gTestNum++;
@ -711,7 +728,11 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c

    status = U_ZERO_ERROR;
    len = utext_extract(ut, 0, utlen, NULL, 0, &status);
-    TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR)
+    if (utlen == 0) {
+        TEST_ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
+    } else {
+        TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
+    }
    TEST_ASSERT(len == expectedLen);

    status = U_ZERO_ERROR;
@ -731,6 +752,176 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c
    }

    delete buf;
+}
+
+
+
+//
+//  ErrorTest()    Check various error and edge cases.
+//
+void UTextTest::ErrorTest() 
+{
+    // Close of an uninitialized UText.  Shouldn't blow up.
+    {
+        UText  ut;  
+        memset(&ut, 0, sizeof(UText));
+        utext_close(&ut);
+        utext_close(NULL);
+    }
+
+    // Double-close of a UText.  Shouldn't blow up.  UText should still be usable.
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UText ut = UTEXT_INITIALIZER;
+        UnicodeString s("Hello, World");
+        UText *ut2 = utext_openUnicodeString(&ut, &s, &status);
+        TEST_SUCCESS(status);
+        TEST_ASSERT(ut2 == &ut);
+
+        UText *ut3 = utext_close(&ut);
+        TEST_ASSERT(ut3 == &ut);
+
+        UText *ut4 = utext_close(&ut);
+        TEST_ASSERT(ut4 == &ut);
+
+        utext_openUnicodeString(&ut, &s, &status);
+        TEST_SUCCESS(status);
+        utext_close(&ut);
+    }
+
+    // Re-use of a UText, chaining through each of the types of UText
+    //   (If it doesn't blow up, and doesn't leak, it's probably working fine)
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UText ut = UTEXT_INITIALIZER;
+        UText  *utp;
+        UnicodeString s1("Hello, World");
+        UChar s2[] = {(UChar)0x41, (UChar)0x42, (UChar)0};
+        char  *s3 = "\x66\x67\x68";
+
+        utp = utext_openUnicodeString(&ut, &s1, &status);
+        TEST_SUCCESS(status);
+        TEST_ASSERT(utp == &ut);
+
+        utp = utext_openConstUnicodeString(&ut, &s1, &status);
+        TEST_SUCCESS(status);
+        TEST_ASSERT(utp == &ut);
+
+        utp = utext_openUTF8(&ut, s3, -1, &status);
+        TEST_SUCCESS(status);
+        TEST_ASSERT(utp == &ut);
+
+        utp = utext_openUChars(&ut, s2, -1, &status);
+        TEST_SUCCESS(status);
+        TEST_ASSERT(utp == &ut);
+
+        utp = utext_close(&ut);
+        TEST_ASSERT(utp == &ut);
+
+        utp = utext_openUnicodeString(&ut, &s1, &status);
+        TEST_SUCCESS(status);
+        TEST_ASSERT(utp == &ut);
+    }
+
+    //
+    //  UTF-8 with malformed sequences.
+    //    These should come through as the Unicode replacement char, \ufffd
+    //
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UText *ut = NULL;
+        char *badUTF8 = "\x41\x81\x42\xf0\x81\x81\x43";   
+        UChar32  c;
+
+        ut = utext_openUTF8(NULL, badUTF8, -1, &status);
+        TEST_SUCCESS(status);
+        c = utext_char32At(ut, 1);
+        TEST_ASSERT(c == 0xfffd);
+        c = utext_char32At(ut, 3);
+        TEST_ASSERT(c == 0xfffd);
+        c = utext_char32At(ut, 5);
+        TEST_ASSERT(c == 0xfffd);
+        c = utext_char32At(ut, 6);
+        TEST_ASSERT(c == 0x43);
+
+        UChar buf[10];
+        int n = utext_extract(ut, 0, 9, buf, 10, &status);
+        TEST_SUCCESS(status);
+        TEST_ASSERT(n==5);
+        TEST_ASSERT(buf[1] == 0xfffd);
+        TEST_ASSERT(buf[3] == 0xfffd);
+        TEST_ASSERT(buf[2] == 0x42);
+    }
+
+
+    //
+    //  isLengthExpensive - does it make the expected transitions after
+    //                      getting the length of a nul terminated string?
+    //
+    {
+        UErrorCode status = U_ZERO_ERROR;
+        UnicodeString sa("Hello, this is a string");
+        UBool  isExpensive;
+
+        UChar sb[100];
+        memset(sb, 0x20, sizeof(sb));
+        sb[99] = 0;
+
+        UText *uta = utext_openUnicodeString(NULL, &sa, &status);
+        TEST_SUCCESS(status);
+        isExpensive = utext_isLengthExpensive(uta);
+        TEST_ASSERT(isExpensive == FALSE);
+        utext_close(uta);
+
+        UText *utb = utext_openUChars(NULL, sb, -1, &status);
+        TEST_SUCCESS(status);
+        isExpensive = utext_isLengthExpensive(utb);
+        TEST_ASSERT(isExpensive == TRUE);
+        int32_t  len = utext_nativeLength(utb);
+        TEST_ASSERT(len == 99);
+        isExpensive = utext_isLengthExpensive(utb);
+        TEST_ASSERT(isExpensive == FALSE);
+        utext_close(utb);
+    }
+
+    //
+    // get/set native index to positions not on code point boundaries.
+    //
+    {
+        char *u8str =         "\xc8\x81\xe1\x82\x83\xf1\x84\x85\x86";
+        int32_t startMap[] = {   0,  0,  2,  2,  2,  5,  5,  5,  5,  9,  9};
+
+
+        UErrorCode status = U_ZERO_ERROR;
+        UText *ut = utext_openUTF8(NULL, u8str, -1, &status);
+        TEST_SUCCESS(status);
+
+        int32_t i;
+        int32_t startMapLimit = sizeof(startMap) / sizeof(int32_t);
+        for (i=0; i<startMapLimit; i++) {
+            utext_setNativeIndex(ut, i);
+            int32_t cpIndex = utext_getNativeIndex(ut);
+            TEST_ASSERT(cpIndex == startMap[i]);
+        }
+        utext_close(ut);
+
+        //  Similar test, with utf16 instead of utf8
+        UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000");
+        int32_t start16Map[]  ={ 0,     1,   1,    3,     4,  4,     6,  6};
+        u16str = u16str.unescape();
+        status = U_ZERO_ERROR;
+        ut = utext_openUnicodeString(NULL, &u16str, &status);
+        TEST_SUCCESS(status);
+
+        startMapLimit = sizeof(start16Map) / sizeof(int32_t);
+        for (i=0; i<startMapLimit; i++) {
+            utext_setNativeIndex(ut, i);
+            int32_t cpIndex = utext_getNativeIndex(ut);
+            TEST_ASSERT(cpIndex == start16Map[i]);
+        }
+        utext_close(ut);
+    }
+

 }

--- a/icu4c/source/test/intltest/utxttest.h
+++ b/icu4c/source/test/intltest/utxttest.h
@ -29,6 +29,7 @@ public:

    void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=NULL);
    void TextTest();
+    void ErrorTest();

 private:
    struct m {                              // Map between native indices & code points.