diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp index 3234ac57cbf..d9a1d09fa99 100644 --- a/icu4c/source/common/utext.cpp +++ b/icu4c/source/common/utext.cpp @@ -239,7 +239,6 @@ U_DRAFT UChar32 U_EXPORT2 utext_previous32From(UText *ut, int32_t index) { UTextChunk *chunk = &ut->chunk; UChar32 c = U_SENTINEL; - UChar32 startingChar; if(index<=chunk->nativeStart || index>chunk->nativeLimit) { // Requested native index is outside of the current chunk. @@ -250,7 +249,15 @@ utext_previous32From(UText *ut, int32_t index) { } else if(chunk->nonUTF16Indexes) { chunk->offset=ut->mapNativeIndexToUTF16(ut, index); } else { + // This chunk uses UTF-16 indexing. Index into it. chunk->offset = index - chunk->nativeStart; + // put offset onto a code point boundary if it isn't there already. + if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) { + c = chunk->contents[chunk->offset]; + if (U16_TRAIL(c)) { + utext_current32(ut); // force index to the start of the current code point. + } + } } if (chunk->offset<=0) { @@ -258,16 +265,13 @@ utext_previous32From(UText *ut, int32_t index) { goto prev32return; } - // Do the operation assuming that there are no surrogates involved, either - // at the starting position or at the previous position. Fast, common case. - startingChar = chunk->contents[chunk->offset]; - (chunk->offset)--; + // Do the operation assuming that there are no surrogates involved. Fast, common case. + chunk->offset--; c = chunk->contents[chunk->offset]; - // Check for surrogates, do the operation over if there are any. - if (U16_IS_SURROGATE(startingChar) || U16_IS_SURROGATE(c)) { - utext_setNativeIndex(ut, index); // setIndex() handles case of initial index on a trail surrogate - c = utext_previous32(ut); // previous32() handles case of previous char being a supplementary. + // Check for the char being a surrogate, get the whole char if it is. 
+ if (U16_IS_SURROGATE(c)) { + c = utext_current32(ut); } prev32return: @@ -1104,6 +1108,14 @@ repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk* /* chunk*/ ) const Replaceable *rep=(const Replaceable *)ut->context; int32_t length=rep->length(); // Full length of the input text (bigger than a chunk) + // clip the requested index to the limits of the text. + if (index<0) { + index = 0; + } + if (index>length) { + index = length; + } + /* * Compute start/limit boundaries around index, for a segment of text @@ -1127,9 +1139,6 @@ repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk* /* chunk*/ ) return FALSE; } - if (index<0) { - index = 0; - } ut->chunk.nativeLimit = index + REP_TEXT_CHUNK_SIZE - 1; // Going forward, so we want to have the buffer with stuff at and beyond // the requested index. The -1 gets us one code point before the @@ -1145,9 +1154,6 @@ repTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk* /* chunk*/ ) } } else { // Reverse iteration. Fill buffer with data preceding the requested index. - if(index<0) { - index = 0; - } if (index>ut->chunk.nativeStart && index<=ut->chunk.nativeLimit) { // Requested position already in buffer. 
ut->chunk.offset = index - ut->chunk.nativeStart; @@ -1229,10 +1235,27 @@ repTextExtract(UText *ut, if(destCapacity<0 || (dest==NULL && destCapacity>0)) { *status=U_ILLEGAL_ARGUMENT_ERROR; } - if(start<0 || start>limit || lengthlimit) { *status=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } + if (start>length) { + start=length; + } + if (limit>length) { + limit=length; + } + + // adjust start, limit if they point to trail half of surrogates + if (startcharAt(start)) && + U_IS_SUPPLEMENTARY(rep->char32At(start))){ + start--; + } + if (limitcharAt(limit)) && + U_IS_SUPPLEMENTARY(rep->char32At(limit))){ + limit--; + } + length=limit-start; if(length>destCapacity) { limit = start + destCapacity; diff --git a/icu4c/source/test/intltest/utxttest.cpp b/icu4c/source/test/intltest/utxttest.cpp index 49035fa7532..0e082454a6b 100644 --- a/icu4c/source/test/intltest/utxttest.cpp +++ b/icu4c/source/test/intltest/utxttest.cpp @@ -1024,6 +1024,73 @@ void UTextTest::ErrorTest() utext_close(ut); } + { // Similar test, with UText over Replaceable + // TODO: merge the common parts of these tests. + + UnicodeString u16str("\\u1000\\U00011000\\u2000\\U00022000"); + int32_t startMap[] ={ 0, 1, 1, 3, 4, 4, 6, 6}; + int32_t nextMap[] = { 1, 3, 3, 4, 6, 6, 6, 6}; + int32_t prevMap[] = { 0, 0, 0, 1, 3, 3, 4, 4}; + UChar32 c32Map[] = {0x1000, 0x11000, 0x11000, 0x2000, 0x22000, 0x22000, -1, -1}; + UChar32 pr32Map[] = { -1, 0x1000, 0x1000, 0x11000, 0x2000, 0x2000, 0x22000, 0x22000}; + int32_t exLen[] = { 1, 0, 2, 1, 0, 2, 0, 0,}; + + u16str = u16str.unescape(); + UErrorCode status = U_ZERO_ERROR; + UText *ut = utext_openReplaceable(NULL, &u16str, &status); + TEST_SUCCESS(status); + + int32_t startMapLimit = sizeof(startMap) / sizeof(int32_t); + int i; + for (i=0; i 0) { + UChar32 c32; + U16_GET(buf, 0, 0, extractedLen, c32); + TEST_ASSERT(c32 == c32Map[i]); + } + } + + utext_close(ut); + } + }