ICU-4873 UText with surrogates spanning chunk boundaries.

X-SVN-Rev: 19498
This commit is contained in:
Andy Heninger 2006-03-31 02:11:39 +00:00
parent 43b2723119
commit f47dea2b53
2 changed files with 256 additions and 57 deletions

View file

@ -36,26 +36,41 @@ utext_access(UText *ut, int64_t index, UBool forward) {
U_DRAFT UBool U_EXPORT2
utext_moveIndex32(UText *ut, int32_t delta) {
UBool retval = TRUE;
if(delta>0) {
UChar32 c;
if (delta > 0) {
do {
if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
retval = FALSE;
break;
return FALSE;
}
c = ut->chunkContents[ut->chunkOffset];
if (U16_IS_SURROGATE(c)) {
c = utext_next32(ut);
if (c == U_SENTINEL) {
return FALSE;
}
} else {
ut->chunkOffset++;
}
U16_FWD_1(ut->chunkContents, ut->chunkOffset, ut->chunkLength);
} while(--delta>0);
} else if (delta<0) {
do {
if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
retval = FALSE;
break;
return FALSE;
}
c = ut->chunkContents[ut->chunkOffset-1];
if (U16_IS_SURROGATE(c)) {
c = utext_previous32(ut);
if (c == U_SENTINEL) {
return FALSE;
}
} else {
ut->chunkOffset--;
}
U16_BACK_1(ut->chunkContents, 0, ut->chunkOffset);
} while(++delta<0);
}
return retval;
return TRUE;
}
@ -85,7 +100,7 @@ utext_getNativeIndex(UText *ut) {
U_DRAFT void U_EXPORT2
utext_setNativeIndex(UText *ut, int64_t index) {
if(index<ut->chunkNativeStart || ut->chunkNativeLimit<index) {
if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
// The desired position is outside of the current chunk.
// Access the new position. Assume a forward iteration from here,
// which will also be optimum for a single random access.
@ -94,14 +109,27 @@ utext_setNativeIndex(UText *ut, int64_t index) {
} else if(ut->nonUTF16Indexes) {
ut->chunkOffset=ut->mapNativeIndexToUTF16(ut, index);
} else {
// utf-16 indexing.
ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
// Our convention is that the index must always be on a code point boundary.
// If we are somewhere in the middle of a utf-16 buffer, check that new index
// is not in the middle of a surrogate pair.
if (index>ut->chunkNativeStart && index < ut->chunkNativeLimit) {
U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset)
}
// The convention is that the index must always be on a code point boundary.
// Adjust the index position if it is in the middle of a surrogate pair.
if (ut->chunkOffset<ut->chunkLength) {
UChar c= ut->chunkContents[ut->chunkOffset];
if (UTF16_IS_TRAIL(c)) {
UChar lead = 0;
if (ut->chunkOffset==0) {
ut->access(ut, ut->chunkNativeStart, FALSE);
}
if (ut->chunkOffset>0) {
UChar lead = ut->chunkContents[ut->chunkOffset-1];
if (UTF16_IS_LEAD(lead)) {
ut->chunkOffset--;
}
}
}
}
}
@ -109,38 +137,78 @@ utext_setNativeIndex(UText *ut, int64_t index) {
//
// utext_current32. Get the UChar32 at the current position.
// As a side effect, adjust the index if the current position
// is on a trail surrogate. This feature is used internally;
// from an external view, indexes are never on trail surrogates.
// UText iteration position is always on a code point boundary,
// never on the trail half of a surrogate pair.
//
U_DRAFT UChar32 U_EXPORT2
utext_current32(UText *ut) {
UChar32 c = U_SENTINEL;
UChar32 c;
if (ut->chunkOffset==ut->chunkLength) {
// Current position is just off the end of the chunk.
// Can also happen at startup, with a zero length chunk at zero offset.
ut->access(ut, ut->chunkNativeLimit, TRUE);
}
if (ut->chunkOffset < ut->chunkLength) {
c = ut->chunkContents[ut->chunkOffset];
if (U16_IS_SURROGATE(c)) {
// looking at a surrogate. Could be unpaired, need to be careful.
U16_GET(ut->chunkContents, 0, ut->chunkOffset, ut->chunkLength, c);
U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset)
if (ut->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
// Off the end of the text.
return U_SENTINEL;
}
}
return c;
c = ut->chunkContents[ut->chunkOffset];
if (U16_IS_LEAD(c) == FALSE) {
// Normal, non-supplementary case.
return c;
}
//
// Possible supplementary char.
//
UChar32 trail = 0;
UChar32 supplementaryC = c;
if ((ut->chunkOffset+1) < ut->chunkLength) {
// The trail surrogate is in the same chunk.
trail = ut->chunkContents[ut->chunkOffset+1];
} else {
// The trail surrogate is in a different chunk.
// Because we must maintain the iteration position, we need to switch forward
// into the new chunk, get the trail surrogate, then revert the chunk back to the
// original one.
int64_t nativePosition = ut->chunkNativeLimit;
int32_t originalOffset = ut->chunkOffset;
if (ut->access(ut, nativePosition, TRUE)) {
trail = ut->chunkContents[ut->chunkOffset];
UBool r = ut->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk
U_ASSERT(r==TRUE);
ut->chunkOffset = originalOffset;
}
}
if (U16_IS_TRAIL(trail)) {
supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
}
return supplementaryC;
}
U_DRAFT UChar32 U_EXPORT2
utext_char32At(UText *ut, int64_t nativeIndex) {
UChar32 c = U_SENTINEL;
// Fast path the common case.
if (!ut->nonUTF16Indexes && nativeIndex>=ut->chunkNativeStart && nativeIndex<ut->chunkNativeLimit) {
ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
c = ut->chunkContents[ut->chunkOffset];
if (U16_IS_SURROGATE(c) == FALSE) {
return c;
}
}
utext_setNativeIndex(ut, nativeIndex);
if (nativeIndex>=0 && ut->chunkOffset < ut->chunkLength) {
if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
c = ut->chunkContents[ut->chunkOffset];
if (U16_IS_SURROGATE(c)) {
U16_GET(ut->chunkContents, 0, ut->chunkOffset, ut->chunkLength, c);
// For surrogates, let current32() deal with the complications
// of supplementaries that may span chunk boundaries.
c = utext_current32(ut);
}
}
return c;
@ -157,8 +225,33 @@ utext_next32(UText *ut) {
}
}
U16_NEXT(ut->chunkContents, ut->chunkOffset, ut->chunkLength, c);
return c;
c = ut->chunkContents[ut->chunkOffset++];
if (U16_IS_LEAD(c) == FALSE) {
// Normal case, not supplementary.
// (A trail surrogate seen here is just returned as is, as a surrogate value.
// It cannot be part of a pair.)
return c;
}
if (ut->chunkOffset >= ut->chunkLength) {
if (ut->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
// c is an unpaired lead surrogate at the end of the text.
// return it as it is.
return c;
}
}
UChar32 trail = ut->chunkContents[ut->chunkOffset];
if (U16_IS_TRAIL(trail) == FALSE) {
// c was an unpaired lead surrogate, not at the end of the text.
// return it as it is (unpaired). Iteration position is on the
// following character, possibly in the next chunk, where the
// trail surrogate would have been if it had existed.
return c;
}
UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
ut->chunkOffset++; // move iteration position over the trail surrogate.
return supplementary;
}
@ -171,8 +264,33 @@ utext_previous32(UText *ut) {
return U_SENTINEL;
}
}
U16_PREV(ut->chunkContents, 0, ut->chunkOffset, c);
return c;
ut->chunkOffset--;
c = ut->chunkContents[ut->chunkOffset];
if (U16_IS_TRAIL(c) == FALSE) {
// Normal case, not supplementary.
// (A lead surrogate seen here is just returned as is, as a surrogate value.
// It cannot be part of a pair.)
return c;
}
if (ut->chunkOffset <= 0) {
if (ut->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
// c is an unpaired trail surrogate at the start of the text.
// return it as it is.
return c;
}
}
UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
if (U16_IS_LEAD(lead) == FALSE) {
// c was an unpaired trail surrogate, not at the start of the text.
// return it as it is (unpaired). Iteration position is at c.
return c;
}
UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
ut->chunkOffset--; // move iteration position over the lead surrogate.
return supplementary;
}
@ -194,9 +312,9 @@ utext_next32From(UText *ut, int64_t index) {
c = ut->chunkContents[ut->chunkOffset++];
if (U16_IS_SURROGATE(c)) {
// Surrogate code unit. Speed doesn't matter, let plain next32() do the work.
ut->chunkOffset--; // undo the ++, above.
U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset)
// Surrogates. Many edge cases. Use other functions that already
// deal with the problems.
utext_setNativeIndex(ut, index);
c = utext_next32(ut);
}
return c;
@ -205,38 +323,49 @@ utext_next32From(UText *ut, int64_t index) {
U_DRAFT UChar32 U_EXPORT2
utext_previous32From(UText *ut, int64_t index) {
UChar32 c;
//
// Return the character preceding the specified index.
// Leave the iteration position at the start of the character that was returned.
//
UChar32 cPrev; // The character preceding cCurr, which is what we will return.
// Address the chunk containing the position preceding the incoming index.
// A tricky edge case:
// We try to test the requested native index against the chunkNativeStart to determine
// whether the character preceding the one at the index is in the current chunk.
// BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
// requested index is on something other than the first position of the first char.
//
if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
// Requested native index is outside of the current chunk.
if(!ut->access(ut, index, FALSE)) {
// no chunk available here
return U_SENTINEL;
return U_SENTINEL;
}
} else if(ut->nonUTF16Indexes) {
ut->chunkOffset=ut->mapNativeIndexToUTF16(ut, index);
if (ut->chunkOffset==0 && !ut->access(ut, index, FALSE)) {
// no chunk available here
return U_SENTINEL;
}
} else {
// This chunk uses UTF-16 indexing. Index into it.
ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
// put offset onto a code point boundary if it isn't there already.
if (index>ut->chunkNativeStart && index < ut->chunkNativeLimit) {
U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset)
}
}
if (ut->chunkOffset<=0) {
// already at the start of text. Return U_SENTINEL.
return U_SENTINEL;
}
//
// Simple case with no surrogates.
//
ut->chunkOffset--;
cPrev = ut->chunkContents[ut->chunkOffset];
U16_PREV(ut->chunkContents, 0, ut->chunkOffset, c);
if (U_IS_LEAD(c)) {
// User supplied index might have been pointing to the trail surrogate
// of a pair, in which case we need to get the whole supplemenary value.
c = utext_current32(ut);
if (U16_IS_SURROGATE(cPrev)) {
// Possible supplementary. Many edge cases.
// Let other functions do the heavy lifting.
utext_setNativeIndex(ut, index);
cPrev = utext_previous32(ut);
}
return c;
return cPrev;
}
@ -498,7 +627,7 @@ resetChunk(UText *ut, int64_t index) {
//
// invalidateChunk Reset a chunk to have no contents, so that the next call
// to access will new data to load.
// to access will cause new data to load.
// This is needed when copy/move/replace operate directly on the
// backing text, potentially putting it out of sync with the
// contents in the chunk.
@ -531,7 +660,7 @@ U_CDECL_BEGIN
//
// Clone. This is a generic copy-the-utext-by-value clone function that can be
// used as-is with some utext types, and as helper by other clones.
// used as-is with some utext types, and as a helper by other clones.
//
static UText * U_CALLCONV
shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
@ -689,6 +818,7 @@ utf8TextAccess(UText *ut, int64_t index, UBool forward) {
} else {
// Reverse Access. The chunk buffer must be filled so as to contain the
// character preceding the specified index.
U8_SET_CP_START(s8, 0, index32);
if(index32<=0) {
resetChunk(ut, 0);
return FALSE;

View file

@ -21,6 +21,8 @@
static UBool gFailed = FALSE;
static int gTestNum = 0;
// Forward decl
UText *openFragmentedUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status);
#define TEST_ASSERT(x) \
{if ((x)==FALSE) {errln("Test #%d failure in file %s at line %d\n", gTestNum, __FILE__, __LINE__);\
@ -223,6 +225,13 @@ void UTextTest::TestString(const UnicodeString &s) {
TestCMR(sa, ut, cpCount, cpMap, cpMap);
utext_close(ut);
// Fragmented UnicodeString (Chunk size of one)
//
status = U_ZERO_ERROR;
ut = openFragmentedUnicodeString(NULL, &sa, &status);
TEST_SUCCESS(status);
TestAccess(sa, ut, cpCount, cpMap);
utext_close(ut);
//
// UTF-8 test
@ -934,6 +943,7 @@ void UTextTest::ErrorTest()
// check utext_previous32From
for (i=0; i<startMapLimit; i++) {
gTestNum++;
UChar32 c32 = utext_previous32From(ut, i);
TEST_ASSERT(c32 == pr32Map[i]);
int64_t cpIndex = utext_getNativeIndex(ut);
@ -1184,3 +1194,62 @@ void UTextTest::FreezeTest() {
}
//
//  Fragmented UText
//      A UText type that works with a chunk size of 1.
//      Intended to test for edge cases.
//      Input comes from a UnicodeString.
//
//      ut->b holds the single UTF-16 code unit that makes up the current chunk.
//      The code unit is stored in both 16-bit halves of the field.
//      NOTE(review): presumably duplicated so that the UChar read through
//      chunkContents is the same regardless of byte order -- confirm.
//
//  fragTextAccess
//      Access function for the fragmented UText.
//        ut       the UText; ut->context is the backing UnicodeString.
//        index    the requested native (UTF-16) index.
//        forward  TRUE:  load the code unit AT index, with the chunk offset before it.
//                 FALSE: load the code unit PRECEDING index, with the chunk offset after it.
//      Returns TRUE if a one-unit chunk was loaded. Returns FALSE, leaving an empty
//      chunk positioned at 0 (index <= 0) or at the string length, when there is
//      no code unit in the requested direction.
//
static UBool
fragTextAccess(UText *ut, int64_t index, UBool forward) {
    const UnicodeString *us = (const UnicodeString *)ut->context;
    const int32_t length = us->length();

    // Native index of the one code unit that will form the chunk; -1 if there is none.
    int64_t unitIndex = -1;
    if (forward && index>=0 && index<length) {
        unitIndex = index;
    } else if (!forward && index>0 && index<=length) {
        unitIndex = index-1;
    }

    if (unitIndex >= 0) {
        // Pack with unsigned arithmetic: a code unit >= 0x8000 promoted to (signed) int
        // and shifted left by 16 would shift into the sign bit.
        const uint32_t unit = us->charAt((int32_t)unitIndex);
        ut->b = (int32_t)(unit | (unit<<16));
        ut->chunkOffset = forward ? 0 : 1;
        ut->chunkLength = 1;
        ut->chunkNativeStart = unitIndex;
        ut->chunkNativeLimit = unitIndex+1;
        return TRUE;
    }

    // Nothing available in the requested direction: leave an empty chunk
    // pinned to the nearer end of the text.
    const int64_t edge = (index <= 0) ? 0 : length;
    ut->b = 0;
    ut->chunkOffset = 0;
    ut->chunkLength = 0;
    ut->chunkNativeStart = edge;
    ut->chunkNativeLimit = edge;
    return FALSE;
}
//
//  openFragmentedUnicodeString
//      Open a UText on a UnicodeString with utext_openUnicodeString(), then
//      replace its access function with fragTextAccess and point its chunk
//      buffer at the UText's own 'b' field, so that every chunk holds exactly
//      one code unit.  The first chunk is primed with a forward access at index 0.
//
//      If the open fails, the UText returned by utext_openUnicodeString() is
//      passed back unmodified and *status carries the error.
//
UText *
openFragmentedUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
    UText *fragmented = utext_openUnicodeString(ut, s, status);
    if (!U_FAILURE(*status)) {
        // Swap in the one-unit-per-chunk access function and its backing storage.
        fragmented->access        = fragTextAccess;
        fragmented->chunkContents = (UChar *)&fragmented->b;

        // Load the initial chunk.  The result is deliberately ignored.
        fragmented->access(fragmented, 0, TRUE);
    }
    return fragmented;
}