ICU-4669 ICU 3.4 review comments on UText, fixes added.

X-SVN-Rev: 18715
This commit is contained in:
Andy Heninger 2005-10-25 00:49:31 +00:00
parent fc6581c380
commit 48e7893bad
3 changed files with 129 additions and 106 deletions

View file

@ -1403,14 +1403,22 @@ private:
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CharacterIteratorUT)
CharacterIteratorUT::CharacterIteratorUT(UText *ut) {
fUText = 0;
textLength = 0;
pos = 0;
begin = 0;
end = 0;
if (ut == NULL) {
return;
}
UErrorCode status = U_ZERO_ERROR;
fUText = utext_clone(NULL, ut, FALSE, &status);
// Set the inherited CharacterItertor fields
textLength = utext_nativeLength(ut);
pos = 0;
begin = 0;
end = textLength;
if (fUText != NULL) {
// Set the inherited CharacterIterator fields
textLength = utext_nativeLength(ut);
end = textLength;
}
}
CharacterIteratorUT::CharacterIteratorUT() {
@ -1427,15 +1435,7 @@ CharacterIteratorUT::~CharacterIteratorUT() {
CharacterIterator *CharacterIteratorUT::clone() const {
UErrorCode status = U_ZERO_ERROR;
CharacterIteratorUT *result = new CharacterIteratorUT();
result->fUText = utext_clone(NULL, fUText, TRUE, &status);
if (U_SUCCESS(status)) {
result->textLength = utext_nativeLength(fUText);
result->pos = 0;
result->begin = 0;
result->end = textLength;
}
CharacterIteratorUT *result = new CharacterIteratorUT(this->fUText);
return result;
}

View file

@ -23,7 +23,7 @@
*
* The Text Access API provides a means to allow text that is stored in alternative
* formats to work with ICU services. ICU normally operates on text that is
* stored UTF-16 format, in (UChar *) arrays for the C APIs or as type
* stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type
* UnicodeString for C++ APIs.
*
* ICU Text Access allows other formats, such as UTF-8 or non-contiguous
@ -102,6 +102,35 @@
* an adjusted index is returned, the original index referred to the
* interior of a character.
*
* <em>Conventions for calling UText functions</em>
*
* Most UText access functions have as their first parameter a (UText *) pointer,
* which specifies the UText to be used. Unless otherwise noted, the
* pointer must refer to a valid, open UText. Attempting to
* use a closed UText or passing a NULL pointer is a programming error and
* will produce undefined results or NULL pointer exceptions.
*
* The utext_open family of functions can either open an existing (closed)
* UText, or heap allocate a new UText. Here is sample code for creating
* a stack-allocated UText.
*
* \code
* char *s = whatever(); // A utf-8 string
* UErrorCode status = U_ZERO_ERROR;
* UText ut = UTEXT_INITIALIZER;
* utext_openUTF8(&ut, s, -1, &status);
* if (U_FAILURE(status)) {
* // error handling
* } else {
* // work with the UText
* }
* \endcode
*
* Any existing UText passed to an open function _must_ have been initialized,
* either by the UTEXT_INITIALIZER, or by having been originally heap-allocated
* by an open function. Passing NULL will cause the open function to
* heap-allocate and fully initialize a new UText.
*
*/
@ -169,7 +198,7 @@ utext_close(UText *ut);
* @param ut Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
* be reset to reference the specified UTF-8 string.
* @param s A UTF-8 string
* @param s A UTF-8 string. Must not be NULL.
* @param length The length of the UTF-8 string in bytes, or -1 if the string is
* zero terminated.
* @param status Errors are returned here.
@ -251,15 +280,18 @@ utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status);
/**
* clone a UText. Much like opening a UText where the source text is itself
* Clone a UText. Much like opening a UText where the source text is itself
* another UText.
*
* A deep clone will copy both the UText data structures and the underlying text.
* The original and cloned UText will operate completely independently; modifications
* made to the text in one will not effect the other. Text providers are not
* made to the text in one will not affect the other. Text providers are not
* required to support deep clones. The user of clone() must check the status return
* and be prepared to handle failures.
*
* The standard UText implementations for UTF8, UChar *, UnicodeString and
* Replaceable all support deep cloning.
*
* A shallow clone replicates only the UText data structures; it does not make
* a copy of the underlying text. Shallow clones can be used as an efficient way to
* have multiple iterators active in a single text string that is not being
@ -275,6 +307,8 @@ utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status);
*
* @param dest A UText struct to be filled in with the result of the clone operation,
* or NULL if the clone function should heap-allocate a new UText struct.
* If non-NULL, must refer to an already existing UText, which will then
* be reset to become the clone.
* @param src The UText to be cloned.
* @param deep TRUE to request a deep clone, FALSE for a shallow clone.
* @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR
@ -336,7 +370,10 @@ utext_isLengthExpensive(const UText *ut);
* This function is roughly equivalent to the sequence
* utext_setNativeIndex(index);
* utext_current32();
* (There is a difference if the index is out of bounds by being less than zero)
* (There is a subtle difference if the index is out of bounds by being less than zero -
* utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32()
* will return the char at zero. utext_char32At(negative index), on the other hand, will
* return the U_SENTINEL value of -1.)
*
* @param ut the text to be accessed
* @param nativeIndex the native index of the character to be accessed. If the index points
@ -366,9 +403,12 @@ utext_current32(UText *ut);
/**
* Get the code point at the current iteration position of the UText, and
* advance the position to the first index following the character.
* Returns U_SENTINEL (-1) if the position is at the end of the
* text.
* This is a post-increment operation
*
* If the position is at the end of the text (the index following
* the last character, which is also the length of the text),
* return U_SENTINEL (-1) and do not advance the index.
*
* This is a post-increment operation.
*
* An inline macro version of this function, UTEXT_NEXT32(),
* is available for performance critical use.
@ -386,11 +426,12 @@ utext_next32(UText *ut);
* Move the iterator position to the character (code point) whose
* index precedes the current position, and return that character.
* This is a pre-decrement operation.
* Returns U_SENTINEL (-1) if the position is at the start of the text.
* This is a pre-decrement operation.
*
* An inline macro version of this function, UTEXT_PREVIOUS32(),
* is available for performance critical use.
* If the initial position is at the start of the text (index of 0)
* return U_SENTINEL (-1), and leave the position unchanged.
*
* An inline macro version of this function, UTEXT_PREVIOUS32(),
* is available for performance critical use.
*
* @param ut the text to be accessed.
* @return the previous UChar32 code point, or U_SENTINEL (-1)
@ -403,12 +444,16 @@ utext_previous32(UText *ut);
/**
* Set the iteration index, access the text for forward iteration,
* and return the code point starting at or before that index.
* Set the iteration index and return the code point at that index.
* Leave the iteration index at the start of the following code point.
*
* This function is the most efficient and convenient way to
* begin a forward iteration.
* begin a forward iteration. The results are identical to those
* from the sequence
* \code
* utext_setNativeIndex();
* utext_next32();
* \endcode
*
* @param ut the text to be accessed.
* @param nativeIndex Iteration index, in the native units of the text provider.
@ -443,9 +488,9 @@ utext_previous32From(UText *ut, int32_t nativeIndex);
* Get the current iterator position, which can range from 0 to
* the length of the text.
* The position is a native index into the input text, in whatever format it
* may have, and may not always correspond to a UChar (UTF-16) index
* into the text. The returned position will always be aligned to a
* code point boundary
* may have (possibly UTF-8 for example), and may not always be the same as
* the corresponding UChar (UTF-16) index.
* The returned position will always be aligned to a code point boundary.
*
* @param ut the text to be accessed.
* @return the current index position, in the native units of the text provider.
@ -458,7 +503,7 @@ utext_getNativeIndex(UText *ut);
* Set the current iteration position to the nearest code point
* boundary at or preceding the specified index.
* The index is in the native units of the original input text.
* If the index is out of range, it will be trimmed to be within
* If the index is out of range, it will be pinned to be within
* the range of the input text.
* <p/>
* It will usually be more efficient to begin an iteration
@ -489,10 +534,6 @@ utext_setNativeIndex(UText *ut, int32_t nativeIndex);
* forward or backward, but no further backward than to 0 and
* no further forward than to utext_nativeLength().
* The resulting index value will be in between 0 and length, inclusive.
* <p/>
* Because the index is kept in the native units of the text provider, the
* actual numeric amount by which the index moves depends on the
* underlying text storage representation of the text provider.
*
* @param ut the text to be accessed.
* @param delta the signed number of code points to move the iteration position.
@ -510,7 +551,7 @@ utext_moveIndex32(UText *ut, int32_t delta);
* is specified in the native indices of the UText provider. These may not necessarily
* be UTF-16 indices.
* <p/>
* The size (number of 16 bit UChars) in the data to be extracted is returned. The
* The size (number of 16 bit UChars) of the data to be extracted is returned. The
* full number of UChars is returned, even when the extracted text is truncated
* because the specified buffer size is too small.
*
@ -519,10 +560,13 @@ utext_moveIndex32(UText *ut, int32_t delta);
* terminating NUL is not included in the returned length.
*
* @param ut the UText from which to extract data.
* @param nativeStart the native index of the first character to extract.
* @param nativeStart the native index of the first character to extract.
* If the specified index is out of range,
* it will be pinned to be within 0 <= index <= textLength.
* @param nativeLimit the native string index of the position following the last
* character to extract. If the specified limit is greater than the length
* of the text, the limit will be trimmed back to the text length.
* character to extract. If the specified index is out of range,
* it will be pinned to be within 0 <= index <= textLength.
* nativeLimit must be >= nativeStart.
* @param dest the UChar (UTF-16) buffer into which the extracted text is placed
* @param destCapacity The size, in UChars, of the destination buffer. May be zero
* for precomputing the required size.

View file

@ -99,10 +99,7 @@ utext_setNativeIndex(UText *ut, int32_t index) {
// If we are somewhere in the middle of a utf-16 buffer, check that new index
// is not in the middle of a surrogate pair.
if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
UChar c = ut->chunk.contents[ut->chunk.offset];
if (U16_TRAIL(c)) {
utext_current32(ut); // force index to the start of the curent code point.
}
U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset)
}
}
}
@ -110,6 +107,12 @@ utext_setNativeIndex(UText *ut, int32_t index) {
//
// utext_current32. Get the UChar32 at the current position.
// As a side effect, adjust the index if the current position
// is on a trail surrogate. This feature is used internally;
// from an external view, indexes are never on trail surrogates.
//
U_DRAFT UChar32 U_EXPORT2
utext_current32(UText *ut) {
UChar32 c = U_SENTINEL;
@ -122,16 +125,8 @@ utext_current32(UText *ut) {
c = ut->chunk.contents[ut->chunk.offset];
if (U16_IS_SURROGATE(c)) {
// looking at a surrogate. Could be unpaired, need to be careful.
// Speed doesn't matter, will be very rare.
UChar32 char16AtIndex = c;
U16_GET(ut->chunk.contents, 0, ut->chunk.offset, ut->chunk.length, c);
if (U16_IS_TRAIL(char16AtIndex) && U_IS_SUPPLEMENTARY(c)) {
// Incoming position pointed to the trailing part of a supplementary pair.
// Move offset to point to the lead surrogate. This is needed because utext_current()
// is used internally to force code point alignment. When called from
// the outside we should always be pre-aligned, but this check doesn't hurt.
ut->chunk.offset--;
}
U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset)
}
}
return c;
@ -144,8 +139,8 @@ utext_char32At(UText *ut, int32_t nativeIndex) {
utext_setNativeIndex(ut, nativeIndex);
if (ut->chunk.offset < ut->chunk.length) {
c = ut->chunk.contents[ut->chunk.offset];
if (c >= 0xd800) {
c = utext_current32(ut);
if (U16_IS_SURROGATE(c)) {
U16_GET(ut->chunk.contents, 0, ut->chunk.offset, ut->chunk.length, c);
}
}
return c;
@ -155,54 +150,31 @@ utext_char32At(UText *ut, int32_t nativeIndex) {
U_DRAFT UChar32 U_EXPORT2
utext_next32(UText *ut) {
UTextChunk *chunk = &ut->chunk;
UChar32 c = U_SENTINEL;
UChar32 c;
if (chunk->offset >= chunk->length) {
if (ut->access(ut, chunk->nativeLimit, TRUE, chunk) == FALSE) {
goto next32_return;
return U_SENTINEL;
}
}
c = chunk->contents[chunk->offset++];
if (U16_IS_SURROGATE(c)) {
// looking at a surrogate. Could be unpaired, need to be careful.
// Speed doesn't matter, will be very rare.
chunk->offset--;
c = utext_current32(ut);
chunk->offset++;
if (U_IS_SUPPLEMENTARY(c)) {
chunk->offset++;
}
}
next32_return:
U16_NEXT(chunk->contents, chunk->offset, chunk->length, c);
return c;
}
}
U_DRAFT UChar32 U_EXPORT2
utext_previous32(UText *ut) {
UTextChunk *chunk = &ut->chunk;
int32_t offset = chunk->offset;
UChar32 c = U_SENTINEL;
UChar32 c;
if (offset <= 0) {
if (chunk->offset <= 0) {
if (ut->access(ut, chunk->nativeStart, FALSE, chunk) == FALSE) {
goto prev32_return;
return U_SENTINEL;
}
offset = chunk->offset;
}
c = chunk->contents[--offset];
chunk->offset = offset;
if (U16_IS_SURROGATE(c)) {
// Note that utext_current() will move the chunk offset to the lead surrogate
// if we come in referring to trail half of a surrogate pair.
c = utext_current32(ut);
}
prev32_return:
U16_PREV(chunk->contents, 0, chunk->offset, c);
// TODO: update position
return c;
}
@ -216,7 +188,7 @@ utext_next32From(UText *ut, int32_t index) {
if(index<chunk->nativeStart || index>=chunk->nativeLimit) {
if(!ut->access(ut, index, TRUE, chunk)) {
// no chunk available here
goto next32return;
return U_SENTINEL;
}
} else if(chunk->nonUTF16Indexes) {
chunk->offset = ut->mapNativeIndexToUTF16(ut, index);
@ -228,9 +200,9 @@ utext_next32From(UText *ut, int32_t index) {
if (U16_IS_SURROGATE(c)) {
// Surrogate code unit. Speed doesn't matter, let plain next32() do the work.
chunk->offset--; // undo the ++, above.
U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset)
c = utext_next32(ut);
}
next32return:
return c;
}
@ -238,13 +210,13 @@ next32return:
U_DRAFT UChar32 U_EXPORT2
utext_previous32From(UText *ut, int32_t index) {
UTextChunk *chunk = &ut->chunk;
UChar32 c = U_SENTINEL;
UChar32 c;
if(index<=chunk->nativeStart || index>chunk->nativeLimit) {
// Requested native index is outside of the current chunk.
if(!ut->access(ut, index, FALSE, chunk)) {
// no chunk available here
goto prev32return;
return U_SENTINEL;
}
} else if(chunk->nonUTF16Indexes) {
chunk->offset=ut->mapNativeIndexToUTF16(ut, index);
@ -253,28 +225,22 @@ utext_previous32From(UText *ut, int32_t index) {
chunk->offset = index - chunk->nativeStart;
// put offset onto a code point boundary if it isn't there already.
if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
c = chunk->contents[chunk->offset];
if (U16_TRAIL(c)) {
utext_current32(ut); // force index to the start of the curent code point.
}
U16_SET_CP_START(chunk->contents, 0, chunk->offset)
}
}
if (chunk->offset<=0) {
// already at the start of text. Return U_SENTINEL.
goto prev32return;
return U_SENTINEL;
}
// Do the operation assuming that there are no surrogates involved. Fast, common case.
chunk->offset--;
c = chunk->contents[chunk->offset];
// Check for the char being a surrogate, get the whole char if it is.
if (U16_IS_SURROGATE(c)) {
c = utext_current32(ut);
U16_PREV(chunk->contents, 0, chunk->offset, c);
if (U_IS_LEAD(c)) {
// User supplied index might have been pointing to the trail surrogate
// of a pair, in which case we need to get the whole supplementary value.
c = utext_current32(ut);
}
prev32return:
return c;
}
@ -474,6 +440,19 @@ utext_close(UText *ut) {
uprv_free(ut->pExtra);
ut->pExtra = NULL;
}
// Zero out fields of the closed UText. This is a defensive move,
// intended to cause applications that inadvertently use a closed
// utext to crash with null pointer errors.
ut->clone = NULL;
ut->nativeLength = NULL;
ut->access = NULL;
ut->extract = NULL;
ut->replace = NULL;
ut->copy = NULL;
ut->close = NULL;
ut->chunk.contents = NULL;
if (ut->flags & UTEXT_HEAP_ALLOCATED) {
// This UText was allocated by UText setup. We need to free it.
// Clear magic, so we can detect if the user messes up and immediately