mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-4669 ICU 3.4 review comments on UText, fixes added.
X-SVN-Rev: 18715
This commit is contained in:
parent
fc6581c380
commit
48e7893bad
3 changed files with 129 additions and 106 deletions
|
@ -1403,14 +1403,22 @@ private:
|
|||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CharacterIteratorUT)
|
||||
|
||||
CharacterIteratorUT::CharacterIteratorUT(UText *ut) {
|
||||
fUText = 0;
|
||||
textLength = 0;
|
||||
pos = 0;
|
||||
begin = 0;
|
||||
end = 0;
|
||||
if (ut == NULL) {
|
||||
return;
|
||||
}
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
fUText = utext_clone(NULL, ut, FALSE, &status);
|
||||
|
||||
// Set the inherited CharacterIterator fields
|
||||
textLength = utext_nativeLength(ut);
|
||||
pos = 0;
|
||||
begin = 0;
|
||||
end = textLength;
|
||||
if (fUText != NULL) {
|
||||
// Set the inherited CharacterIterator fields
|
||||
textLength = utext_nativeLength(ut);
|
||||
end = textLength;
|
||||
}
|
||||
}
|
||||
|
||||
CharacterIteratorUT::CharacterIteratorUT() {
|
||||
|
@ -1427,15 +1435,7 @@ CharacterIteratorUT::~CharacterIteratorUT() {
|
|||
|
||||
|
||||
CharacterIterator *CharacterIteratorUT::clone() const {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
CharacterIteratorUT *result = new CharacterIteratorUT();
|
||||
result->fUText = utext_clone(NULL, fUText, TRUE, &status);
|
||||
if (U_SUCCESS(status)) {
|
||||
result->textLength = utext_nativeLength(fUText);
|
||||
result->pos = 0;
|
||||
result->begin = 0;
|
||||
result->end = textLength;
|
||||
}
|
||||
CharacterIteratorUT *result = new CharacterIteratorUT(this->fUText);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
*
|
||||
* The Text Access API provides a means to allow text that is stored in alternative
|
||||
* formats to work with ICU services. ICU normally operates on text that is
|
||||
* stored UTF-16 format, in (UChar *) arrays for the C APIs or as type
|
||||
* stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type
|
||||
* UnicodeString for C++ APIs.
|
||||
*
|
||||
* ICU Text Access allows other formats, such as UTF-8 or non-contiguous
|
||||
|
@ -102,6 +102,35 @@
|
|||
* an adjusted index is returned, the original index referred to the
|
||||
* interior of a character.
|
||||
*
|
||||
* <em>Conventions for calling UText functions</em>
|
||||
*
|
||||
* Most UText access functions have as their first parameter a (UText *) pointer,
|
||||
* which specifies the UText to be used. Unless otherwise noted, the
|
||||
* pointer must refer to a valid, open UText. Attempting to
|
||||
* use a closed UText or passing a NULL pointer is a programming error and
|
||||
* will produce undefined results or NULL pointer exceptions.
|
||||
*
|
||||
* The UText_Open family of functions can either open an existing (closed)
|
||||
* UText, or heap allocate a new UText. Here is sample code for creating
|
||||
* a stack-allocated UText.
|
||||
*
|
||||
* \code
|
||||
* char *s = whatever(); // A utf-8 string
|
||||
* UErrorCode status = U_ZERO_ERROR;
|
||||
* UText ut = UTEXT_INITIALIZER;
|
||||
* utext_openUTF8(&ut, s, -1, &status);
|
||||
* if (U_FAILURE(status)) {
|
||||
* // error handling
|
||||
* } else {
|
||||
* // work with the UText
|
||||
* }
|
||||
* \endcode
|
||||
*
|
||||
* Any existing UText passed to an open function _must_ have been initialized,
|
||||
* either by the UTEXT_INITIALIZER, or by having been originally heap-allocated
|
||||
* by an open function. Passing NULL will cause the open function to
|
||||
* heap-allocate and fully initialize a new UText.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
@ -169,7 +198,7 @@ utext_close(UText *ut);
|
|||
* @param ut Pointer to a UText struct. If NULL, a new UText will be created.
|
||||
* If non-NULL, must refer to an initialized UText struct, which will then
|
||||
* be reset to reference the specified UTF-8 string.
|
||||
* @param s A UTF-8 string
|
||||
* @param s A UTF-8 string. Must not be NULL.
|
||||
* @param length The length of the UTF-8 string in bytes, or -1 if the string is
|
||||
* zero terminated.
|
||||
* @param status Errors are returned here.
|
||||
|
@ -251,15 +280,18 @@ utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status);
|
|||
|
||||
|
||||
/**
|
||||
* clone a UText. Much like opening a UText where the source text is itself
|
||||
* Clone a UText. Much like opening a UText where the source text is itself
|
||||
* another UText.
|
||||
*
|
||||
* A deep clone will copy both the UText data structures and the underlying text.
|
||||
* The original and cloned UText will operate completely independently; modifications
|
||||
* made to the text in one will not effect the other. Text providers are not
|
||||
* made to the text in one will not affect the other. Text providers are not
|
||||
* required to support deep clones. The user of clone() must check the status return
|
||||
* and be prepared to handle failures.
|
||||
*
|
||||
* The standard UText implementations for UTF8, UChar *, UnicodeString and
|
||||
* Replaceable all support deep cloning.
|
||||
*
|
||||
* A shallow clone replicates only the UText data structures; it does not make
|
||||
* a copy of the underlying text. Shallow clones can be used as an efficient way to
|
||||
* have multiple iterators active in a single text string that is not being
|
||||
|
@ -275,6 +307,8 @@ utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status);
|
|||
*
|
||||
* @param dest A UText struct to be filled in with the result of the clone operation,
|
||||
* or NULL if the clone function should heap-allocate a new UText struct.
|
||||
* If non-NULL, must refer to an already existing UText, which will then
|
||||
* be reset to become the clone.
|
||||
* @param src The UText to be cloned.
|
||||
* @param deep TRUE to request a deep clone, FALSE for a shallow clone.
|
||||
* @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR
|
||||
|
@ -336,7 +370,10 @@ utext_isLengthExpensive(const UText *ut);
|
|||
* This function is roughly equivalent to the sequence
|
||||
* utext_setNativeIndex(index);
|
||||
* utext_current32();
|
||||
* (There is a difference if the index is out of bounds by being less than zero)
|
||||
* (There is a subtle difference if the index is out of bounds by being less than zero -
|
||||
* utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32()
|
||||
* will return the char at zero. utext_char32At(negative index), on the other hand, will
|
||||
* return the U_SENTINEL value of -1.)
|
||||
*
|
||||
* @param ut the text to be accessed
|
||||
* @param nativeIndex the native index of the character to be accessed. If the index points
|
||||
|
@ -366,9 +403,12 @@ utext_current32(UText *ut);
|
|||
/**
|
||||
* Get the code point at the current iteration position of the UText, and
|
||||
* advance the position to the first index following the character.
|
||||
* Returns U_SENTINEL (-1) if the position is at the end of the
|
||||
* text.
|
||||
* This is a post-increment operation
|
||||
*
|
||||
* If the position is at the end of the text (the index following
|
||||
* the last character, which is also the length of the text),
|
||||
* return U_SENTINEL (-1) and do not advance the index.
|
||||
*
|
||||
* This is a post-increment operation.
|
||||
*
|
||||
* An inline macro version of this function, UTEXT_NEXT32(),
|
||||
* is available for performance critical use.
|
||||
|
@ -386,11 +426,12 @@ utext_next32(UText *ut);
|
|||
* Move the iterator position to the character (code point) whose
|
||||
* index precedes the current position, and return that character.
|
||||
* This is a pre-decrement operation.
|
||||
* Returns U_SENTINEL (-1) if the position is at the start of the text.
|
||||
* This is a pre-decrement operation.
|
||||
*
|
||||
* An inline macro version of this function, UTEXT_PREVIOUS32(),
|
||||
* is available for performance critical use.
|
||||
* If the initial position is at the start of the text (index of 0)
|
||||
* return U_SENTINEL (-1), and leave the position unchanged.
|
||||
*
|
||||
* An inline macro version of this function, UTEXT_PREVIOUS32(),
|
||||
* is available for performance critical use.
|
||||
*
|
||||
* @param ut the text to be accessed.
|
||||
* @return the previous UChar32 code point, or U_SENTINEL (-1)
|
||||
|
@ -403,12 +444,16 @@ utext_previous32(UText *ut);
|
|||
|
||||
|
||||
/**
|
||||
* Set the iteration index, access the text for forward iteration,
|
||||
* and return the code point starting at or before that index.
|
||||
* Set the iteration index and return the code point at that index.
|
||||
* Leave the iteration index at the start of the following code point.
|
||||
*
|
||||
* This function is the most efficient and convenient way to
|
||||
* begin a forward iteration.
|
||||
* begin a forward iteration. The results are identical to those
|
||||
* from the sequence
|
||||
* \code
|
||||
* utext_setNativeIndex();
|
||||
* utext_next32();
|
||||
* \endcode
|
||||
*
|
||||
* @param ut the text to be accessed.
|
||||
* @param nativeIndex Iteration index, in the native units of the text provider.
|
||||
|
@ -443,9 +488,9 @@ utext_previous32From(UText *ut, int32_t nativeIndex);
|
|||
* Get the current iterator position, which can range from 0 to
|
||||
* the length of the text.
|
||||
* The position is a native index into the input text, in whatever format it
|
||||
* may have, and may not always correspond to a UChar (UTF-16) index
|
||||
* into the text. The returned position will always be aligned to a
|
||||
* code point boundary
|
||||
* may have (possibly UTF-8 for example), and may not always be the same as
|
||||
* the corresponding UChar (UTF-16) index.
|
||||
* The returned position will always be aligned to a code point boundary.
|
||||
*
|
||||
* @param ut the text to be accessed.
|
||||
* @return the current index position, in the native units of the text provider.
|
||||
|
@ -458,7 +503,7 @@ utext_getNativeIndex(UText *ut);
|
|||
* Set the current iteration position to the nearest code point
|
||||
* boundary at or preceding the specified index.
|
||||
* The index is in the native units of the original input text.
|
||||
* If the index is out of range, it will be trimmed to be within
|
||||
* If the index is out of range, it will be pinned to be within
|
||||
* the range of the input text.
|
||||
* <p/>
|
||||
* It will usually be more efficient to begin an iteration
|
||||
|
@ -489,10 +534,6 @@ utext_setNativeIndex(UText *ut, int32_t nativeIndex);
|
|||
* forward or backward, but no further backward than to 0 and
|
||||
* no further forward than to utext_nativeLength().
|
||||
* The resulting index value will be in between 0 and length, inclusive.
|
||||
* <p/>
|
||||
* Because the index is kept in the native units of the text provider, the
|
||||
* actual numeric amount by which the index moves depends on the
|
||||
* underlying text storage representation of the text provider.
|
||||
*
|
||||
* @param ut the text to be accessed.
|
||||
* @param delta the signed number of code points to move the iteration position.
|
||||
|
@ -510,7 +551,7 @@ utext_moveIndex32(UText *ut, int32_t delta);
|
|||
* is specified in the native indices of the UText provider. These may not necessarily
|
||||
* be UTF-16 indices.
|
||||
* <p/>
|
||||
* The size (number of 16 bit UChars) in the data to be extracted is returned. The
|
||||
* The size (number of 16 bit UChars) of the data to be extracted is returned. The
|
||||
* full number of UChars is returned, even when the extracted text is truncated
|
||||
* because the specified buffer size is too small.
|
||||
*
|
||||
|
@ -519,10 +560,13 @@ utext_moveIndex32(UText *ut, int32_t delta);
|
|||
* terminating NUL is not included in the returned length.
|
||||
*
|
||||
* @param ut the UText from which to extract data.
|
||||
* @param nativeStart the native index of the first character to extract.
|
||||
* @param nativeStart the native index of the first character to extract.
|
||||
* If the specified index is out of range,
|
||||
* it will be pinned to be within 0 <= index <= textLength
|
||||
* @param nativeLimit the native string index of the position following the last
|
||||
* character to extract. If the specified limit is greater than the length
|
||||
* of the text, the limit will be trimmed back to the text length.
|
||||
* character to extract. If the specified index is out of range,
|
||||
* it will be pinned to be within 0 <= index <= textLength.
|
||||
* nativeLimit must be >= nativeStart.
|
||||
* @param dest the UChar (UTF-16) buffer into which the extracted text is placed
|
||||
* @param destCapacity The size, in UChars, of the destination buffer. May be zero
|
||||
* for precomputing the required size.
|
||||
|
|
|
@ -99,10 +99,7 @@ utext_setNativeIndex(UText *ut, int32_t index) {
|
|||
// If we are somewhere in the middle of a utf-16 buffer, check that new index
|
||||
// is not in the middle of a surrogate pair.
|
||||
if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
|
||||
UChar c = ut->chunk.contents[ut->chunk.offset];
|
||||
if (U16_TRAIL(c)) {
|
||||
utext_current32(ut); // force index to the start of the curent code point.
|
||||
}
|
||||
U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -110,6 +107,12 @@ utext_setNativeIndex(UText *ut, int32_t index) {
|
|||
|
||||
|
||||
|
||||
//
|
||||
// utext_current32. Get the UChar32 at the current position.
|
||||
// As a side effect, adjust the index if the current position
|
||||
// is on a trail surrogate. This feature is used internally;
|
||||
// from an external view, indexes are never on trail surrogates.
|
||||
//
|
||||
U_DRAFT UChar32 U_EXPORT2
|
||||
utext_current32(UText *ut) {
|
||||
UChar32 c = U_SENTINEL;
|
||||
|
@ -122,16 +125,8 @@ utext_current32(UText *ut) {
|
|||
c = ut->chunk.contents[ut->chunk.offset];
|
||||
if (U16_IS_SURROGATE(c)) {
|
||||
// looking at a surrogate. Could be unpaired, need to be careful.
|
||||
// Speed doesn't matter, will be very rare.
|
||||
UChar32 char16AtIndex = c;
|
||||
U16_GET(ut->chunk.contents, 0, ut->chunk.offset, ut->chunk.length, c);
|
||||
if (U16_IS_TRAIL(char16AtIndex) && U_IS_SUPPLEMENTARY(c)) {
|
||||
// Incoming position pointed to the trailing part of a supplementary pair.
|
||||
// Move offset to point to the lead surrogate. This is needed because utext_current()
|
||||
// is used internally to force code point alignment. When called from
|
||||
// the outside we should always be pre-aligned, but this check doesn't hurt.
|
||||
ut->chunk.offset--;
|
||||
}
|
||||
U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset)
|
||||
}
|
||||
}
|
||||
return c;
|
||||
|
@ -144,8 +139,8 @@ utext_char32At(UText *ut, int32_t nativeIndex) {
|
|||
utext_setNativeIndex(ut, nativeIndex);
|
||||
if (ut->chunk.offset < ut->chunk.length) {
|
||||
c = ut->chunk.contents[ut->chunk.offset];
|
||||
if (c >= 0xd800) {
|
||||
c = utext_current32(ut);
|
||||
if (U16_IS_SURROGATE(c)) {
|
||||
U16_GET(ut->chunk.contents, 0, ut->chunk.offset, ut->chunk.length, c);
|
||||
}
|
||||
}
|
||||
return c;
|
||||
|
@ -155,54 +150,31 @@ utext_char32At(UText *ut, int32_t nativeIndex) {
|
|||
U_DRAFT UChar32 U_EXPORT2
|
||||
utext_next32(UText *ut) {
|
||||
UTextChunk *chunk = &ut->chunk;
|
||||
UChar32 c = U_SENTINEL;
|
||||
UChar32 c;
|
||||
|
||||
if (chunk->offset >= chunk->length) {
|
||||
if (ut->access(ut, chunk->nativeLimit, TRUE, chunk) == FALSE) {
|
||||
goto next32_return;
|
||||
return U_SENTINEL;
|
||||
}
|
||||
}
|
||||
|
||||
c = chunk->contents[chunk->offset++];
|
||||
if (U16_IS_SURROGATE(c)) {
|
||||
// looking at a surrogate. Could be unpaired, need to be careful.
|
||||
// Speed doesn't matter, will be very rare.
|
||||
chunk->offset--;
|
||||
c = utext_current32(ut);
|
||||
chunk->offset++;
|
||||
if (U_IS_SUPPLEMENTARY(c)) {
|
||||
chunk->offset++;
|
||||
}
|
||||
}
|
||||
|
||||
next32_return:
|
||||
U16_NEXT(chunk->contents, chunk->offset, chunk->length, c);
|
||||
return c;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
U_DRAFT UChar32 U_EXPORT2
|
||||
utext_previous32(UText *ut) {
|
||||
UTextChunk *chunk = &ut->chunk;
|
||||
int32_t offset = chunk->offset;
|
||||
UChar32 c = U_SENTINEL;
|
||||
UChar32 c;
|
||||
|
||||
if (offset <= 0) {
|
||||
if (chunk->offset <= 0) {
|
||||
if (ut->access(ut, chunk->nativeStart, FALSE, chunk) == FALSE) {
|
||||
goto prev32_return;
|
||||
return U_SENTINEL;
|
||||
}
|
||||
offset = chunk->offset;
|
||||
}
|
||||
|
||||
c = chunk->contents[--offset];
|
||||
chunk->offset = offset;
|
||||
if (U16_IS_SURROGATE(c)) {
|
||||
// Note that utext_current() will move the chunk offset to the lead surrogate
|
||||
// if we come in referring to trail half of a surrogate pair.
|
||||
c = utext_current32(ut);
|
||||
}
|
||||
|
||||
prev32_return:
|
||||
U16_PREV(chunk->contents, 0, chunk->offset, c);
|
||||
// TODO: update position
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -216,7 +188,7 @@ utext_next32From(UText *ut, int32_t index) {
|
|||
if(index<chunk->nativeStart || index>=chunk->nativeLimit) {
|
||||
if(!ut->access(ut, index, TRUE, chunk)) {
|
||||
// no chunk available here
|
||||
goto next32return;
|
||||
return U_SENTINEL;
|
||||
}
|
||||
} else if(chunk->nonUTF16Indexes) {
|
||||
chunk->offset = ut->mapNativeIndexToUTF16(ut, index);
|
||||
|
@ -228,9 +200,9 @@ utext_next32From(UText *ut, int32_t index) {
|
|||
if (U16_IS_SURROGATE(c)) {
|
||||
// Surrogate code unit. Speed doesn't matter, let plain next32() do the work.
|
||||
chunk->offset--; // undo the ++, above.
|
||||
U16_SET_CP_START(ut->chunk.contents, 0, ut->chunk.offset)
|
||||
c = utext_next32(ut);
|
||||
}
|
||||
next32return:
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -238,13 +210,13 @@ next32return:
|
|||
U_DRAFT UChar32 U_EXPORT2
|
||||
utext_previous32From(UText *ut, int32_t index) {
|
||||
UTextChunk *chunk = &ut->chunk;
|
||||
UChar32 c = U_SENTINEL;
|
||||
UChar32 c;
|
||||
|
||||
if(index<=chunk->nativeStart || index>chunk->nativeLimit) {
|
||||
// Requested native index is outside of the current chunk.
|
||||
if(!ut->access(ut, index, FALSE, chunk)) {
|
||||
// no chunk available here
|
||||
goto prev32return;
|
||||
return U_SENTINEL;
|
||||
}
|
||||
} else if(chunk->nonUTF16Indexes) {
|
||||
chunk->offset=ut->mapNativeIndexToUTF16(ut, index);
|
||||
|
@ -253,28 +225,22 @@ utext_previous32From(UText *ut, int32_t index) {
|
|||
chunk->offset = index - chunk->nativeStart;
|
||||
// put offset onto a code point boundary if it isn't there already.
|
||||
if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
|
||||
c = chunk->contents[chunk->offset];
|
||||
if (U16_TRAIL(c)) {
|
||||
utext_current32(ut); // force index to the start of the curent code point.
|
||||
}
|
||||
U16_SET_CP_START(chunk->contents, 0, chunk->offset)
|
||||
}
|
||||
}
|
||||
|
||||
if (chunk->offset<=0) {
|
||||
// already at the start of text. Return U_SENTINEL.
|
||||
goto prev32return;
|
||||
return U_SENTINEL;
|
||||
}
|
||||
|
||||
// Do the operation assuming that there are no surrogates involved. Fast, common case.
|
||||
chunk->offset--;
|
||||
c = chunk->contents[chunk->offset];
|
||||
|
||||
// Check for the char being a surrogate, get the whole char if it is.
|
||||
if (U16_IS_SURROGATE(c)) {
|
||||
c = utext_current32(ut);
|
||||
U16_PREV(chunk->contents, 0, chunk->offset, c);
|
||||
if (U_IS_LEAD(c)) {
|
||||
// User supplied index might have been pointing to the trail surrogate
|
||||
// of a pair, in which case we need to get the whole supplementary value.
|
||||
c = utext_current32(ut);
|
||||
}
|
||||
|
||||
prev32return:
|
||||
return c;
|
||||
}
|
||||
|
||||
|
@ -474,6 +440,19 @@ utext_close(UText *ut) {
|
|||
uprv_free(ut->pExtra);
|
||||
ut->pExtra = NULL;
|
||||
}
|
||||
|
||||
// Zero out fields of the closed UText. This is a defensive move,
|
||||
// intended to cause applications that inadvertently use a closed
|
||||
// utext to crash with null pointer errors.
|
||||
ut->clone = NULL;
|
||||
ut->nativeLength = NULL;
|
||||
ut->access = NULL;
|
||||
ut->extract = NULL;
|
||||
ut->replace = NULL;
|
||||
ut->copy = NULL;
|
||||
ut->close = NULL;
|
||||
ut->chunk.contents = NULL;
|
||||
|
||||
if (ut->flags & UTEXT_HEAP_ALLOCATED) {
|
||||
// This UText was allocated by UText setup. We need to free it.
|
||||
// Clear magic, so we can detect if the user messes up and immediately
|
||||
|
|
Loading…
Add table
Reference in a new issue