ICU-3944 text access, work in progress

X-SVN-Rev: 17958
This commit is contained in:
Andy Heninger 2005-06-21 18:47:04 +00:00
parent 3609db9872
commit 40e2e39792
4 changed files with 780 additions and 506 deletions

View file

@ -45,6 +45,63 @@
* or system with a unique text storage format can implement a set of
* UText provider functions for that format, which will then allow other
* ICU services to operate on that format.
*
*
* <em>Iterating over text</em>
*
* Here is sample code for a forward iteration over the contents of a UText
*
* \code
* UChar32 c;
* UText *ut = whatever();
*
* for (c=utext_next32From(ut, 0); c!=U_SENTINEL; c=utext_next32(ut)) {
* // do whatever with the codepoint c here.
* }
* \endcode
*
* And here is similar code to iterate in the reverse direction, from the end
* of the text towards the beginning.
*
* \code
* UChar32 c;
* UText *ut = whatever();
* int textLength = utext_length(ut);
* for (c=utext_previous32From(ut, textLength); c!=U_SENTINEL; c=utext_previous32(ut)) {
* // do whatever with the codepoint c here.
* }
* \endcode
*
* <em>Characters and Indexing</em>
*
* Indexing into text by UText functions is nearly always in terms of the native
* indexing of the underlying text storage. The storage format could be utf-8
* or utf-32, for example. When coding to the UText access API, no assumptions
* can be made regarding the size of characters, or how far an index
* may move when iterating between characters.
*
* All indices supplied to UText functions are pinned to the length of the
* text. An out-of-bounds index is not considered to be an error, but is
* adjusted to be in the range 0 <= index <= length of input text.
*
*
* When an index position is returned from a UText function, it will be
* a native index to the underlying text. In the case of multi-unit characters,
* it will always refer to the first position, never to the interior. This
* is essentially the same thing as saying that a returned index will always
* point to a boundary between characters.
*
* When a native index is supplied to a UText function, all indices that
* refer to any part of a multi-unit character representation are considered
* to be equivalent. In the case of multi-unit characters, an incoming index
* will be logically normalized to refer to the start of the character.
*
* It is possible to test whether a native index is on a code point boundary
* by doing a utext_setIndex() followed by a utext_getIndex().
* If the index returns unchanged, it was on a code point boundary. If
* an adjusted index is returned, the original index referred to the
* interior of a character.
*
*/
@ -102,8 +159,10 @@ utext_close(UText *ut);
/**
* Open a read-only UText implementation for UTF-8 strings.
*
* Any invalid utf-8 sequences in the input will appear on the output side
* of the UText as Unicode Replacement characters, \uFFFD.
* Any invalid utf-8 in the input will be handled in this way:
* a sequence of bytes that has the form of a truncated, but otherwise valid,
* utf-8 sequence will be replaced by a single unicode replacement character, \uFFFD.
* Any other illegal bytes will each be replaced by a \uFFFD.
*
* @param ut Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
@ -119,31 +178,6 @@ utext_close(UText *ut);
U_DRAFT UText * U_EXPORT2
utext_openUTF8(UText *ut, const uint8_t *s, int32_t length, UErrorCode *status);
/**
* Open a read-only UText implementation for a SBCS strings.
* The implementation converts 1:1 according to the provided mapping table.
* Supplementary code points are not supported.
*
* @param ut Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
* be reset to reference the specified input string.
* @param toU Mapping table for conversion from SBCS to Unicode (BMP only).
* The mapping table must be available during the lifetime of the
* UText object.
* @param s A byte text string
* @param length The length of the input string in bytes, or -1 if the string is
* zero terminated.
* @param status Errors are returned here.
* @return A pointer to the UText. If a pre-allocated UText was provided, it
* will always be used and returned.
* @draft ICU 3.4
*/
U_DRAFT UText * U_EXPORT2
utext_openSBCS(UText *ut,
const UChar toU[256],
const char *s, int32_t length,
UErrorCode *status);
/**
* Open a read-only UText for UChar * string.
@ -160,12 +194,12 @@ utext_openSBCS(UText *ut,
* @draft ICU 3.4
*/
U_DRAFT UText * U_EXPORT2
utext_openUChar(UText *ut, const UChar *s, int32_t length, UErrorCode *status);
utext_openUChars(UText *ut, const UChar *s, int32_t length, UErrorCode *status);
#ifdef XP_CPLUSPLUS
/**
* Open a UText for a UnicodeString.
* Open a writable UText for a non-const UnicodeString.
*
* @param t Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
@ -181,7 +215,7 @@ utext_openUnicodeString(UText *t, UnicodeString *s, UErrorCode *status);
/**
* Open a UText for a const UnicodeString. The resulting UText will not be writeable.
* Open a UText for a const UnicodeString. The resulting UText will not be writable.
*
* @param t Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
@ -254,7 +288,7 @@ utext_clone(UText *dest, const UText *src, UBool deep, UErrorCode *status);
/**
* Get the length of the text. Depending on the characteristics
* of the underlying text representation, this may be expensive.
* @see utext_lengthIsExpensive()
* @see utext_isLengthExpensive()
*
*
* @param ut the text to be accessed.
@ -269,28 +303,33 @@ utext_length(UText *ut);
* Return TRUE if calculating the length of the text could be expensive.
* Finding the length of NUL terminated strings is considered to be expensive.
*
* Note that the value of this function may change
* as the result of other operations on a UText.
* Once the length of a string has been discovered, it will no longer
* be expensive to report it.
*
* @param ut the text to be accessed.
* @return TRUE if determining the length of the text could be time consuming.
* @draft ICU 3.4
*/
U_DRAFT UBool U_EXPORT2
utext_lengthIsExpensive(const UText *ut);
utext_isLengthExpensive(const UText *ut);
/**
* Returns the code point at the requested index,
* or U_SENTINEL (-1) if it is out of bounds.
* Sets the current iteration position to the specified index.
*
* If the specified index points to the interior of a multi-unit
* character - one of the trail bytes of a utf-8 sequence, for example -
* the complete code point will be returned, and the current
* iteration position will be left at the start of the code point.
* the complete code point will be returned.
*
* TODO: drop this function as being dangerous? There is no clean way for applications
* to increment the index, which is in native units. Likely user error to increment
* it by utf-16 units. next32From(index) does same thing, except for where iteration
* position is left.
* The iteration position will be set to the start of the returned code point.
*
* This function is roughly equivalent to the sequence
* utext_setIndex(index);
* utext_current();
* (There is a difference if the index is out of bounds by being less than zero)
*
* @param ut the text to be accessed
* @param nativeIndex the native index of the character to be accessed. If the index points
* to other than the first unit of a multi-unit character, it will be adjusted
@ -299,7 +338,7 @@ utext_lengthIsExpensive(const UText *ut);
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
utext_char32At(UText *ut, int32_t index);
utext_char32At(UText *ut, int32_t nativeIndex);
/**
@ -309,7 +348,7 @@ utext_char32At(UText *ut, int32_t index);
* the input text.
*
* @param ut the text to be accessed.
* @return the Unicode code point at the specified index.
* @return the Unicode code point at the current iterator position.
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
@ -358,20 +397,17 @@ utext_previous32(UText *ut);
* and return the code point starting at or before that index.
* Leave the iteration index at the start of the following code point.
*
* An inline macro version of this function, UTEXT_NEXT32FROM(),
* is available for performance critical use.
* This function is the most efficient and convenient way to
* begin a forward iteration.
*
* @param ut the text to be accessed.
* @param index Iteration index.
* @param nativeIndex Iteration index, in the native units of the text provider.
* @return Code point which starts at or before index,
* or U_SENTINEL (-1) if it is out of bounds.
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
utext_next32From(UText *ut, int32_t index);
utext_next32From(UText *ut, int32_t nativeIndex);
@ -380,21 +416,18 @@ utext_next32From(UText *ut, int32_t index);
* one specified by the initial index. Leave the iteration position
* at the start of the returned code point.
*
* An inline macro version of this function, UTEXT_PREVIOUS32FROM(),
* is available for performance critical use.
* This function is the most efficient and convenient way to
* begin a backwards iteration.
*
* @param ut the text to be accessed.
* @param index Iteration index.
* @param nativeIndex Iteration index in the native units of the text provider.
* @return Code point preceding the one at the initial index,
* or U_SENTINEL (-1) if it is out of bounds.
*
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
utext_previous32From(UText *ut, int32_t index);
utext_previous32From(UText *ut, int32_t nativeIndex);
/**
* Get the current iterator position, which can range from 0 to
@ -405,33 +438,40 @@ utext_previous32From(UText *ut, int32_t index);
* code point boundary
*
* @param ut the text to be accessed.
* @return the current index position, in native units.
* @return the current index position, in the native units of the text provider.
* @draft ICU 3.4
*/
U_DRAFT int32_t U_EXPORT2
utext_getIndex(UText *ut);
/**
* Set the current iteration position to the specified index.
* Set the current iteration position to the nearest code point
* boundary at or preceding the specified index.
* The index is in the native units of the original input text.
* If the index is out of range, it will be trimmed to be witnin
* If the index is out of range, it will be trimmed to be within
* the range of the input text.
* If the specified index does not fall on a code point boundary in
* the input text, it will be adjusted back to do so.
* <p/>
* It will usually be more efficient to begin an iteration
* using the functions utext_next32From() or utext_previous32From()
* rather than setIndex().
* <p/>
* Moving the index position to an adjacent character is best done
* with utext_next32(), utext_previous32() or utext_moveIndex().
* Attempting to do direct arithmetic on the index position is
* complicated by the fact that the size (in native units) of a
* character depends on the underlying representation of the character
* (utf-8, utf-16, utf-32, arbitrary codepage), and is not
* easily knowable.
*
* @param ut the text to be accessed.
* @param nativeIndex the native unit index of the new iteration position.
* @draft ICU 3.4
*/
U_DRAFT void U_EXPORT2
utext_setIndex(UText *ut, int32_t index);
utext_setIndex(UText *ut, int32_t nativeIndex);
/**
* Move the iterator postion by delta code points. The amount to move
* Move the iterator position by delta code points. The number of code points
* is a signed number; a negative delta will move the iterator backwards,
* towards the start of the text.
* <p/>
@ -439,6 +479,10 @@ utext_setIndex(UText *ut, int32_t index);
* forward or backward, but no further backward than to 0 and
* no further forward than to length().
* The resulting index value will be in between 0 and length(), inclusive.
* <p/>
* Because the index is kept in the native units of the text provider, the
* actual numeric amount by which the index moves depends on the
* underlying text storage representation of the text provider.
*
* @param ut the text to be accessed.
* @param delta the signed number of code points to move the iteration position.
@ -467,22 +511,21 @@ utext_moveIndex(UText *ut, int32_t delta);
* @param ut the UText from which to extract data.
* @param nativeStart the native index of the first character to extract.
* @param nativeLimit the native string index of the position following the last
* character to extract.
* character to extract. If the specified limit is greater than the length
* of the text, the limit will be trimmed back to the text length.
* @param dest the UChar (utf-16) buffer into which the extracted text is placed
* @param destCapacity The size, in UChars, of the destination buffer. May be zero
* for precomputing the required size.
* @param status receives any error status.
* U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the
* buffer was too small. Returns number of UChars for preflighting.
* @return Number of UChars in the data. Does not include a trailing NUL.
*
* TODO: how should invalid source data be handled? Corrupt utf-8, for example.
* @return Number of UChars in the data to be extracted. Does not include a trailing NUL.
*
* @draft ICU 3.4
*/
U_DRAFT int32_t U_EXPORT2
utext_extract(UText *ut,
int32_t start, int32_t limit,
int32_t nativeStart, int32_t nativeLimit,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
@ -533,38 +576,6 @@ utext_extract(UText *ut,
(ut)->chunk.contents[--((ut)->chunk.offset)] : utext_previous32(ut))
/**
* inline version of utext_next32from(), for performance-critical situations.
*
* Set the iteration index, access the text for forward iteration,
* and return the code point starting at or before that index.
* Leave the iteration index at the start of the following code point.
*
* @draft ICU 3.4
*/
#define UTEXT_NEXT32FROM(ut, index) \
((index) >= (ut)->chunk.start && \
(index) < (ut)->chunk.limit && \
!(ut)->chunk.nonUTF16Indexes && \
(ut)->chunk.contents[(ut)->chunk.offset=(ut)->chunk.start+(index)] < 0xd800 ? \
(ut)->chunk.contents[((ut)->chunk.offset)++] : utext_next32From(ut, index))
/**
* inline version of utext_previous32from(), for performance-critical situations.
*
* Set the iteration index, and return the code point preceding the
* one specified by the initial index. Leave the iteration position
* at the start of the returned code point.
*
* @draft ICU 3.4
*/
#define UTEXT_PREVIOUS32FROM(ut, index) \
((index) > (ut)->chunk.start && \
(index) <= (ut)->chunk.limit && \
!(ut)->chunk.nonUTF16Indexes && \
(ut)->chunk.contents[(ut)->chunk.offset=(ut)->chunk.start+(index-1)] < 0xd800 ? \
(ut)->chunk.contents[(ut)->chunk.offset] : utext_previous32From(ut, index))
/************************************************************************************
@ -587,7 +598,7 @@ utext_extract(UText *ut,
*
*/
U_DRAFT UBool U_EXPORT2
utext_isWriteable(const UText *ut);
utext_isWritable(const UText *ut);
/**
@ -609,7 +620,7 @@ utext_hasMetaData(const UText *ut);
* newly inserted replacement text.
*
* This function is only available on UText types that support writing,
* that is, ones where utext_isWriteable() returns TRUE.
* that is, ones where utext_isWritable() returns TRUE.
*
* When using this function, there should be only a single UText opened onto the
* underlying native text string. Behavior after a replace operation
@ -617,8 +628,8 @@ utext_hasMetaData(const UText *ut);
* modified string.
*
* @param ut the UText representing the text to be operated on.
* @param start the native index of the start of the region to be replaced
* @param limit the native index of the character following the region to be replaced.
* @param nativeStart the native index of the start of the region to be replaced
* @param nativeLimit the native index of the character following the region to be replaced.
* @param replacementText pointer to the replacement text
* @param replacementLength length of the replacement text, or -1 if the text is NUL terminated.
* @param status receives any error status. Possible errors include
@ -631,7 +642,7 @@ utext_hasMetaData(const UText *ut);
*/
U_DRAFT int32_t U_EXPORT2
utext_replace(UText *ut,
int32_t start, int32_t limit,
int32_t nativeStart, int32_t nativeLimit,
const UChar *replacementText, int32_t replacementLength,
UErrorCode *status);
@ -648,25 +659,25 @@ utext_replace(UText *ut,
* it does not replace or overwrite any existing text.
*
* This function is only available on UText types that support writing,
* that is, ones where utext_isWriteable() returns TRUE.
* that is, ones where utext_isWritable() returns TRUE.
*
* When using this function, there should be only a single UText opened onto the
* underlying native text string. Behavior after a copy operation
* on a UText is undefined in any other additional UTexts that refer to the
* modified string.
*
* @param ut The UText representing the text to be operated on.
* @param start The native index of the start of the region to be copied or moved
* @param limit The native index of the character following the region to be replaced.
* @param destIndex The native destination index to which the source substring is copied or moved.
* @param move If TRUE, then the substring is moved, not copied/duplicated.
* @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION
* @param ut The UText representing the text to be operated on.
* @param nativeStart The native index of the start of the region to be copied or moved
* @param nativeLimit The native index of the character following the region to be copied or moved.
* @param destIndex The native destination index to which the source substring is copied or moved.
* @param move If TRUE, then the substring is moved, not copied/duplicated.
* @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION
*
* @draft ICU 3.4
*/
U_DRAFT void U_EXPORT2
utext_copy(UText *ut,
int32_t start, int32_t limit,
int32_t nativeStart, int32_t nativeLimit,
int32_t destIndex,
UBool move,
UErrorCode *status);
@ -709,10 +720,10 @@ struct UTextChunk {
int32_t length;
/** (Native) text index corresponding to the start of the chunk. */
int32_t start;
int32_t nativeStart;
/** (Native) text index corresponding to the end of the chunk (contents+length). */
int32_t limit;
int32_t nativeLimit;
/** If TRUE, then non-UTF-16 indexes are used in this chunk. */
UBool nonUTF16Indexes;
@ -739,10 +750,10 @@ enum {
*/
UTEXT_PROVIDER_NON_UTF16_INDEXES,
/**
* The provider can return the text length inexpensively.
* It is potentially time consuming for the provider to determine the length of the text.
* @draft ICU 3.4
*/
UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE,
UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE,
/**
* Text chunks remain valid and usable until the text object is modified or
* deleted, not just until the next time the access() function is called
@ -799,18 +810,6 @@ enum {
typedef UText * U_CALLCONV
UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status);
/**
* Function type declaration for UText.GetProperties().
*
* Gets the provider properties for this UText.
*
* @param ut the UText to get properties for.
* @return Provider properties bit field.
*
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
UTextGetProperties(UText *ut);
/**
* Function type declaration for UText.length().
@ -821,7 +820,7 @@ UTextGetProperties(UText *ut);
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
UTextLength(UText *ut);
UTextNativeLength(UText *ut);
/**
* Function type declaration for UText.access(). Get the description of the text chunk
@ -830,22 +829,22 @@ UTextLength(UText *ut);
* of bounds, the iteration position will be left at the start or end
* of the string, as appropriate.
*
* @param ut the UText being accessed.
* @param index Requested (native) index of the text to be accessed.
* @param forward If TRUE, then the returned chunk must contain text
* starting from the index, so that start<=index<limit.
* If FALSE, then the returned chunk must contain text
* before the index, so that start<index<=limit.
* @return True if the requested index could be accessed. The chunk
* will contain the requested text.
* False value if a chunk cannot be accessed
* (the requested index is out of bounds).
* @param ut the UText being accessed.
* @param nativeIndex Requested index of the text to be accessed.
* @param forward If TRUE, then the returned chunk must contain text
* starting from the index, so that start<=index<limit.
* If FALSE, then the returned chunk must contain text
* before the index, so that start<index<=limit.
* @return True if the requested index could be accessed. The chunk
* will contain the requested text.
* False value if a chunk cannot be accessed
* (the requested index is out of bounds).
*
* @see UText
* @draft ICU 3.4
*/
typedef UBool U_CALLCONV
UTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk);
UTextAccess(UText *ut, int32_t nativeIndex, UBool forward, UTextChunk *chunk);
/**
* Function type declaration for UText.extract().
@ -860,23 +859,23 @@ UTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk);
* The extracted string will (if you are a user) / must (if you are a text provider)
* be NUL-terminated if there is sufficient space in the destination buffer.
*
* @param ut the UText from which to extract data.
* @param start the native index of the first characer to extract.
* @param limit the native string index of the position following the last
* character to extract.
* @param dest the UChar (utf-16) buffer into which the extracted text is placed
* @param ut the UText from which to extract data.
* @param nativeStart the native index of the first character to extract.
* @param nativeLimit the native string index of the position following the last
* character to extract.
* @param dest the UChar (utf-16) buffer into which the extracted text is placed
* @param destCapacity The size, in UChars, of the destination buffer. May be zero
* for precomputing the required size.
* @param status receives any error status.
* If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for
* preflighting.
* for precomputing the required size.
* @param status receives any error status.
* If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for
* preflighting.
* @return Number of UChars in the data. Does not include a trailing NUL.
*
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
UTextExtract(UText *ut,
int32_t start, int32_t limit,
int32_t nativeStart, int32_t nativeLimit,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
@ -897,10 +896,10 @@ UTextExtract(UText *ut,
* by the replace operation.
*
* @param ut the UText representing the text to be operated on.
* @param start the native index of the start of the region to be replaced
* @param limit the native index of the character following the region to be replaced.
* @param nativeStart the index of the start of the region to be replaced
* @param nativeLimit the index of the character following the region to be replaced.
* @param replacementText pointer to the replacement text
* @param replacmentLength length of the replacement text, or -1 if the text is NUL terminated.
* @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated.
* @param status receives any error status. Possible errors include
* U_NO_WRITE_PERMISSION
*
@ -911,8 +910,8 @@ UTextExtract(UText *ut,
*/
typedef int32_t U_CALLCONV
UTextReplace(UText *t,
int32_t start, int32_t limit,
const UChar *src, int32_t length,
int32_t nativeStart, int32_t nativeLimit,
const UChar *replacementText, int32_t replacmentLength,
UErrorCode *pErrorCode);
/**
@ -934,57 +933,58 @@ UTextReplace(UText *t,
* taking into account any changes to the underlying string's structure caused
* by the replace operation.
*
* @param ut The UText representing the text to be operated on.
* @param start The native index of the start of the region to be copied or moved
* @param limit The native index of the character following the region to be replaced.
* @param destIndex The native destination index to which the source substring is copied or moved.
* @param move If TRUE, then the substring is moved, not copied/duplicated.
* @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION
* @param ut The UText representing the text to be operated on.
* @param nativeStart The index of the start of the region to be copied or moved
* @param nativeLimit The index of the character following the region to be copied or moved.
* @param nativeDest The destination index to which the source substring is copied or moved.
* @param move If TRUE, then the substring is moved, not copied/duplicated.
* @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION
*
* @draft ICU 3.4
*/
typedef void U_CALLCONV
UTextCopy(UText *t,
int32_t start, int32_t limit,
int32_t destIndex,
int32_t nativeStart, int32_t nativeLimit,
int32_t nativeDest,
UBool move,
UErrorCode *pErrorCode);
/**
* Function type declaration for UText.mapOffsetToNative().
* Map from a UChar offset within the current text chunk within the UText to
* the corresponding native index in the original source text.
*
* This is required only for text providers that do not use native utf-16 indexes.
*
* TODO: specify behavior with out-of-bounds offset? Shouldn't ever occur.
*
* @param ut the UText.
* @param chunk The UTextChunk in which to perform a mapping.
* TODO: keep this as a separate parameter, or just imply that the function
* works on the chunk embedded in the UText?
* @param offset UTF-16 offset relative to the current text chunk embedded in the UText
* @param offset UTF-16 offset within text chunk
* 0<=offset<=chunk->length.
* @return Absolute (native) index corresponding to the UTF-16 offset
* relative to the current text chunk.
* @return Absolute (native) index corresponding to the specified chunk offset.
* The returned native index should always be to a code point boundary.
*
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
UTextMapOffsetToNative(UText *ut, UTextChunk *chunk, int32_t offset);
UTextMapOffsetToNative(UText *ut, int32_t offset);
/**
* Function type declaration for UText.mapIndexToUTF16().
* This is required only for text providers that do not use native utf-16 indexes.
* Map from a native index to a UChar offset within a text chunk
*
* @param ut The UText containing the text chunk.
* @param chunk the text chunk in which the mapping occurs.
* TODO: keep this as a separate parameter, or just imply that the function
* works on the chunk embedded in the UText?
* @param index Absolute (native) text index, chunk->start<=index<=chunk->limit.
* @return Chunk-relative UTF-16 offset corresponding to the absolute (native)
* index.
* This function is required only for text providers that do not use native utf-16 indexes.
*
* @see UText
* @param ut The UText containing the text chunk.
* @param nativeIndex Absolute (native) text index, chunk->nativeStart<=nativeIndex<=chunk->nativeLimit.
* @return Chunk-relative UTF-16 offset corresponding to the specified native
* index.
*
* TODO: specify behavior with out-of-bounds index? Shouldn't ever occur.
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
UTextMapIndexToUTF16(UText *ut, UTextChunk *chunk, int32_t index);
UTextMapIndexToUTF16(UText *ut, int32_t nativeIndex);
/**
@ -1077,6 +1077,15 @@ struct UText {
int32_t a, b, c;
/**
* Text provider properties. This set of flags is maintained by the
* text provider implementation.
* @draft ICU 3.4
*/
int32_t providerProperties;
/** descriptor for the text chunk that includes or is adjacent to
* the current iteration position.
* @draft ICU 3.4
@ -1084,14 +1093,6 @@ struct UText {
UTextChunk chunk;
/**
* Text provider properties
* @draft ICU 3.4
*/
int32_t providerProperties;
/**
* (public) Function pointer for UTextClone
*
@ -1100,14 +1101,6 @@ struct UText {
*/
UTextClone *clone;
/**
* (public) function pointer for UTextGetProperties
*
* @see UTextGetProperties
* @draft ICU 3.4
*/
UTextGetProperties *properties;
/**
* (public) function pointer for UTextLength
* May be expensive to compute!
@ -1115,7 +1108,7 @@ struct UText {
* @see UTextNativeLength
* @draft ICU 3.4
*/
UTextLength *length;
UTextNativeLength *length;
/**
* (public) Function pointer for UTextAccess.
@ -1224,16 +1217,16 @@ enum {
* @internal
*/
#define UTEXT_INITIALZIER_HEAD \
NULL, /* context */ \
NULL, NULL, NULL, /* p, q, r */ \
NULL, /* pExtra */ \
0, /* extraSize */ \
0, /* flags */ \
UTEXT_MAGIC, /* magic */ \
sizeof(UText), /* sizeOfStruct */ \
0, 0, 0, /* a, b, c */ \
UTEXT_CHUNK_INIT, /* UTextChunk */ \
-1 /* provderProps */
NULL, /* context */ \
NULL, NULL, NULL, /* p, q, r */ \
NULL, /* pExtra */ \
0, /* extraSize */ \
0, /* flags */ \
UTEXT_MAGIC, /* magic */ \
sizeof(UText), /* sizeOfStruct */ \
0, 0, 0, /* a, b, c */ \
0, /* providerProps */ \
UTEXT_CHUNK_INIT /* UTextChunk */
@ -1247,7 +1240,6 @@ enum {
#define UTEXT_INITIALIZER { \
UTEXT_INITIALZIER_HEAD, \
NULL, /* clone () */ \
NULL, /* properties ()*/ \
NULL, /* length () */ \
NULL, /* access () */ \
NULL, /* extract () */ \

View file

@ -21,6 +21,7 @@
#include "ustr_imp.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
@ -38,7 +39,7 @@ utext_moveIndex(UText *ut, int32_t delta) {
UBool retval = TRUE;
if(delta>0) {
do {
if(ut->chunk.offset>=ut->chunk.length && !utext_access(ut, ut->chunk.limit, TRUE)) {
if(ut->chunk.offset>=ut->chunk.length && !utext_access(ut, ut->chunk.nativeLimit, TRUE)) {
retval = FALSE;
break;
}
@ -46,7 +47,7 @@ utext_moveIndex(UText *ut, int32_t delta) {
} while(--delta>0);
} else if (delta<0) {
do {
if(ut->chunk.offset<=0 && !utext_access(ut, ut->chunk.start, FALSE)) {
if(ut->chunk.offset<=0 && !utext_access(ut, ut->chunk.nativeStart, FALSE)) {
retval = FALSE;
break;
}
@ -63,12 +64,20 @@ utext_length(UText *ut) {
return ut->length(ut);
}
// Report whether the provider has flagged length computation as potentially
// expensive. Only the UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE bit of the
// providerProperties field is consulted.
U_DRAFT UBool U_EXPORT2
utext_isLengthExpensive(const UText *ut) {
    const int32_t expensiveBit = I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
    if ((ut->providerProperties & expensiveBit) != 0) {
        return TRUE;
    }
    return FALSE;
}
U_DRAFT int32_t U_EXPORT2
utext_getIndex(UText *ut) {
if(!ut->chunk.nonUTF16Indexes || ut->chunk.offset==0) {
return ut->chunk.start+ut->chunk.offset;
return ut->chunk.nativeStart+ut->chunk.offset;
} else {
return ut->mapOffsetToNative(ut, &ut->chunk, ut->chunk.offset);
return ut->mapOffsetToNative(ut, ut->chunk.offset);
}
}
@ -76,23 +85,23 @@ utext_getIndex(UText *ut) {
U_DRAFT void U_EXPORT2
utext_setIndex(UText *ut, int32_t index) {
// TODO - revise for keeping index always valid.
if(index<ut->chunk.start || ut->chunk.limit<index) {
// The desired position is outside of the current chunk. Invalidate it and
// leave it to next32() or previous32() to access the text
// in the desired direction.
if(index<ut->chunk.nativeStart || ut->chunk.nativeLimit<index) {
// The desired position is outside of the current chunk.
// Access the new position. Assume a forward iteration from here,
// which will also be optimum for a single random access.
// Reverse iterations may suffer slightly.
ut->access(ut, index, TRUE, &ut->chunk);
} else if(ut->chunk.nonUTF16Indexes) {
ut->chunk.offset=ut->mapIndexToUTF16(ut, &ut->chunk, index);
ut->chunk.offset=ut->mapIndexToUTF16(ut, index);
} else {
ut->chunk.offset=index-ut->chunk.start;
ut->chunk.offset=index-ut->chunk.nativeStart;
// Our convention is that the index must always be on a code point boundary.
// If we are somewhere in the middle of a utf-16 buffer, check that new index
// is not in the middle of a surrogate pair.
if (index>ut->chunk.start && index < ut->chunk.limit) { // TODO: clean up end-of-chunk / end of input handling. Everywhere.
if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
UChar c = ut->chunk.contents[ut->chunk.offset];
// U16_IS_TRAIL() is the predicate; U16_TRAIL() builds a trail surrogate and is never zero.
if (U16_IS_TRAIL(c)) {
utext_current(ut); // force index onto a code point boundary.
utext_current(ut); // force index to the start of the curent code point.
}
}
}
@ -123,6 +132,18 @@ utext_current(UText *ut) {
return c;
}
//
// utext_char32At
//     Move the iteration position to nativeIndex and return the code point
//     found there, or U_SENTINEL when the index is negative or not below the
//     native limit of the chunk that utext_setIndex() left current.
//
//     The chunk holds UTF-16 code units, so a supplementary character occupies
//     two of them.  A single code unit is returned directly; when it is a
//     surrogate the work is handed to utext_current(), which utext_setIndex()
//     also uses to snap onto a code point boundary.
//     NOTE(review): utext_current() is not visible in this part of the file;
//     confirm that it returns the full supplementary code point.
//
U_DRAFT UChar32 U_EXPORT2
utext_char32At(UText *ut, int32_t nativeIndex) {
    UChar32 c = U_SENTINEL;
    utext_setIndex(ut, nativeIndex);
    if (nativeIndex >= 0 && nativeIndex < ut->chunk.nativeLimit) {
        c = ut->chunk.contents[ut->chunk.offset];
        if (U16_IS_SURROGATE(c)) {
            // Half of a surrogate pair is not a character; let utext_current()
            // assemble the pair instead of returning the bare code unit.
            c = utext_current(ut);
        }
    }
    return c;
}
U_DRAFT UChar32 U_EXPORT2
utext_next32(UText *ut) {
UTextChunk *chunk = &ut->chunk;
@ -130,7 +151,7 @@ utext_next32(UText *ut) {
UChar32 c = U_SENTINEL;
if (offset >= chunk->length) {
if (ut->access(ut, chunk->limit, TRUE, chunk) == FALSE) {
if (ut->access(ut, chunk->nativeLimit, TRUE, chunk) == FALSE) {
goto next32_return;
}
offset = chunk->offset;
@ -160,7 +181,7 @@ utext_previous32(UText *ut) {
UChar32 c = U_SENTINEL;
if (offset <= 0) {
if (ut->access(ut, chunk->start, FALSE, chunk) == FALSE) {
if (ut->access(ut, chunk->nativeStart, FALSE, chunk) == FALSE) {
goto prev32_return;
}
offset = chunk->offset;
@ -186,16 +207,16 @@ utext_next32From(UText *ut, int32_t index) {
UTextChunk *chunk = &ut->chunk;
UChar32 c = U_SENTINEL;
if(index<chunk->start || index>=chunk->limit) {
if(index<chunk->nativeStart || index>=chunk->nativeLimit) {
if(!ut->access(ut, index, TRUE, chunk)) {
// no chunk available here
goto next32return;
}
offset = chunk->offset;
} else if(chunk->nonUTF16Indexes) {
offset=ut->mapIndexToUTF16(ut, chunk, index);
offset=ut->mapIndexToUTF16(ut, index);
} else {
offset = index - chunk->start;
offset = index - chunk->nativeStart;
}
c = chunk->contents[offset++];
@ -220,16 +241,16 @@ utext_previous32From(UText *ut, int32_t index) {
UTextChunk *chunk = &ut->chunk;
UChar32 c = U_SENTINEL;
if(index<=chunk->start || index>chunk->limit) {
if(index<=chunk->nativeStart || index>chunk->nativeLimit) {
if(!ut->access(ut, index, FALSE, chunk)) {
// no chunk available here
goto prev32return;
}
offset = chunk->offset;
} else if(chunk->nonUTF16Indexes) {
offset=ut->mapIndexToUTF16(ut, chunk, index);
offset=ut->mapIndexToUTF16(ut, index);
} else {
offset = index - chunk->start;
offset = index - chunk->nativeStart;
}
offset--;
@ -253,6 +274,66 @@ utext_extract(UText *ut,
}
// Report whether the text provider advertises the UTEXT_PROVIDER_WRITABLE
// property bit.
U_DRAFT UBool U_EXPORT2
utext_isWriteble(const UText *ut)
{
    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
    return (ut->providerProperties & writableBit) != 0;
}
// Report whether the text provider advertises the UTEXT_PROVIDER_HAS_META_DATA
// property bit.
U_DRAFT UBool U_EXPORT2
utext_hasMetaData(const UText *ut)
{
    const int32_t metaDataBit = I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
    return (ut->providerProperties & metaDataBit) != 0;
}
// Replace the native range [nativeStart, nativeLimit) with replacementText by
// forwarding to the provider's replace function.
//
// Returns 0 without calling the provider when *status already holds a failure,
// or when the provider is not writable (in which case *status is set to
// U_NO_WRITE_PERMISSION).  Otherwise returns whatever the provider returns.
U_DRAFT int32_t U_EXPORT2
utext_replace(UText *ut,
              int32_t nativeStart, int32_t nativeLimit,
              const UChar *replacementText, int32_t replacementLength,
              UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return 0;
    }

    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
    if ((ut->providerProperties & writableBit) == 0) {
        *status = U_NO_WRITE_PERMISSION;
        return 0;
    }

    return ut->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
}
// Copy (or, when move is set, move) the native range [nativeStart, nativeLimit)
// to destIndex by forwarding to the provider's copy function.
//
// Does nothing when *status already holds a failure.  Sets *status to
// U_NO_WRITE_PERMISSION, without calling the provider, when the provider is
// not writable.
U_DRAFT void U_EXPORT2
utext_copy(UText *ut,
           int32_t nativeStart, int32_t nativeLimit,
           int32_t destIndex,
           UBool move,
           UErrorCode *status)
{
    if (U_FAILURE(*status)) {
        return;
    }

    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
    if ((ut->providerProperties & writableBit) != 0) {
        ut->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
    } else {
        *status = U_NO_WRITE_PERMISSION;
    }
}
// Clone src by dispatching to the clone function installed by src's provider.
// All arguments, including status, are passed through untouched; any error
// checking is left to the provider.
U_DRAFT UText * U_EXPORT2
utext_clone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
    UText *result = src->clone(dest, src, deep, status);
    return result;
}
U_DRAFT UBool U_EXPORT2
utext_compare(UText *ut, const UChar *s, int32_t length, UBool codePointOrder) {
int32_t segLength, result;
@ -285,7 +366,7 @@ utext_compare(UText *ut, const UChar *s, int32_t length, UBool codePointOrder) {
return 0;
}
if(!ut->access(ut, ut->chunk.limit, TRUE, &ut->chunk)) {
if(!ut->access(ut, ut->chunk.nativeLimit, TRUE, &ut->chunk)) {
// the text ends before the string does
return -1;
}
@ -321,9 +402,10 @@ enum {
//
// Extended form of a UText. The purpose is to aid in computing the total size required
// when a provider asks for a UText to be allocated with extra storage.
//
struct ExtendedUText: public UText {
void *extension;
struct ExtendedUText {
UText ut;
UAlignedMemory extension;
};
static const UText emptyText = UTEXT_INITIALIZER;
@ -338,14 +420,18 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
// We need to heap-allocate storage for the new UText
int32_t spaceRequired = sizeof(UText);
if (extraSpace > 0) {
spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(void *);
spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
}
ut = (UText *)uprv_malloc(spaceRequired);
*ut = emptyText;
ut->flags |= UTEXT_HEAP_ALLOCATED;
if (spaceRequired>0) {
ut->extraSize = spaceRequired;
ut->pExtra = &((ExtendedUText *)ut)->extension;
if (ut == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
} else {
*ut = emptyText;
ut->flags |= UTEXT_HEAP_ALLOCATED;
if (spaceRequired>0) {
ut->extraSize = spaceRequired;
ut->pExtra = &((ExtendedUText *)ut)->extension;
}
}
} else {
// We have been supplied with an already existing UText.
@ -378,6 +464,9 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
}
}
}
if (U_SUCCESS(*status)) {
ut->flags |= UTEXT_OPEN;
}
return ut;
}
@ -429,15 +518,15 @@ utext_close(UText *ut) {
//
static void
resetChunk(UTextChunk *chunk, int32_t index) {
if (index==chunk->limit) {
if (index==chunk->nativeLimit) {
chunk->offset = chunk->length;
} else if (index==chunk->start) {
} else if (index==chunk->nativeStart) {
chunk->offset = 0;
} else {
chunk->length = 0;
chunk->start = index;
chunk->limit = index;
chunk->offset = 0;
chunk->length = 0;
chunk->nativeStart = index;
chunk->nativeLimit = index;
chunk->offset = 0;
}
}
@ -452,16 +541,52 @@ resetChunk(UTextChunk *chunk, int32_t index) {
U_CDECL_BEGIN
static UText * U_CALLCONV
noopTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErrorCode * /*status*/) {
return NULL; // not supported
//
// Clone. This is a generic copy-the-utext-by-value clone function that can be
// used as-is with some utext types, and as helper by other clones.
//
noopTextClone(UText * dest, const UText * src, UBool deep, UErrorCode * status) {
if (U_FAILURE(*status)) {
return NULL;
}
int32_t srcExtraSize = src->extraSize;
//
// Use the generic text_setup to allocate storage if required.
//
dest = utext_setup(dest, srcExtraSize, status);
if (U_FAILURE(*status)) {
return dest;
}
//
// flags (how the UText was allocated) and the pointer to the
// extra storage must retain the values in the cloned utext that
// were set up by utext_setup. Save them separately before
// copying the whole struct.
//
void *destExtra = dest->pExtra;
int32_t flags = dest->flags;
//
// Copy the whole UText struct by value.
// Any "Extra" storage is copied also.
//
int sizeToCopy = src->sizeOfStruct;
if (sizeToCopy > dest->sizeOfStruct) {
sizeToCopy = dest->sizeOfStruct;
}
uprv_memcpy(dest, src, sizeToCopy);
dest->pExtra = destExtra;
dest->flags = flags;
if (srcExtraSize > 0) {
uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
}
return dest;
}
// Property bits reported by the no-op text provider: length is inexpensive
// and chunks are stable.
static int32_t U_CALLCONV
noopTextGetProperties(UText * /*t*/) {
    int32_t props = I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE);
    props |= I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
    return props;
}
static int32_t U_CALLCONV
noopTextLength(UText * /* t */) {
@ -483,12 +608,12 @@ noopTextExtract(UText * /* t */,
}
static int32_t U_CALLCONV
noopTextMapOffsetToNative(UText * /* t */, UTextChunk * /* chunk */, int32_t /* offset */) {
noopTextMapOffsetToNative(UText * /* t */, int32_t /* offset */) {
return 0;
}
static int32_t U_CALLCONV
noopTextMapIndexToUTF16(UText * /* t */, UTextChunk * /* chunk */, int32_t /* index */) {
noopTextMapIndexToUTF16(UText * /* t */, int32_t /* index */) {
return 0;
}
@ -498,7 +623,6 @@ U_CDECL_END
static const UText noopText={
UTEXT_INITIALZIER_HEAD,
noopTextClone,
noopTextGetProperties,
noopTextLength,
noopTextAccess,
noopTextExtract,
@ -550,14 +674,6 @@ struct UTF8Extra {
U_CDECL_BEGIN
// Property bits reported by the UTF-8 text provider: native indexes are not
// UTF-16 indexes, and length is inexpensive.
static int32_t U_CALLCONV
utf8TextGetProperties(UText * /*t*/) {
    // UTEXT_PROVIDER_STABLE_CHUNKS is deliberately left out: chunk-related data
    // is kept in UTF8Text, so only one chunk at a time can be active.
    int32_t props = I32_FLAG(UTEXT_PROVIDER_NON_UTF16_INDEXES);
    props |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE);
    return props;
}
static int32_t U_CALLCONV
utf8TextLength(UText *ut) {
@ -590,7 +706,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
return FALSE;
}
chunk->start=index;
chunk->nativeStart=index;
c=s8[index];
if(c<=0x7f) {
// get a run of ASCII characters.
@ -621,11 +737,11 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
map[i]=index;
chunk->nonUTF16Indexes=TRUE;
}
chunk->contents = u16buf;
chunk->length = i;
chunk->limit = index;
ut->q = map;
chunk->offset = 0; // chunkOffset corresponding to index
chunk->contents = u16buf;
chunk->length = i;
chunk->nativeLimit = index;
ut->q = map;
chunk->offset = 0; // chunkOffset corresponding to index
return TRUE;
} else {
// Reverse Access. The chunk buffer must be filled so as to contain the
@ -635,7 +751,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
return FALSE;
}
chunk->limit=index;
chunk->nativeLimit=index;
c=s8[index-1];
if(c<=0x7f) {
// get a chunk of ASCII characters. Don't build the index map
@ -684,10 +800,10 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
chunk->nonUTF16Indexes=TRUE;
}
// Common reverse iteration, for both UTF16 and non-UTF16 indexes.
chunk->contents = u16buf+i;
chunk->length = (UTF8_TEXT_CHUNK_SIZE)-i;
chunk->start = index;
chunk->offset = chunk->length; // chunkOffset corresponding to index
chunk->contents = u16buf+i;
chunk->length = (UTF8_TEXT_CHUNK_SIZE)-i;
chunk->nativeStart = index;
chunk->offset = chunk->length; // chunkOffset corresponding to index
return TRUE;
}
}
@ -717,18 +833,20 @@ utf8TextExtract(UText *ut,
// Assume nonUTF16Indexes and 0<=offset<=chunk->length
static int32_t U_CALLCONV
utf8TextMapOffsetToNative(UText *ut, UTextChunk * /* chunk */, int32_t offset) {
utf8TextMapOffsetToNative(UText *ut, int32_t offset) {
// UText.q points to the index mapping array that is allocated in the extra storage area.
U_ASSERT(offset>=0 && offset<=ut->chunk.length);
int32_t *map=(int32_t *)(ut->q);
return map[offset];
}
// Assume nonUTF16Indexes and chunk->nativeStart<=index<=chunk->nativeLimit
static int32_t U_CALLCONV
utf8TextMapIndexToUTF16(UText *ut, UTextChunk * /*chunk */, int32_t index) {
utf8TextMapIndexToUTF16(UText *ut, int32_t index) {
int32_t *map=(int32_t *)(ut->q);
int32_t offset=0;
U_ASSERT(index>=ut->chunk.nativeStart && index<=ut->chunk.nativeLimit);
while(index>map[offset]) {
++offset;
}
@ -752,9 +870,9 @@ utext_openUTF8(UText *ut, const uint8_t *s, int32_t length, UErrorCode *status)
if (U_FAILURE(*status)) {
return ut;
}
ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_NON_UTF16_INDEXES);
ut->clone = noopTextClone;
ut->properties = utf8TextGetProperties;
ut->length = utf8TextLength;
ut->access = utf8TextAccess;
ut->extract = utf8TextExtract;
@ -777,190 +895,6 @@ U_CDECL_END
//------------------------------------------------------------------------------
//
// UText implementation for SBCS strings (read-only)
//
// Use of UText data members:
// context pointer to SBCS string
//
//------------------------------------------------------------------------------
enum { SBCS_TEXT_CHUNK_SIZE=10 };
// UText extended with the state needed by the SBCS (single-byte character set)
// provider.  The SBCS string itself is reached through the inherited context
// pointer; each byte is translated to a UChar through the toU table.
struct SBCSText : public UText {
/* pointer to SBCS-to-BMP mapping table, indexed by byte value */
const UChar *toU;
/* length of the SBCS string (in bytes) */
int32_t length;
/* chunk UChars: buffer holding the converted text of the current chunk */
UChar s[SBCS_TEXT_CHUNK_SIZE];
};
U_CDECL_BEGIN
// Property bits reported by the SBCS text provider: length is inexpensive.
static int32_t U_CALLCONV
sbcsTextGetProperties(UText * /*t*/) {
    // UTEXT_PROVIDER_STABLE_CHUNKS is deliberately left out: chunk-related data
    // is kept in SBCSText, so only one chunk at a time can be active.
    const int32_t props = I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE);
    return props;
}
// Length of the SBCS text, as recorded in the SBCSText when it was opened or
// reset.
static int32_t U_CALLCONV
sbcsTextLength(UText *t) {
    SBCSText *ts = (SBCSText *)t;
    return ts->length;
}
//
// sbcsTextAccess
//     Fill the chunk with up to SBCS_TEXT_CHUNK_SIZE UChars converted from the
//     SBCS string, positioned for iteration from index.
//
//     forward == TRUE:  the chunk starts at index; chunk->offset is 0.
//     forward == FALSE: the chunk ends at index;   chunk->offset is the chunk length.
//
//     Returns FALSE, after resetting the chunk to the relevant end of the text,
//     when there is no text in the requested direction; TRUE otherwise.
//
//     The index is pinned to 0..length before use.  Without the pinning a
//     negative index in the forward direction, or an index beyond the length
//     in the reverse direction, would read bytes outside the string.
//
static UBool U_CALLCONV
sbcsTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
    SBCSText *ts=(SBCSText *)ut;
    const uint8_t *s8=(const uint8_t *)ts->context;
    int32_t length=ts->length;
    int32_t i, count;

    // Out-of-bounds indexes are not an error; they are adjusted into range.
    if(index<0) {
        index=0;
    } else if(index>length) {
        index=length;
    }

    chunk->nonUTF16Indexes=FALSE;
    if(forward) {
        if(length<=index) {
            // Already at the end of the text, nothing to deliver.
            resetChunk(chunk, length);
            return FALSE;
        }
        count=length-index;
        if(count>SBCS_TEXT_CHUNK_SIZE) {
            count=SBCS_TEXT_CHUNK_SIZE;
        }
        chunk->nativeStart=index;
        for(i=0; i<count; ++index, ++i) {
            ts->s[i]=ts->toU[s8[index]];
        }
        chunk->contents=ts->s;
        chunk->length=i;
        chunk->nativeLimit=index;
        chunk->offset=0;         // chunkOffset corresponding to index
        return TRUE;
    } else {
        if(index<=0) {
            // Already at the start of the text, nothing to deliver.
            resetChunk(chunk, 0);
            return FALSE;
        }
        if(index<=SBCS_TEXT_CHUNK_SIZE) {
            count=index;
        } else {
            count=SBCS_TEXT_CHUNK_SIZE;
        }
        chunk->nativeLimit=index;
        // Convert backwards so that the buffer ends up in text order.
        for(i=count; i>0;) {
            ts->s[--i]=ts->toU[s8[--index]];
        }
        chunk->contents=ts->s;
        chunk->length=count;
        chunk->nativeStart=index;
        chunk->offset=count;     // chunkOffset corresponding to index
        return TRUE;
    }
}
//
// sbcsTextExtract
//     Convert the bytes in [start, limit) to UChars through the toU table and
//     store as many as fit into dest.
//
//     Returns the full number of UChars in the requested range, even when it
//     does not fit, so that callers can preflight with a zero capacity;
//     u_terminateUChars() then reports overflow / non-termination in
//     *pErrorCode.
//
//     Errors: U_ILLEGAL_ARGUMENT_ERROR for a bad destination buffer,
//             U_INDEX_OUTOFBOUNDS_ERROR for a range outside 0..length.
//             Both return 0 without touching dest.
//
static int32_t U_CALLCONV
sbcsTextExtract(UText *t,
                int32_t start, int32_t limit,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    SBCSText *ts=(SBCSText *)t;
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;       // do not go on to write through a bad destination
    }
    if(start<0 || start>limit || ts->length<limit) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    const uint8_t *s8=(const uint8_t *)ts->context+start;
    int32_t fullLength=limit-start;

    // Copy only what fits, but keep reporting the full length.
    int32_t copyLength=fullLength;
    if(copyLength>destCapacity) {
        copyLength=destCapacity;
    }
    UChar *d=dest;
    const UChar *destLimit=dest+copyLength;
    while(d<destLimit) {
        *d++=ts->toU[*s8++];
    }
    return u_terminateUChars(dest, destCapacity, fullLength, pErrorCode);
}
// Template UText for the SBCS provider.  utext_openSBCS() copies it by value
// into each newly allocated SBCSText.  The provider is read-only and uses
// UTF-16-compatible indexes, so the edit and index-mapping slots are NULL.
static const UText sbcsText={
UTEXT_INITIALZIER_HEAD,
noopTextClone,
sbcsTextGetProperties,
sbcsTextLength,
sbcsTextAccess,
sbcsTextExtract,
NULL, // replace
NULL, // copy
NULL, // mapOffsetToNative
NULL, // mapIndexToUTF16
NULL // close
};
//
// utext_openSBCS
//     Open a heap-allocated, read-only UText over an SBCS string.
//
//     toU     256-entry table mapping each byte value to a UChar.  The table
//             is referenced, not copied.
//     s       the SBCS string.  It is referenced, not copied.
//     length  length of s in bytes, or -1 if s is NUL-terminated.
//
//     The first parameter (an existing UText to reuse) is ignored; a new
//     SBCSText is always allocated.
//
//     Returns NULL and sets *pErrorCode on bad arguments
//     (U_ILLEGAL_ARGUMENT_ERROR) or allocation failure
//     (U_MEMORY_ALLOCATION_ERROR).
//
U_DRAFT UText * U_EXPORT2
utext_openSBCS(UText * /*ut */,
               const UChar toU[256],
               const char *s, int32_t length,
               UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    if(toU==NULL || s==NULL || length<-1) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    SBCSText *ts=(SBCSText *)uprv_malloc(sizeof(SBCSText));
    if(ts==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    *((UText *)ts)=sbcsText;
    // The access and extract functions convert every byte through ts->toU, so
    // the table must be recorded here; it was previously left uninitialized.
    ts->toU=toU;
    ts->context=s;
    if(length>=0) {
        ts->length=length;
    } else {
        ts->length=(int32_t)uprv_strlen(s);
    }
    return ts;
}
// Release an SBCSText.  A NULL argument is ignored.
U_DRAFT void U_EXPORT2
utext_closeSBCS(UText *t) {
    if(t==NULL) {
        return;
    }
    SBCSText *ts=(SBCSText *)t;
    uprv_free(ts);
}
// Point an existing SBCSText at a different SBCS string.
//
//     s       the new string; referenced, not copied.
//     length  length of s in bytes, or -1 if s is NUL-terminated.
//
// Does nothing if *pErrorCode already holds a failure.  Sets
// U_ILLEGAL_ARGUMENT_ERROR and leaves the text unchanged when s is NULL or
// length is below -1.  Only context and length are updated here.
U_DRAFT void U_EXPORT2
utext_resetSBCS(UText *t, const char *s, int32_t length, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    const UBool badArgs = (s==NULL || length<-1);
    if(badArgs) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    SBCSText *ts=(SBCSText *)t;
    ts->context=s;
    ts->length = (length<0) ? (int32_t)uprv_strlen(s) : length;
}
U_CDECL_END
/* UText implementation wrapper for Replaceable (read/write) ---------------- */
@ -1283,7 +1217,6 @@ unistrTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErro
static int32_t U_CALLCONV
unistrTextGetProperties(UText * /*t*/) {
return
I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE)|
I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
I32_FLAG(UTEXT_PROVIDER_WRITABLE);
}
@ -1299,13 +1232,13 @@ unistrTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
const UnicodeString *us = (const UnicodeString *)ut->context;
int32_t length = us->length();
if (chunk->limit != length) {
if (chunk->nativeLimit != length) {
// This chunk is not yet set up. Do it now.
chunk->contents=us->getBuffer();
chunk->length=length;
chunk->start=0;
chunk->limit=length;
chunk->nonUTF16Indexes=FALSE;
chunk->contents = us->getBuffer();
chunk->length = length;
chunk->nativeStart = 0;
chunk->nativeLimit = length;
chunk->nonUTF16Indexes = FALSE;
}
// pin the requested index to the bounds of the string,
@ -1423,7 +1356,6 @@ utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
ut = utext_setup(ut, 0, status);
if (U_SUCCESS(*status)) {
ut->clone = unistrTextClone;
ut->properties = unistrTextGetProperties;
ut->length = unistrTextLength;
ut->access = unistrTextAccess;
ut->extract = unistrTextExtract;
@ -1431,6 +1363,205 @@ utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
ut->copy = unistrTextCopy;
ut->context = s;
ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
I32_FLAG(UTEXT_PROVIDER_WRITABLE);
}
return ut;
}
//------------------------------------------------------------------------------
//
// UText implementation for const UChar * strings
//
// Use of UText data members:
// context pointer to the const UChar string
// a length. -1 if not yet known.
//
//------------------------------------------------------------------------------
U_CDECL_BEGIN
// Clone function for UTexts over const UChar * strings.
// Not implemented yet: every argument is ignored, *status is left untouched,
// and NULL is always returned.
static UText * U_CALLCONV
ucstrTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErrorCode * /*status*/) {
// TODO: fix this.  Cloning is unimplemented, so callers always get NULL.
return NULL;
}
//
// ucstrTextLength
//     Length of the UChar string, in UChars.
//
//     UText.a caches the length, with a negative value meaning "NUL-terminated,
//     length not yet known".  In that case the string is scanned for its
//     terminator, starting from the portion already covered by the chunk, and
//     the result is recorded in a, in the chunk limit and length, and by
//     clearing the LENGTH_IS_EXPENSIVE provider property.
//
static int32_t U_CALLCONV
ucstrTextLength(UText *ut) {
    if (ut->a >= 0) {
        // Length is already known.
        return ut->a;
    }

    // The access function is not used for the scan because the current
    // iteration position must not be changed here.
    const UChar *str = (const UChar *)ut->context;
    int32_t end = ut->chunk.nativeLimit;
    while (str[end] != 0) {
        ++end;
    }

    ut->chunk.nativeLimit = end;
    ut->a                 = end;
    ut->chunk.length      = end;
    ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
    return ut->a;
}
//
// ucstrTextAccess
//     Position the (single, whole-string) chunk of a UChar-string UText at index.
//
//     The chunk always begins at native index 0 and grows as a NUL-terminated
//     string of unknown length (UText.a < 0) is scanned, so chunk length and
//     native limit are always equal.  The index is pinned to 0..length.
//
//     Returns TRUE when there is text in the requested direction from the
//     (pinned) index: before the chunk limit for forward, after 0 for reverse.
//
static UBool U_CALLCONV
ucstrTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
    const UChar *str = (const UChar *)ut->context;

    // pin the requested index to the bounds of the string,
    //  and set current iteration position.
    if (index<0) {
        index = 0;
    } else if (index < ut->chunk.nativeLimit) {
        // The requested data is within the chunk as it is known so far.
        // There is nothing more that needs to be done within this access function.
    } else if (ut->a >= 0) {
        // We know the length of this string, and the user is requesting something
        // at or beyond the length.  Trim the requested index to the length.
        index = ut->a;
    } else {
        // Null terminated string, length not yet known.
        // Scan down another 32 UChars or to the requested index, whichever is further.
        // Both limits are computed without overflowing int32_t.
        int32_t scanLimit = (ut->chunk.nativeLimit <= INT32_MAX-32) ?
                                ut->chunk.nativeLimit + 32 : INT32_MAX;
        if (scanLimit <= index) {
            scanLimit = (index < INT32_MAX) ? index+1 : INT32_MAX;
        }

        UBool foundEnd = FALSE;
        for (; ut->chunk.nativeLimit<scanLimit; ut->chunk.nativeLimit++) {
            if (str[ut->chunk.nativeLimit] == 0) {
                // We found the end of the string.  Remember it, trim the index to it,
                //  and stop scanning.
                ut->a            = ut->chunk.nativeLimit;
                ut->chunk.length = ut->chunk.nativeLimit;
                if (index > ut->chunk.nativeLimit) {
                    index = ut->chunk.nativeLimit;
                }
                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
                foundEnd = TRUE;
                break;
            }
        }

        if (!foundEnd) {
            // We scanned through the next batch of UChars without finding the end.
            // The endpoint of a chunk must not be left in the middle of a surrogate pair.
            // If the current end is on a lead surrogate, back the end up by one.
            // It doesn't matter if the end char happens to be an unpaired surrogate,
            //  and it's simpler not to worry about it.
            if (U16_IS_LEAD(str[ut->chunk.nativeLimit-1])) {
                --ut->chunk.nativeLimit;
            }
            // The chunk has grown; its length must follow the native limit, or
            // iteration would consider the newly scanned text to be outside the chunk.
            // The total length is still unknown, which remains flagged by a < 0.
            ut->chunk.length = ut->chunk.nativeLimit;
            if (index > ut->chunk.nativeLimit) {
                index = ut->chunk.nativeLimit;
            }
        }
    }

    chunk->offset = index;

    // Check whether request is at the start or end
    UBool retVal = (forward && index<ut->chunk.nativeLimit) || (!forward && index>0);
    return retVal;
}
//
// ucstrTextExtract
//     Copy the UChars in the native range [start, limit) into dest.
//
//     limit (and start) are trimmed to the end of the string; for a
//     NUL-terminated string of unknown length the end is discovered while
//     copying and is then recorded in the UText.
//
//     Returns the number of UChars in the (trimmed) range, which may exceed
//     destCapacity; u_terminateUChars() reports overflow / non-termination
//     in *pErrorCode, so a call with zero capacity can be used to preflight.
//
//     Errors: U_ILLEGAL_ARGUMENT_ERROR for a bad destination buffer,
//             U_INDEX_OUTOFBOUNDS_ERROR if start<0 or start>limit.
//             Both return 0 and leave the UText untouched.
//
//     Side effect: when the string length is unknown, the access function is
//     used to scan ahead, which moves the iteration position to start.
//
static int32_t U_CALLCONV
ucstrTextExtract(UText *ut,
                 int32_t start, int32_t limit,
                 UChar *dest, int32_t destCapacity,
                 UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // Validate the range before doing anything that changes the UText.
    if(start<0 || start>limit) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    const UChar *s=(const UChar *)ut->context;

    // If text is null terminated and we haven't yet scanned down as far as the
    // requested range, do it now.  Afterwards either the end of the string is
    // known, or the string is known to extend at least up to start.
    if (ut->a<0 && limit>=ut->chunk.nativeLimit) {
        ucstrTextAccess(ut, start, TRUE, &ut->chunk);
    }

    // Read the length only now: the scan above may just have discovered it.
    int32_t strLength=ut->a;
    if (strLength >= 0) {
        // String length is known.  Trim the requested range to be within it, so
        // that nothing beyond the end of the string is ever read.
        if (limit > strLength) {
            limit = strLength;
        }
        if (start > limit) {
            start = limit;
        }
    }

    int32_t si;
    int32_t di = 0;
    for (si=start; si<limit; si++) {
        if (strLength<0 && s[si]==0) {
            // Just hit the end of a null-terminated string.
            ut->a = si;               // set string length for this UText
            ut->chunk.nativeLimit = si;
            ut->chunk.length      = si;
            ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
            break;
        }
        if (di<destCapacity) {
            // only store if there is space.
            dest[di] = s[si];
        } else if (strLength>=0) {
            // We have filled the destination buffer, and the string length is known.
            // Cut the loop short; the required length is simply the size of the range.
            di = limit-start;
            break;
        }
        di++;
    }

    u_terminateUChars(dest, destCapacity, di, pErrorCode);
    return di;
}
U_CDECL_END
//
// utext_openUChars
//     Open a read-only UText over a const UChar string.
//
//     ut      an existing UText to (re)use, or NULL; handed to utext_setup().
//     s       the string.  It is referenced, not copied.
//     length  length of s in UChars, or -1 if s is NUL-terminated.
//
//     Returns NULL with U_ILLEGAL_ARGUMENT_ERROR when length < -1, or when s is
//     NULL for anything other than an empty (length 0) string.  Returns NULL
//     when *status already holds a failure.  Otherwise returns the result
//     of utext_setup().
//
//     The whole string is presented as a single chunk starting at native index 0.
//     For a NUL-terminated string the chunk starts out empty and is extended
//     by the access function as the string is scanned.
//
U_DRAFT UText * U_EXPORT2
utext_openUChars(UText *ut, const UChar *s, int32_t length, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (length < -1 || (s == NULL && length != 0)) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    ut = utext_setup(ut, 0, status);
    if (U_SUCCESS(*status)) {
        ut->clone      = noopTextClone;
        ut->length     = ucstrTextLength;
        ut->access     = ucstrTextAccess;
        ut->extract    = ucstrTextExtract;
        ut->replace    = NULL;
        ut->copy       = NULL;

        ut->context    = s;
        ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
        if (length==-1) {
            ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
        }
        ut->a          = length;

        ut->chunk.contents        = s;
        ut->chunk.nativeStart     = 0;
        ut->chunk.nativeLimit     = length>=0? length : 0;
        // The chunk length must match the native limit; the length and access
        // functions keep the two equal from here on.  Left unset, a string of
        // known length would look like an empty chunk to the iteration functions.
        ut->chunk.length          = ut->chunk.nativeLimit;
        ut->chunk.offset          = 0;
        ut->chunk.nonUTF16Indexes = FALSE;
    }
    return ut;
}

View file

@ -15,8 +15,10 @@
#include "unicode/utypes.h"
#include "unicode/utext.h"
#include "unicode/ustring.h"
#include "cintltst.h"
#include "memory.h"
#include "string.h"
static void TestAPI(void);
@ -45,7 +47,7 @@ addUTextTest(TestNode** root)
/*
* TestAPI verify that the UText API is accessible from C programs.
* This is not intended to be a complete test of the API functionality. That is
* in the C++ intltest program.
* in the C++ intltest program.
* This test is intended to check that everything can be accessed and built in
* a pure C environment.
*/
@ -55,19 +57,193 @@ static void TestAPI(void) {
UErrorCode status = U_ZERO_ERROR;
UBool gFailed = FALSE;
UText utLoc = UTEXT_INITIALIZER;
const char * cString = "Hello, World";
UChar uString[] = {0x41, 0x42, 0x43, 0};
uint8_t *utf8String;
UText *uta;
UText *utb;
// Open
{
UText utLoc = UTEXT_INITIALIZER;
const char * cString = "Hello, World";
UChar uString[] = {0x41, 0x42, 0x43, 0};
uint8_t *utf8String;
UText *uta;
UText *utb;
UChar c;
utf8String = (uint8_t *)cString;
uta = utext_openUTF8(&utLoc, utf8String, -1, &status);
TEST_SUCCESS(status);
TEST_ASSERT(uta == &utLoc);
status = U_ZERO_ERROR;
uta = utext_openUChars(NULL, uString, -1, &status);
TEST_SUCCESS(status);
c = utext_next32(uta);
TEST_ASSERT(c == 0x41);
utb = utext_close(uta);
TEST_ASSERT(utb == NULL);
uta = utext_close(&utLoc);
TEST_ASSERT(uta == &utLoc);
utf8String = (uint8_t *)cString;
uta = utext_openUTF8(&utLoc, utf8String, -1, &status);
TEST_SUCCESS(status);
TEST_ASSERT(uta == &utLoc);
uta = utext_close(&utLoc);
TEST_ASSERT(uta == &utLoc);
}
// utext_clone()
{
UChar uString[] = {0x41, 0x42, 0x43, 0};
int len;
UText *uta;
UText *utb;
status = U_ZERO_ERROR;
uta = utext_openUChars(NULL, uString, -1, &status);
TEST_SUCCESS(status);
utb = utext_clone(NULL, uta, FALSE, &status);
TEST_SUCCESS(status);
TEST_ASSERT(utb != NULL);
TEST_ASSERT(utb != uta);
len = utext_length(uta);
TEST_ASSERT(len == u_strlen(uString));
utext_close(uta);
utext_close(utb);
}
// basic access functions
{
UChar uString[] = {0x41, 0x42, 0x43, 0};
UText *uta;
UChar32 c;
int32_t len;
UBool b;
int32_t i;
status = U_ZERO_ERROR;
uta = utext_openUChars(NULL, uString, -1, &status);
TEST_ASSERT(uta!=NULL);
TEST_SUCCESS(status);
b = utext_isLengthExpensive(uta);
TEST_ASSERT(b==TRUE);
len = utext_length(uta);
TEST_ASSERT(len == u_strlen(uString));
b = utext_isLengthExpensive(uta);
TEST_ASSERT(b==FALSE);
c = utext_char32At(uta, 0);
TEST_ASSERT(c==uString[0]);
c = utext_current(uta);
TEST_ASSERT(c==uString[0]);
c = utext_next32(uta);
TEST_ASSERT(c==uString[0]);
c = utext_current(uta);
TEST_ASSERT(c==uString[1]);
c = utext_previous32(uta);
TEST_ASSERT(c==uString[0]);
c = utext_current(uta);
TEST_ASSERT(c==uString[0]);
c = utext_next32From(uta, 1);
TEST_ASSERT(c==uString[1]);
c = utext_next32From(uta, u_strlen(uString));
TEST_ASSERT(c==U_SENTINEL);
c = utext_previous32From(uta, 2);
TEST_ASSERT(c==uString[1]);
i = utext_getIndex(uta);
TEST_ASSERT(i == 1);
utext_setIndex(uta, 0);
b = utext_moveIndex(uta, 1);
TEST_ASSERT(b==TRUE);
i = utext_getIndex(uta);
TEST_ASSERT(i==1);
b = utext_moveIndex(uta, u_strlen(uString)-1);
TEST_ASSERT(b==TRUE);
i = utext_getIndex(uta);
TEST_ASSERT(i==u_strlen(uString));
b = utext_moveIndex(uta, 1);
TEST_ASSERT(b==FALSE);
i = utext_getIndex(uta);
TEST_ASSERT(i==u_strlen(uString));
utext_setIndex(uta, 0);
c = UTEXT_NEXT32(uta);
TEST_ASSERT(c==uString[0]);
c = utext_current(uta);
TEST_ASSERT(c==uString[1]);
c = UTEXT_PREVIOUS32(uta);
TEST_ASSERT(c==uString[0]);
c = UTEXT_PREVIOUS32(uta);
TEST_ASSERT(c==U_SENTINEL);
utext_close(uta);
}
{
//
// extract
//
UText *uta;
UChar uString[] = {0x41, 0x42, 0x43, 0};
UChar buf[100];
int32_t i;
status = U_ZERO_ERROR;
uta = utext_openUChars(NULL, uString, -1, &status);
TEST_SUCCESS(status);
status = U_ZERO_ERROR;
i = utext_extract(uta, 0, 100, NULL, 0, &status);
TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR);
TEST_ASSERT(i == u_strlen(uString));
status = U_ZERO_ERROR;
memset(buf, 0, sizeof(buf));
i = utext_extract(uta, 0, 100, buf, 100, &status);
TEST_SUCCESS(status);
TEST_ASSERT(i == u_strlen(uString));
i = u_strcmp(uString, buf);
TEST_ASSERT(i == 0);
utext_close(uta);
}
{
//
// Copy, Replace, isWritable
// Can't create an editable UText from plain C, so all we
// can easily do is check that errors returned.
UText *uta;
UChar uString[] = {0x41, 0x42, 0x43, 0};
UBool b;
status = U_ZERO_ERROR;
uta = utext_openUChars(NULL, uString, -1, &status);
TEST_SUCCESS(status);
b = utext_isWriteble(uta);
TEST_ASSERT(b == FALSE);
b = utext_hasMetaData(uta);
TEST_ASSERT(b == FALSE);
utext_replace(uta,
0, 1, /* start, limit */
uString, -1, /* replacement, replacement length */
&status);
TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
utext_copy(uta,
0, 1, /* start, limit */
2, /* destination index */
FALSE, /* move flag */
&status);
TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
}
}
}

View file

@ -285,31 +285,6 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c
}
}
cpIndex = 0;
for (i=0; i<cpCount; i++) {
cpIndex = (cpIndex + 9973) % cpCount;
index = cpMap[cpIndex].nativeIdx;
expectedC = cpMap[cpIndex].cp;
foundC = UTEXT_NEXT32FROM(ut, index);
TEST_ASSERT(expectedC == foundC);
TEST_ASSERT(expectedIndex == foundIndex);
if (gFailed) {
return;
}
}
cpIndex = 0;
for (i=0; i<cpCount; i++) {
cpIndex = (cpIndex + 9973) % cpCount;
index = cpMap[cpIndex+1].nativeIdx;
expectedC = cpMap[cpIndex].cp;
foundC = UTEXT_PREVIOUS32FROM(ut, index);
TEST_ASSERT(expectedC == foundC);
TEST_ASSERT(expectedIndex == foundIndex);
if (gFailed) {
return;
}
}
//
// moveIndex(int32_t delta);