diff --git a/icu4c/source/common/unicode/utext.h b/icu4c/source/common/unicode/utext.h
index 96a44e66f17..44d29c3f36c 100644
--- a/icu4c/source/common/unicode/utext.h
+++ b/icu4c/source/common/unicode/utext.h
@@ -45,6 +45,63 @@
* or system with a unique text storage format can implement a set of
* UText provider functions for that format, which will then allow other
* ICU services to operate on that format.
+ *
+ *
+ * Iterating over text
+ *
+ * Here is sample code for a forward iteration over the contents of a UText
+ *
+ * \code
+ * UChar32 c;
+ * UText *ut = whatever();
+ *
+ * for (c=utext_next32From(ut, 0); c!=U_SENTINEL; c=utext_next32(ut)) {
+ * // do whatever with the codepoint c here.
+ * }
+ * \endcode
+ *
+ * And here is similar code to iterate in the reverse direction, from the end
+ * of the text towards the beginning.
+ *
+ * \code
+ * UChar32 c;
+ * UText *ut = whatever();
+ * int textLength = utext_length(ut);
+ * for (c=utext_previous32From(ut, textLength); c!=U_SENTINEL; c=utext_previous32(ut)) {
+ * // do whatever with the codepoint c here.
+ * }
+ * \endcode
+ *
+ * Characters and Indexing
+ *
+ * Indexing into text by UText functions is nearly always in terms of the native
+ * indexing of the underlying text storage. The storage format could be utf-8
+ * or utf-32, for example. When coding to the UText access API, no assumptions
+ * can be made regarding the size of characters, or how far an index
+ * may move when iterating between characters.
+ *
+ * All indices supplied to UText functions are pinned to the length of the
+ * text. An out-of-bounds index is not considered to be an error, but is
+ * adjusted to be in the range 0 <= index <= length of input text.
+ *
+ *
+ * When an index position is returned from a UText function, it will be
+ * a native index to the underlying text. In the case of multi-unit characters,
+ * it will always refer to the first position, never to the interior. This
+ * is essentially the same thing as saying that a returned index will always
+ * point to a boundary between characters.
+ *
+ * When a native index is supplied to a UText function, all indices that
+ * refer to any part of a multi-unit character representation are considered
+ * to be equivalent. In the case of multi-unit characters, an incoming index
+ * will be logically normalized to refer to the start of the character.
+ *
+ * It is possible to test whether a native index is on a code point boundary
+ * by doing a utext_setIndex() followed by a utext_getIndex().
+ * If the index returns unchanged, it was on a code point boundary. If
+ * an adjusted index is returned, the original index referred to the
+ * interior of a character.
+ *
*/
@@ -102,8 +159,10 @@ utext_close(UText *ut);
/**
* Open a read-only UText implementation for UTF-8 strings.
*
- * Any invalid utf-8 sequences in the input will appear on the output side
- * of the UText as Unicode Replacement characters, \uFFFD.
+ * Any invalid utf-8 in the input will be handled in this way:
+ * a sequence of bytes that has the form of a truncated, but otherwise valid,
+ * utf-8 sequence will be replaced by a single unicode replacement character, \uFFFD.
+ * Any other illegal bytes will each be replaced by a \uFFFD.
*
* @param ut Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
@@ -119,31 +178,6 @@ utext_close(UText *ut);
U_DRAFT UText * U_EXPORT2
utext_openUTF8(UText *ut, const uint8_t *s, int32_t length, UErrorCode *status);
-/**
- * Open a read-only UText implementation for a SBCS strings.
- * The implementation converts 1:1 according to the provided mapping table.
- * Supplementary code points are not supported.
- *
- * @param ut Pointer to a UText struct. If NULL, a new UText will be created.
- * If non-NULL, must refer to an initialized UText struct, which will then
- * be reset to reference the specified input string.
- * @param toU Mapping table for conversion from SBCS to Unicode (BMP only).
- * The mapping table must be available during the lifetime of the
- * UText object.
- * @param s A byte text string
- * @param length The length of the input string in bytes, or -1 if the string is
- * zero terminated.
- * @param status Errors are returned here.
- * @return A pointer to the UText. If a pre-allocated UText was provided, it
- * will always be used and returned.
- * @draft ICU 3.4
- */
-U_DRAFT UText * U_EXPORT2
-utext_openSBCS(UText *ut,
- const UChar toU[256],
- const char *s, int32_t length,
- UErrorCode *status);
-
/**
* Open a read-only UText for UChar * string.
@@ -160,12 +194,12 @@ utext_openSBCS(UText *ut,
* @draft ICU 3.4
*/
U_DRAFT UText * U_EXPORT2
-utext_openUChar(UText *ut, const UChar *s, int32_t length, UErrorCode *status);
+utext_openUChars(UText *ut, const UChar *s, int32_t length, UErrorCode *status);
#ifdef XP_CPLUSPLUS
/**
- * Open a UText for a UnicodeString.
+ * Open a writable UText for a non-const UnicodeString.
*
* @param t Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
@@ -181,7 +215,7 @@ utext_openUnicodeString(UText *t, UnicodeString *s, UErrorCode *status);
/**
- * Open a UText for a const UnicodeString. The resulting UText will not be writeable.
+ * Open a UText for a const UnicodeString. The resulting UText will not be writable.
*
* @param t Pointer to a UText struct. If NULL, a new UText will be created.
* If non-NULL, must refer to an initialized UText struct, which will then
@@ -254,7 +288,7 @@ utext_clone(UText *dest, const UText *src, UBool deep, UErrorCode *status);
/**
* Get the length of the text. Depending on the characteristics
* of the underlying text represenation, this may be expensive.
- * @see utext_lengthIsExpensive()
+ * @see utext_isLengthExpensive()
*
*
* @param ut the text to be accessed.
@@ -269,28 +303,33 @@ utext_length(UText *ut);
* Return TRUE if calculating the length of the text could be expensive.
* Finding the length of NUL terminated strings is considered to be expensive.
*
+ * Note that the value of this function may change
+ * as the result of other operations on a UText.
+ * Once the length of a string has been discovered, it will no longer
+ * be expensive to report it.
+ *
* @param ut the text to be accessed.
* @return TRUE if determining the lenght of the text could be time consuming.
* @draft ICU 3.4
*/
U_DRAFT UBool U_EXPORT2
-utext_lengthIsExpensive(const UText *ut);
+utext_isLengthExpensive(const UText *ut);
/**
* Returns the code point at the requested index,
* or U_SENTINEL (-1) if it is out of bounds.
- * Sets the current iteration position to the specified index.
*
* If the specified index points to the interior of a multi-unit
* character - one of the trail bytes of a utf-8 sequence, for example -
- * the complete code point will be returned, and the current
- * iteration position will be left at the start of the code point.
+ * the complete code point will be returned.
*
- * TODO: drop this function as being dangerous? There is no clean way for applications
- * to increment the index, which is in native units. Likely user error to increment
- * it by utf-16 units. next32From(index) does same thing, except for where iteration
- * position is left.
+ * The iteration position will be set to the start of the returned code point.
*
+ * This function is roughly equivalent to the sequence
+ * utext_setIndex(index);
+ * utext_current();
+ * (There is a difference if the index is out of bounds by being less than zero)
+ *
* @param ut the text to be accessed
* @param the native index of the character to be accessed. If the index points
* to other than the first unit of a multi-unit character, it will be adjusted
@@ -299,7 +338,7 @@ utext_lengthIsExpensive(const UText *ut);
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
-utext_char32At(UText *ut, int32_t index);
+utext_char32At(UText *ut, int32_t nativeIndex);
/**
@@ -309,7 +348,7 @@ utext_char32At(UText *ut, int32_t index);
* the input text.
*
* @param ut the text to be accessed.
- * @return the Unicode code point at the specified index.
+ * @return the Unicode code point at the current iterator position.
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
@@ -358,20 +397,17 @@ utext_previous32(UText *ut);
* and return the code point starting at or before that index.
* Leave the iteration index at the start of the following code point.
*
- * An inline macro version of this function, UTEXT_NEXT32FROM(),
- * is available for performance critical use.
-
* This function is the most efficient and convenient way to
* begin a forward iteration.
*
* @param ut the text to be accessed.
- * @param index Iteration index.
+ * @param nativeIndex Iteration index, in the native units of the text provider.
* @return Code point which starts at or before index,
* or U_SENTINEL (-1) if it is out of bounds.
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
-utext_next32From(UText *ut, int32_t index);
+utext_next32From(UText *ut, int32_t nativeIndex);
@@ -380,21 +416,18 @@ utext_next32From(UText *ut, int32_t index);
* one specified by the initial index. Leave the iteration position
* at the start of the returned code point.
*
- * An inline macro version of this function, UTEXT_PREVIOUS32FROM(),
- * is available for performance critical use.
-
* This function is the most efficient and convenient way to
* begin a backwards iteration.
*
* @param ut the text to be accessed.
- * @param index Iteration index.
+ * @param nativeIndex Iteration index in the native units of the text provider.
* @return Code point preceding the one at the initial index,
* or U_SENTINEL (-1) if it is out of bounds.
*
* @draft ICU 3.4
*/
U_DRAFT UChar32 U_EXPORT2
-utext_previous32From(UText *ut, int32_t index);
+utext_previous32From(UText *ut, int32_t nativeIndex);
/**
* Get the current iterator position, which can range from 0 to
@@ -405,33 +438,40 @@ utext_previous32From(UText *ut, int32_t index);
* code point boundary
*
* @param ut the text to be accessed.
- * @return the current index position, in native units.
+ * @return the current index position, in the native units of the text provider.
* @draft ICU 3.4
*/
U_DRAFT int32_t U_EXPORT2
utext_getIndex(UText *ut);
/**
- * Set the current iteration position to the specified index.
+ * Set the current iteration position to the nearest code point
+ * boundary at or preceding the specified index.
* The index is in the native units of the original input text.
- * If the index is out of range, it will be trimmed to be witnin
+ * If the index is out of range, it will be trimmed to be within
* the range of the input text.
- * If the specifed index does not fall on a code point boundary in
- * the input text, it will be adjusted back to do so.
*
* It will usually be more efficient to begin an iteration
* using the functions utext_next32From() or utext_previous32From()
* rather than setIndex().
+ *
+ * Moving the index position to an adjacent character is best done
+ * with utext_next32(), utext_previous32() or utext_moveIndex().
+ * Attempting to do direct arithmetic on the index position is
+ * complicated by the fact that the size (in native units) of a
+ * character depends on the underlying representation of the character
+ * (utf-8, utf-16, utf-32, arbitrary codepage), and is not
+ * easily knowable.
*
* @param ut the text to be accessed.
* @param index the native unit index of the new iteration position.
* @draft ICU 3.4
*/
U_DRAFT void U_EXPORT2
-utext_setIndex(UText *ut, int32_t index);
+utext_setIndex(UText *ut, int32_t nativeIndex);
/**
- * Move the iterator postion by delta code points. The amount to move
+ * Move the iterator position by delta code points. The number of code points
* is a signed number; a negative delta will move the iterator backwards,
* towards the start of the text.
*
@@ -439,6 +479,10 @@ utext_setIndex(UText *ut, int32_t index);
* forward or backward, but no further backward than to 0 and
* no further forward than to length().
* The resulting index value will be in between 0 and length(), inclusive.
+ *
+ * Because the index is kept in the native units of the text provider, the
+ * actual numeric amount by which the index moves depends on the
+ * underlying text storage representation of the text provider.
*
* @param ut the text to be accessed.
* @param delta the signed number of code points to move the iteration position.
@@ -467,22 +511,21 @@ utext_moveIndex(UText *ut, int32_t delta);
* @param ut the UText from which to extract data.
* @param start the native index of the first character to extract.
* @param limit the native string index of the position following the last
- * character to extract.
+ * character to extract. If the specified limit is greater than the length
+ * of the text, the limit will be trimmed back to the text length.
* @param dest the UChar (utf-16) buffer into which the extracted text is placed
* @param destCapacity The size, in UChars, of the destination buffer. May be zero
* for precomputing the required size.
* @param status receives any error status.
* U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the
* buffer was too small. Returns number of UChars for preflighting.
- * @return Number of UChars in the data. Does not include a trailing NUL.
- *
- * TODO: how should invalid source data be handled? Corrupt utf-8, for example.
+ * @return Number of UChars in the data to be extracted. Does not include a trailing NUL.
*
* @draft ICU 3.4
*/
U_DRAFT int32_t U_EXPORT2
utext_extract(UText *ut,
- int32_t start, int32_t limit,
+ int32_t nativeStart, int32_t nativeLimit,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
@@ -533,38 +576,6 @@ utext_extract(UText *ut,
(ut)->chunk.contents[--((ut)->chunk.offset)] : utext_previous32(ut))
-/**
- * inline version of utext_next32from(), for performance-critical situations.
- *
- * Set the iteration index, access the text for forward iteration,
- * and return the code point starting at or before that index.
- * Leave the iteration index at the start of the following code point.
- *
- * @draft ICU 3.4
- */
-#define UTEXT_NEXT32FROM(ut, index) \
- ((index) >= (ut)->chunk.start && \
- (index) < (ut)->chunk.limit && \
- !(ut)->chunk.nonUTF16Indexes && \
- (ut)->chunk.contents[(ut)->chunk.offset=(ut)->chunk.start+(index)] < 0xd800 ? \
- (ut)->chunk.contents[((ut)->chunk.offset)++] : utext_next32From(ut, index))
-
-/**
- * inline version of utext_previous32from(), for performance-critical situations.
- *
- * Set the iteration index, and return the code point preceding the
- * one specified by the initial index. Leave the iteration position
- * at the start of the returned code point.
- *
- * @draft ICU 3.4
- */
-#define UTEXT_PREVIOUS32FROM(ut, index) \
- ((index) > (ut)->chunk.start && \
- (index) <= (ut)->chunk.limit && \
- !(ut)->chunk.nonUTF16Indexes && \
- (ut)->chunk.contents[(ut)->chunk.offset=(ut)->chunk.start+(index-1)] < 0xd800 ? \
- (ut)->chunk.contents[(ut)->chunk.offset] : utext_previous32From(ut, index))
-
/************************************************************************************
@@ -587,7 +598,7 @@ utext_extract(UText *ut,
*
*/
U_DRAFT UBool U_EXPORT2
-utext_isWriteable(const UText *ut);
+utext_isWriteble(const UText *ut);
/**
@@ -609,7 +620,7 @@ utext_hasMetaData(const UText *ut);
* newly inserted replacement text.
*
* This function is only available on UText types that support writing,
- * that is, ones where utext_isWriteable() returns TRUE.
+ * that is, ones where utext_isWritable() returns TRUE.
*
* When using this function, there should be only a single UText opened onto the
* underlying native text string. Behavior after a replace operation
@@ -617,8 +628,8 @@ utext_hasMetaData(const UText *ut);
* modified string.
*
* @param ut the UText representing the text to be operated on.
- * @param start the native index of the start of the region to be replaced
- * @param limit the native index of the character following the region to be replaced.
+ * @param nativeStart the native index of the start of the region to be replaced
+ * @param nativeLimit the native index of the character following the region to be replaced.
* @param replacementText pointer to the replacement text
* @param replacmentLength length of the replacement text, or -1 if the text is NUL terminated.
* @param status receives any error status. Possible errors include
@@ -631,7 +642,7 @@ utext_hasMetaData(const UText *ut);
*/
U_DRAFT int32_t U_EXPORT2
utext_replace(UText *ut,
- int32_t start, int32_t limit,
+ int32_t nativeStart, int32_t nativeLimit,
const UChar *replacementText, int32_t replacementLength,
UErrorCode *status);
@@ -648,25 +659,25 @@ utext_replace(UText *ut,
* it does not replace or overwrite any existing text.
*
* This function is only available on UText types that support writing,
- * that is, ones where utext_isWriteable() returns TRUE.
+ * that is, ones where utext_isWritable() returns TRUE.
*
* When using this function, there should be only a single UText opened onto the
* underlying native text string. Behavior after a copy operation
* on a UText is undefined in any other additional UTexts that refer to the
* modified string.
*
- * @param ut The UText representing the text to be operated on.
- * @param start The native index of the start of the region to be copied or moved
- * @param limit The native index of the character following the region to be replaced.
- * @param destIndex The native destination index to which the source substring is copied or moved.
- * @param move If TRUE, then the substring is moved, not copied/duplicated.
- * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION
+ * @param ut The UText representing the text to be operated on.
+ * @param nativeStart The native index of the start of the region to be copied or moved
+ * @param nativeLimit The native index of the character following the region to be replaced.
+ * @param destIndex The native destination index to which the source substring is copied or moved.
+ * @param move If TRUE, then the substring is moved, not copied/duplicated.
+ * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION
*
* @draft ICU 3.4
*/
U_DRAFT void U_EXPORT2
utext_copy(UText *ut,
- int32_t start, int32_t limit,
+ int32_t nativeStart, int32_t nativeLimit,
int32_t destIndex,
UBool move,
UErrorCode *status);
@@ -709,10 +720,10 @@ struct UTextChunk {
int32_t length;
/** (Native) text index corresponding to the start of the chunk. */
- int32_t start;
+ int32_t nativeStart;
/** (Native) text index corresponding to the end of the chunk (contents+length). */
- int32_t limit;
+ int32_t nativeLimit;
/** If TRUE, then non-UTF-16 indexes are used in this chunk. */
UBool nonUTF16Indexes;
@@ -739,10 +750,10 @@ enum {
*/
UTEXT_PROVIDER_NON_UTF16_INDEXES,
/**
- * The provider can return the text length inexpensively.
+ * It is potentially time consuming for the provider to determine the length of the text.
* @draft ICU 3.4
*/
- UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE,
+ UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE,
/**
* Text chunks remain valid and usable until the text object is modified or
* deleted, not just until the next time the access() function is called
@@ -799,18 +810,6 @@ enum {
typedef UText * U_CALLCONV
UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status);
-/**
- * Function type declaration for UText.GetProperties().
- *
- * Gets the provider properties for this UText.
- *
- * @param ut the UText to get properties for.
- * @return Provider properties bit field.
- *
- * @draft ICU 3.4
- */
-typedef int32_t U_CALLCONV
-UTextGetProperties(UText *ut);
/**
* Function type declaration for UText.length().
@@ -821,7 +820,7 @@ UTextGetProperties(UText *ut);
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
-UTextLength(UText *ut);
+UTextNativeLength(UText *ut);
/**
* Function type declaration for UText.access(). Get the description of the text chunk
@@ -830,22 +829,22 @@ UTextLength(UText *ut);
* of bounds, the iteration position will be left at the start or end
* of the string, as appropriate.
*
- * @param ut the UText being accessed.
- * @param index Requested (native) index of the text to be accessed.
- * @param forward If TRUE, then the returned chunk must contain text
- * starting from the index, so that start<=indexlength.
- * @return Absolute (native) index corresponding to the UTF-16 offset
- * relative to the current text chunk.
+ * @return Absolute (native) index corresponding to the specified chunk offset.
+ * The returned native index should always be to a code point boundary.
*
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
-UTextMapOffsetToNative(UText *ut, UTextChunk *chunk, int32_t offset);
+UTextMapOffsetToNative(UText *ut, int32_t offset);
/**
* Function type declaration for UText.mapIndexToUTF16().
- * This is required only for text providers that do not use native utf-16 indexes.
+ * Map from a native index to a UChar offset within a text chunk
*
- * @param ut The UText containing the text chunk.
- * @param chunk the text chunk in which the mapping occurs.
- * TODO: keep this as a separate parameter, or just imply that the function
- * works on the chunk embedded in the UText?
- * @param index Absolute (native) text index, chunk->start<=index<=chunk->limit.
- * @return Chunk-relative UTF-16 offset corresponding to the absolute (native)
- * index.
+ * This function is required only for text providers that do not use native utf-16 indexes.
*
- * @see UText
+ * @param ut The UText containing the text chunk.
+ * @param nativeIndex Absolute (native) text index, chunk->nativeStart<=nativeIndex<=chunk->nativeLimit.
+ * @return Chunk-relative UTF-16 offset corresponding to the specified native
+ * index.
+ *
+ * TODO: specify behavior with out-of-bounds index? Shouldn't ever occur.
* @draft ICU 3.4
*/
typedef int32_t U_CALLCONV
-UTextMapIndexToUTF16(UText *ut, UTextChunk *chunk, int32_t index);
+UTextMapIndexToUTF16(UText *ut, int32_t nativeIndex);
/**
@@ -1077,6 +1077,15 @@ struct UText {
int32_t a, b, c;
+ /**
+ * Text provider properties. This set of flags is maintained by the
+ * text provider implementation.
+ * @draft ICU 3.4
+ */
+ int32_t providerProperties;
+
+
+
/** desciptor for the text chunk that includes or is adjacent to
* the current iteration position.
* @draft ICU 3.4
@@ -1084,14 +1093,6 @@ struct UText {
UTextChunk chunk;
- /**
- * Text provider properties
- * @draft ICU 3.4
- */
- int32_t providerProperties;
-
-
-
/**
* (public) Function pointer for UTextClone
*
@@ -1100,14 +1101,6 @@ struct UText {
*/
UTextClone *clone;
- /**
- * (public) function pointer for UTextGetProperties
- *
- * @see UTextGetProperties
- * @draft ICU 3.4
- */
- UTextGetProperties *properties;
-
/**
* (public) function pointer for UTextLength
* May be expensive to compute!
@@ -1115,7 +1108,7 @@ struct UText {
* @see UTextLength
* @draft ICU 3.4
*/
- UTextLength *length;
+ UTextNativeLength *length;
/**
* (public) Function pointer for UTextAccess.
@@ -1224,16 +1217,16 @@ enum {
* @internal
*/
#define UTEXT_INITIALZIER_HEAD \
- NULL, /* context */ \
- NULL, NULL, NULL, /* p, q, r */ \
- NULL, /* pExtra */ \
- 0, /* extraSize */ \
- 0, /* flags */ \
- UTEXT_MAGIC, /* magic */ \
- sizeof(UText), /* sizeOfStruct */ \
- 0, 0, 0, /* a, b, c */ \
- UTEXT_CHUNK_INIT, /* UTextChunk */ \
- -1 /* provderProps */
+ NULL, /* context */ \
+ NULL, NULL, NULL, /* p, q, r */ \
+ NULL, /* pExtra */ \
+ 0, /* extraSize */ \
+ 0, /* flags */ \
+ UTEXT_MAGIC, /* magic */ \
+ sizeof(UText), /* sizeOfStruct */ \
+ 0, 0, 0, /* a, b, c */ \
+ 0, /* providerProps */ \
+ UTEXT_CHUNK_INIT /* UTextChunk */
@@ -1247,7 +1240,6 @@ enum {
#define UTEXT_INITIALIZER { \
UTEXT_INITIALZIER_HEAD, \
NULL, /* clone () */ \
- NULL, /* properties ()*/ \
NULL, /* length () */ \
NULL, /* access () */ \
NULL, /* extract () */ \
diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp
index a8e872135fc..6751791cbf6 100644
--- a/icu4c/source/common/utext.cpp
+++ b/icu4c/source/common/utext.cpp
@@ -21,6 +21,7 @@
#include "ustr_imp.h"
#include "cmemory.h"
#include "cstring.h"
+#include "uassert.h"
#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
@@ -38,7 +39,7 @@ utext_moveIndex(UText *ut, int32_t delta) {
UBool retval = TRUE;
if(delta>0) {
do {
- if(ut->chunk.offset>=ut->chunk.length && !utext_access(ut, ut->chunk.limit, TRUE)) {
+ if(ut->chunk.offset>=ut->chunk.length && !utext_access(ut, ut->chunk.nativeLimit, TRUE)) {
retval = FALSE;
break;
}
@@ -46,7 +47,7 @@ utext_moveIndex(UText *ut, int32_t delta) {
} while(--delta>0);
} else if (delta<0) {
do {
- if(ut->chunk.offset<=0 && !utext_access(ut, ut->chunk.start, FALSE)) {
+ if(ut->chunk.offset<=0 && !utext_access(ut, ut->chunk.nativeStart, FALSE)) {
retval = FALSE;
break;
}
@@ -63,12 +64,20 @@ utext_length(UText *ut) {
return ut->length(ut);
}
+
+//
+// Report whether the provider has flagged the length of this text as
+// expensive to compute.  The answer is read straight from the
+// UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE bit of ut->providerProperties.
+//
+U_DRAFT UBool U_EXPORT2
+utext_isLengthExpensive(const UText *ut) {
+    const int32_t expensiveBit = I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
+    return (ut->providerProperties & expensiveBit) != 0;
+}
+
+
U_DRAFT int32_t U_EXPORT2
utext_getIndex(UText *ut) {
if(!ut->chunk.nonUTF16Indexes || ut->chunk.offset==0) {
- return ut->chunk.start+ut->chunk.offset;
+ return ut->chunk.nativeStart+ut->chunk.offset;
} else {
- return ut->mapOffsetToNative(ut, &ut->chunk, ut->chunk.offset);
+ return ut->mapOffsetToNative(ut, ut->chunk.offset);
}
}
@@ -76,23 +85,23 @@ utext_getIndex(UText *ut) {
U_DRAFT void U_EXPORT2
utext_setIndex(UText *ut, int32_t index) {
- // TODO - revise for keeping index always valid.
- if(index<ut->chunk.start || ut->chunk.limit<index) {
+ if(index<ut->chunk.nativeStart || ut->chunk.nativeLimit<index) {
 ut->access(ut, index, TRUE, &ut->chunk);
} else if(ut->chunk.nonUTF16Indexes) {
- ut->chunk.offset=ut->mapIndexToUTF16(ut, &ut->chunk, index);
+ ut->chunk.offset=ut->mapIndexToUTF16(ut, index);
} else {
- ut->chunk.offset=index-ut->chunk.start;
+ ut->chunk.offset=index-ut->chunk.nativeStart;
// Our convention is that the index must always be on a code point boundary.
// If we are somewhere in the middle of a utf-16 buffer, check that new index
// is not in the middle of a surrogate pair.
- if (index>ut->chunk.start && index < ut->chunk.limit) { // TODO: clean up end-of-chunk / end of input handling. Everywhere.
+ if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) {
UChar c = ut->chunk.contents[ut->chunk.offset];
if (U16_TRAIL(c)) {
- utext_current(ut); // force index onto a code point boundary.
+ utext_current(ut); // force index to the start of the curent code point.
}
}
}
@@ -123,6 +132,18 @@ utext_current(UText *ut) {
return c;
}
+
+//
+// Return the code point at nativeIndex, or U_SENTINEL if the index is
+// negative or not below the native limit of the chunk selected by
+// utext_setIndex().  The iteration position is left wherever
+// utext_setIndex() (and, for surrogates, utext_current()) put it.
+//
+// @param ut           the text to be accessed
+// @param nativeIndex  native index of the character to fetch
+// @return             the code point, or U_SENTINEL when out of bounds
+//
+U_DRAFT UChar32 U_EXPORT2
+utext_char32At(UText *ut, int32_t nativeIndex) {
+    UChar32 c = U_SENTINEL;
+    utext_setIndex(ut, nativeIndex);
+    if (nativeIndex >= 0 && nativeIndex < ut->chunk.nativeLimit) {
+        c = ut->chunk.contents[ut->chunk.offset];
+        if (U16_IS_SURROGATE(c)) {
+            // One UChar is only half of a supplementary code point.  The
+            // header promises the complete code point, so hand surrogates to
+            // utext_current(), which is documented to return the code point
+            // at the current iteration position.
+            c = utext_current(ut);
+        }
+    }
+    return c;
+}
+
+
U_DRAFT UChar32 U_EXPORT2
utext_next32(UText *ut) {
UTextChunk *chunk = &ut->chunk;
@@ -130,7 +151,7 @@ utext_next32(UText *ut) {
UChar32 c = U_SENTINEL;
if (offset >= chunk->length) {
- if (ut->access(ut, chunk->limit, TRUE, chunk) == FALSE) {
+ if (ut->access(ut, chunk->nativeLimit, TRUE, chunk) == FALSE) {
goto next32_return;
}
offset = chunk->offset;
@@ -160,7 +181,7 @@ utext_previous32(UText *ut) {
UChar32 c = U_SENTINEL;
if (offset <= 0) {
- if (ut->access(ut, chunk->start, FALSE, chunk) == FALSE) {
+ if (ut->access(ut, chunk->nativeStart, FALSE, chunk) == FALSE) {
goto prev32_return;
}
offset = chunk->offset;
@@ -186,16 +207,16 @@ utext_next32From(UText *ut, int32_t index) {
UTextChunk *chunk = &ut->chunk;
UChar32 c = U_SENTINEL;
- if(index<chunk->start || index>=chunk->limit) {
+ if(index<chunk->nativeStart || index>=chunk->nativeLimit) {
if(!ut->access(ut, index, TRUE, chunk)) {
// no chunk available here
goto next32return;
}
offset = chunk->offset;
} else if(chunk->nonUTF16Indexes) {
- offset=ut->mapIndexToUTF16(ut, chunk, index);
+ offset=ut->mapIndexToUTF16(ut, index);
} else {
- offset = index - chunk->start;
+ offset = index - chunk->nativeStart;
}
c = chunk->contents[offset++];
@@ -220,16 +241,16 @@ utext_previous32From(UText *ut, int32_t index) {
UTextChunk *chunk = &ut->chunk;
UChar32 c = U_SENTINEL;
- if(index<=chunk->start || index>chunk->limit) {
+ if(index<=chunk->nativeStart || index>chunk->nativeLimit) {
if(!ut->access(ut, index, FALSE, chunk)) {
// no chunk available here
goto prev32return;
}
offset = chunk->offset;
} else if(chunk->nonUTF16Indexes) {
- offset=ut->mapIndexToUTF16(ut, chunk, index);
+ offset=ut->mapIndexToUTF16(ut, index);
} else {
- offset = index - chunk->start;
+ offset = index - chunk->nativeStart;
}
offset--;
@@ -253,6 +274,66 @@ utext_extract(UText *ut,
}
+
+
+//
+// Report whether the provider has flagged this text as writable, by testing
+// the UTEXT_PROVIDER_WRITABLE bit of ut->providerProperties.
+//
+// NOTE(review): the function name is spelled "utext_isWriteble" here and in
+// the header declaration, while the header comments for utext_replace() and
+// utext_copy() refer to utext_isWritable().  The spelling looks like a typo;
+// confirm and rename declaration and definition together.
+//
+U_DRAFT UBool U_EXPORT2
+utext_isWriteble(const UText *ut)
+{
+    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
+    return (ut->providerProperties & writableBit) != 0;
+}
+
+
+//
+// Report whether the provider has flagged this text as carrying meta data,
+// by testing the UTEXT_PROVIDER_HAS_META_DATA bit of ut->providerProperties.
+//
+U_DRAFT UBool U_EXPORT2
+utext_hasMetaData(const UText *ut)
+{
+    const int32_t metaDataBit = I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
+    return (ut->providerProperties & metaDataBit) != 0;
+}
+
+
+
+//
+// Replace the native range [nativeStart, nativeLimit) with replacementText
+// by forwarding to the provider's replace() function.
+//
+// Nothing is done, and 0 is returned, when *status already holds a failure
+// or when the provider has not set UTEXT_PROVIDER_WRITABLE; in the latter
+// case *status is set to U_NO_WRITE_PERMISSION.
+//
+// @return whatever the provider's replace() function returns, or 0 when the
+//         call is rejected up front.
+//
+U_DRAFT int32_t U_EXPORT2
+utext_replace(UText *ut,
+              int32_t nativeStart, int32_t nativeLimit,
+              const UChar *replacementText, int32_t replacementLength,
+              UErrorCode *status)
+{
+    if (U_FAILURE(*status)) {
+        return 0;
+    }
+    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
+    if (!(ut->providerProperties & writableBit)) {
+        *status = U_NO_WRITE_PERMISSION;
+        return 0;
+    }
+    return ut->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
+}
+
+//
+// Copy or move the native range [nativeStart, nativeLimit) to destIndex by
+// forwarding to the provider's copy() function.
+//
+// Nothing is done when *status already holds a failure or when the provider
+// has not set UTEXT_PROVIDER_WRITABLE; in the latter case *status is set to
+// U_NO_WRITE_PERMISSION.
+//
+U_DRAFT void U_EXPORT2
+utext_copy(UText *ut,
+           int32_t nativeStart, int32_t nativeLimit,
+           int32_t destIndex,
+           UBool move,
+           UErrorCode *status)
+{
+    if (U_FAILURE(*status)) {
+        return;
+    }
+    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
+    if (ut->providerProperties & writableBit) {
+        ut->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
+    } else {
+        *status = U_NO_WRITE_PERMISSION;
+    }
+}
+
+
+
+//
+// Clone a UText by forwarding to the clone() function of the source's
+// provider.  All arguments, including status, are passed through untouched
+// and the provider's return value is returned as-is.
+//
+U_DRAFT UText * U_EXPORT2
+utext_clone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
+    UText *cloned = src->clone(dest, src, deep, status);
+    return cloned;
+}
+
U_DRAFT UBool U_EXPORT2
utext_compare(UText *ut, const UChar *s, int32_t length, UBool codePointOrder) {
int32_t segLength, result;
@@ -285,7 +366,7 @@ utext_compare(UText *ut, const UChar *s, int32_t length, UBool codePointOrder) {
return 0;
}
- if(!ut->access(ut, ut->chunk.limit, TRUE, &ut->chunk)) {
+ if(!ut->access(ut, ut->chunk.nativeLimit, TRUE, &ut->chunk)) {
// the text ends before the string does
return -1;
}
@@ -321,9 +402,10 @@ enum {
//
// Extended form of a UText. The purpose is to aid in computing the total size required
// when a provider asks for a UText to be allocated with extra storage.
-//
-struct ExtendedUText: public UText {
- void *extension;
+
+// utext_setup() sizes a heap allocation with sizeof(ExtendedUText) and points
+// UText.pExtra at the 'extension' member, so 'extension' marks where the
+// provider's extra storage begins.
+struct ExtendedUText {
+ UText ut; // the UText itself; utext_setup() reaches this struct by casting a UText *
+ UAlignedMemory extension; // first bytes of the extra storage
};
static const UText emptyText = UTEXT_INITIALIZER;
@@ -338,14 +420,18 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
// We need to heap-allocate storage for the new UText
int32_t spaceRequired = sizeof(UText);
if (extraSpace > 0) {
- spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(void *);
+ spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
}
ut = (UText *)uprv_malloc(spaceRequired);
- *ut = emptyText;
- ut->flags |= UTEXT_HEAP_ALLOCATED;
- if (spaceRequired>0) {
- ut->extraSize = spaceRequired;
- ut->pExtra = &((ExtendedUText *)ut)->extension;
+ if (ut == NULL) {
+     *status = U_MEMORY_ALLOCATION_ERROR;
+ } else {
+     *ut = emptyText;
+     ut->flags |= UTEXT_HEAP_ALLOCATED;
+     // Record only the caller-requested extra space.  spaceRequired is the
+     // size of the whole allocation (always > 0); storing it in extraSize
+     // would make noopTextClone() copy that many bytes out of pExtra and
+     // read past the end of the source's extra storage.
+     if (extraSpace > 0) {
+         ut->extraSize = extraSpace;
+         ut->pExtra = &((ExtendedUText *)ut)->extension;
+     }
+ }
} else {
// We have been supplied with an already existing UText.
@@ -378,6 +464,9 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
}
}
}
+ if (U_SUCCESS(*status)) {
+ ut->flags |= UTEXT_OPEN;
+ }
return ut;
}
@@ -429,15 +518,15 @@ utext_close(UText *ut) {
//
static void
resetChunk(UTextChunk *chunk, int32_t index) {
- if (index==chunk->limit) {
+ if (index==chunk->nativeLimit) {
chunk->offset = chunk->length;
- } else if (index==chunk->start) {
+ } else if (index==chunk->nativeStart) {
chunk->offset = 0;
} else {
- chunk->length = 0;
- chunk->start = index;
- chunk->limit = index;
- chunk->offset = 0;
+ chunk->length = 0;
+ chunk->nativeStart = index;
+ chunk->nativeLimit = index;
+ chunk->offset = 0;
}
}
@@ -452,16 +541,52 @@ resetChunk(UTextChunk *chunk, int32_t index) {
U_CDECL_BEGIN
static UText * U_CALLCONV
-noopTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErrorCode * /*status*/) {
- return NULL; // not supported
+//
+// Clone. This is a generic copy-the-utext-by-value clone function that can be
+// used as-is with some utext types, and as a helper by other clone functions.
+//
+noopTextClone(UText * dest, const UText * src, UBool deep, UErrorCode * status) {
+ if (U_FAILURE(*status)) {
+ return NULL;
+ }
+ int32_t srcExtraSize = src->extraSize;
+
+ //
+ // Use the generic utext_setup to allocate storage if required.
+ //
+ dest = utext_setup(dest, srcExtraSize, status);
+ if (U_FAILURE(*status)) {
+ return dest;
+ }
+
+ //
+ // flags (how the UText was allocated) and the pointer to the
+ // extra storage must retain the values in the cloned utext that
+ // were set up by utext_setup. Save them separately before
+ // copying the whole struct.
+ //
+ void *destExtra = dest->pExtra;
+ int32_t flags = dest->flags;
+
+
+ //
+ // Copy the whole UText struct by value.
+ // Any "Extra" storage is copied also.
+ //
+ int sizeToCopy = src->sizeOfStruct;
+ if (sizeToCopy > dest->sizeOfStruct) {
+ sizeToCopy = dest->sizeOfStruct;
+ }
+ uprv_memcpy(dest, src, sizeToCopy);
+ dest->pExtra = destExtra;
+ dest->flags = flags;
+ if (srcExtraSize > 0) {
+ uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
+ }
+
+ return dest;
}
-static int32_t U_CALLCONV
-noopTextGetProperties(UText * /*t*/) {
- return
- I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE)|
- I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
-}
static int32_t U_CALLCONV
noopTextLength(UText * /* t */) {
@@ -483,12 +608,12 @@ noopTextExtract(UText * /* t */,
}
static int32_t U_CALLCONV
-noopTextMapOffsetToNative(UText * /* t */, UTextChunk * /* chunk */, int32_t /* offset */) {
+noopTextMapOffsetToNative(UText * /* t */, int32_t /* offset */) {
return 0;
}
static int32_t U_CALLCONV
-noopTextMapIndexToUTF16(UText * /* t */, UTextChunk * /* chunk */, int32_t /* index */) {
+noopTextMapIndexToUTF16(UText * /* t */, int32_t /* index */) {
return 0;
}
@@ -498,7 +623,6 @@ U_CDECL_END
static const UText noopText={
UTEXT_INITIALZIER_HEAD,
noopTextClone,
- noopTextGetProperties,
noopTextLength,
noopTextAccess,
noopTextExtract,
@@ -550,14 +674,6 @@ struct UTF8Extra {
U_CDECL_BEGIN
-static int32_t U_CALLCONV
-utf8TextGetProperties(UText * /*t*/) {
- return
- I32_FLAG(UTEXT_PROVIDER_NON_UTF16_INDEXES)|
- I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE);
- // not UTEXT_PROVIDER_STABLE_CHUNKS because chunk-related data is kept
- // in UTF8Text, so only one at a time can be active
-}
static int32_t U_CALLCONV
utf8TextLength(UText *ut) {
@@ -590,7 +706,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
return FALSE;
}
- chunk->start=index;
+ chunk->nativeStart=index;
c=s8[index];
if(c<=0x7f) {
// get a run of ASCII characters.
@@ -621,11 +737,11 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
map[i]=index;
chunk->nonUTF16Indexes=TRUE;
}
- chunk->contents = u16buf;
- chunk->length = i;
- chunk->limit = index;
- ut->q = map;
- chunk->offset = 0; // chunkOffset corresponding to index
+ chunk->contents = u16buf;
+ chunk->length = i;
+ chunk->nativeLimit = index;
+ ut->q = map;
+ chunk->offset = 0; // chunkOffset corresponding to index
return TRUE;
} else {
// Reverse Access. The chunk buffer must be filled so as to contain the
@@ -635,7 +751,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
return FALSE;
}
- chunk->limit=index;
+ chunk->nativeLimit=index;
c=s8[index-1];
if(c<=0x7f) {
// get a chunk of ASCII characters. Don't build the index map
@@ -684,10 +800,10 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
chunk->nonUTF16Indexes=TRUE;
}
// Common reverse iteration, for both UTF16 and non-UTIF16 indexes.
- chunk->contents = u16buf+i;
- chunk->length = (UTF8_TEXT_CHUNK_SIZE)-i;
- chunk->start = index;
- chunk->offset = chunk->length; // chunkOffset corresponding to index
+ chunk->contents = u16buf+i;
+ chunk->length = (UTF8_TEXT_CHUNK_SIZE)-i;
+ chunk->nativeStart = index;
+ chunk->offset = chunk->length; // chunkOffset corresponding to index
return TRUE;
}
}
@@ -717,18 +833,20 @@ utf8TextExtract(UText *ut,
// Assume nonUTF16Indexes and 0<=offset<=chunk->length
static int32_t U_CALLCONV
-utf8TextMapOffsetToNative(UText *ut, UTextChunk * /* chunk */, int32_t offset) {
+utf8TextMapOffsetToNative(UText *ut, int32_t offset) {
// UText.q points to the index mapping array that is allocated in the extra storage area.
+ U_ASSERT(offset>=0 && offset<=ut->chunk.length);
int32_t *map=(int32_t *)(ut->q);
return map[offset];
}
// Assume nonUTF16Indexes and chunk->start<=index<=chunk->limit
static int32_t U_CALLCONV
-utf8TextMapIndexToUTF16(UText *ut, UTextChunk * /*chunk */, int32_t index) {
+utf8TextMapIndexToUTF16(UText *ut, int32_t index) {
int32_t *map=(int32_t *)(ut->q);
int32_t offset=0;
+ U_ASSERT(index>=ut->chunk.nativeStart && index<=ut->chunk.nativeLimit);
while(index>map[offset]) {
++offset;
}
@@ -752,9 +870,9 @@ utext_openUTF8(UText *ut, const uint8_t *s, int32_t length, UErrorCode *status)
if (U_FAILURE(*status)) {
return ut;
}
+ ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_NON_UTF16_INDEXES);
ut->clone = noopTextClone;
- ut->properties = utf8TextGetProperties;
ut->length = utf8TextLength;
ut->access = utf8TextAccess;
ut->extract = utf8TextExtract;
@@ -777,190 +895,6 @@ U_CDECL_END
-//------------------------------------------------------------------------------
-//
-// UText implementation for SBCS strings (read-only)
-//
-// Use of UText data members:
-// context pointer to SBCS string
-//
-//------------------------------------------------------------------------------
-
-
-enum { SBCS_TEXT_CHUNK_SIZE=10 };
-
-struct SBCSText : public UText {
- /* pointer to SBCS-to-BMP mapping table */
- const UChar *toU;
- /* length of UTF-8 string (in bytes) */
- int32_t length;
- /* chunk UChars */
- UChar s[SBCS_TEXT_CHUNK_SIZE];
-};
-
-
-U_CDECL_BEGIN
-
-static int32_t U_CALLCONV
-sbcsTextGetProperties(UText * /*t*/) {
- return
- I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE);
- // not UTEXT_PROVIDER_STABLE_CHUNKS because chunk-related data is kept
- // in SBCSText, so only one at a time can be active
-}
-
-static int32_t U_CALLCONV
-sbcsTextLength(UText *t) {
- return ((SBCSText *)t)->length;
-}
-
-static UBool U_CALLCONV
-sbcsTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
- SBCSText *ts=(SBCSText *)ut;
- const uint8_t *s8=(const uint8_t *)ts->context;
- int32_t i, count, length=ts->length;
-
- chunk->nonUTF16Indexes=FALSE;
-
- if(forward) {
- if(length<=index) {
- resetChunk(chunk, length);
- return FALSE;
- }
-
- count=length-index;
- if(count>SBCS_TEXT_CHUNK_SIZE) {
- count=SBCS_TEXT_CHUNK_SIZE;
- }
- chunk->start=index;
- for(i=0; is[i]=ts->toU[s8[index]];
- }
- chunk->contents=ts->s;
- chunk->length=i;
- chunk->limit=index;
- chunk->offset = 0; // chunkOffset corresponding to index
- return TRUE;
- } else {
- if(index<=0) {
- resetChunk(chunk, 0);
- return FALSE;
- }
-
- if(index<=SBCS_TEXT_CHUNK_SIZE) {
- count=index;
- } else {
- count=SBCS_TEXT_CHUNK_SIZE;
- }
- chunk->limit=index;
- for(i=count; i>0;) {
- ts->s[--i]=ts->toU[s8[--index]];
- }
- chunk->contents=ts->s;
- chunk->length=count;
- chunk->start=index;
- chunk->offset=count; // chunkOffset corresponding to index
- return TRUE;
- }
-}
-
-static int32_t U_CALLCONV
-sbcsTextExtract(UText *t,
- int32_t start, int32_t limit,
- UChar *dest, int32_t destCapacity,
- UErrorCode *pErrorCode) {
- SBCSText *ts=(SBCSText *)t;
- if(U_FAILURE(*pErrorCode)) {
- return 0;
- }
- if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- }
- if(start<0 || start>limit || ts->lengthcontext+start;
- UChar *d=dest;
- const UChar *destLimit;
- int32_t destLength=limit-start;
- if(destLength>destCapacity) {
- destLength=destCapacity;
- }
- destLimit=dest+destLength;
- while(dtoU[*s8++];
- }
- return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
-}
-
-static const UText sbcsText={
- UTEXT_INITIALZIER_HEAD,
- noopTextClone,
- sbcsTextGetProperties,
- sbcsTextLength,
- sbcsTextAccess,
- sbcsTextExtract,
- NULL, // replace
- NULL, // copy
- NULL, // mapOffsetToNative
- NULL, // mapIndexToUTF16
- NULL // close
-};
-
-U_DRAFT UText * U_EXPORT2
-utext_openSBCS(UText * /*ut */,
- const UChar /* toU*/[256] ,
- const char *s, int32_t length,
- UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return NULL;
- }
- if(s==NULL || length<-1) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return NULL;
- }
- SBCSText *ts=(SBCSText *)uprv_malloc(sizeof(SBCSText));
- if(ts==NULL) {
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- return NULL;
- }
- *((UText *)ts)=sbcsText;
- ts->context=s;
- if(length>=0) {
- ts->length=length;
- } else {
- ts->length=(int32_t)uprv_strlen(s);
- }
- return ts;
-}
-
-U_DRAFT void U_EXPORT2
-utext_closeSBCS(UText *t) {
- if(t!=NULL) {
- uprv_free((SBCSText *)t);
- }
-}
-
-U_DRAFT void U_EXPORT2
-utext_resetSBCS(UText *t, const char *s, int32_t length, UErrorCode *pErrorCode) {
- if(U_FAILURE(*pErrorCode)) {
- return;
- }
- if(s==NULL || length<-1) {
- *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
- return;
- }
- SBCSText *ts=(SBCSText *)t;
- ts->context=s;
- if(length>=0) {
- ts->length=length;
- } else {
- ts->length=(int32_t)uprv_strlen(s);
- }
-}
-
-U_CDECL_END
/* UText implementation wrapper for Replaceable (read/write) ---------------- */
@@ -1283,7 +1217,6 @@ unistrTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErro
static int32_t U_CALLCONV
unistrTextGetProperties(UText * /*t*/) {
return
- I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE)|
I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
I32_FLAG(UTEXT_PROVIDER_WRITABLE);
}
@@ -1299,13 +1232,13 @@ unistrTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
const UnicodeString *us = (const UnicodeString *)ut->context;
int32_t length = us->length();
- if (chunk->limit != length) {
+ if (chunk->nativeLimit != length) {
// This chunk is not yet set up. Do it now.
- chunk->contents=us->getBuffer();
- chunk->length=length;
- chunk->start=0;
- chunk->limit=length;
- chunk->nonUTF16Indexes=FALSE;
+ chunk->contents = us->getBuffer();
+ chunk->length = length;
+ chunk->nativeStart = 0;
+ chunk->nativeLimit = length;
+ chunk->nonUTF16Indexes = FALSE;
}
// pin the requested index to the bounds of the string,
@@ -1423,7 +1356,6 @@ utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
ut = utext_setup(ut, 0, status);
if (U_SUCCESS(*status)) {
ut->clone = unistrTextClone;
- ut->properties = unistrTextGetProperties;
ut->length = unistrTextLength;
ut->access = unistrTextAccess;
ut->extract = unistrTextExtract;
@@ -1431,6 +1363,205 @@ utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
ut->copy = unistrTextCopy;
ut->context = s;
+ ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)|
+ I32_FLAG(UTEXT_PROVIDER_WRITABLE);
+ }
+ return ut;
+}
+
+
+//------------------------------------------------------------------------------
+//
+// UText implementation for const UChar * strings
+//
+// Use of UText data members:
+// context pointer to the const UChar * string
+// a length of the string. -1 if not yet known (NUL-terminated input).
+//
+//------------------------------------------------------------------------------
+
+U_CDECL_BEGIN
+
+
+static UText * U_CALLCONV
+ucstrTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErrorCode * /*status*/) {
+// TODO: fix this.
+ return NULL;
+}
+
+
+static int32_t U_CALLCONV
+ucstrTextLength(UText *ut) {
+ if (ut->a < 0) {
+ // null terminated, we don't yet know the length. Scan for it.
+ // Access is not convenient for doing this
+ // because the current iteration position can't be changed.
+ const UChar *str = (const UChar *)ut->context;
+ for (;;) {
+ if (str[ut->chunk.nativeLimit] == 0) {
+ break;
+ }
+ ut->chunk.nativeLimit++;
+ }
+ ut->a = ut->chunk.nativeLimit;
+ ut->chunk.length = ut->chunk.nativeLimit;
+ ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
+ }
+ return ut->a;
+}
+
+
+static UBool U_CALLCONV
+ucstrTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) {
+ const UChar *str = (const UChar *)ut->context;
+
+ // pin the requested index to the bounds of the string,
+ // and set current iteration position.
+ if (index<0) {
+ index = 0;
+ } else if (index < ut->chunk.nativeLimit) {
+ // The requested data is within the chunk as it is known so far.
+ // There is nothing more that needs to be done within this access function.
+ } else if (ut->a >= 0) {
+ // We know the length of this string, and the user is requesting something
+ // at or beyond the length. Trim the requested index to the length.
+ index = ut->a;
+ } else {
+ // Null terminated string, length not yet known.
+ // Scan down another 32 UChars or to the requested index, whichever is further
+ int scanLimit = ut->chunk.nativeLimit + 32;
+ if (scanLimit <= index) {
+ scanLimit = index+1; // TODO: beware int overflow
+ }
+ for (; ut->chunk.nativeLimit<scanLimit; ut->chunk.nativeLimit++) {
+ if (str[ut->chunk.nativeLimit] == 0) {
+ // We found the end of the string. Remember it, trim the index to it,
+ // and bail out of here.
+ ut->a = ut->chunk.nativeLimit;
+ ut->chunk.length = ut->chunk.nativeLimit;
+ if (index > ut->chunk.nativeLimit) {
+ index = ut->chunk.nativeLimit;
+ }
+ ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
+ goto breakout;
+ }
+ }
+ // We scanned through the next batch of UChars without finding the end.
+ // The endpoint of a chunk must not be left in the middle of a surrogate pair.
+ // If the current end is on a lead surrogate, back the end up by one.
+ // It doesn't matter if the end char happens to be an unpaired surrogate,
+ // and it's simpler not to worry about it.
+ if (U16_IS_LEAD(str[ut->chunk.nativeLimit-1])) {
+ --ut->chunk.nativeLimit;
+ }
+ }
+breakout:
+ chunk->offset = index;
+
+ // Check whether request is at the start or end
+ UBool retVal = (forward && index<ut->chunk.nativeLimit) || (!forward && index>0);
+ return retVal;
+}
+
+
+
+static int32_t U_CALLCONV
+ucstrTextExtract(UText *ut,
+ int32_t start, int32_t limit,
+ UChar *dest, int32_t destCapacity,
+ UErrorCode *pErrorCode) {
+
+
+ if(U_FAILURE(*pErrorCode)) {
+ return 0;
+ }
+ if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
+ *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return 0;
+ }
+
+ const UChar *s=(const UChar *)ut->context;
+ int32_t strLength=ut->a;
+ int32_t si, di;
+
+ // If text is null terminated and we haven't yet scanned down as far as the starting
+ // position of the extract, do it now.
+ if (strLength<0 && limit>=ut->chunk.nativeLimit) {
+ ucstrTextAccess(ut, start, TRUE, &ut->chunk);
+ }
+
+ // Raise an error if starting position is outside of the string.
+ if(start<0 || start>limit) {
+ *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ if (strLength >= 0 && limit > strLength) {
+ // String length is known. Trim requested limit to be no more than the length
+ limit = strLength;
+ }
+
+ di = 0;
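+ for (si=start; si<limit; si++) {
+ if (strLength<0 && s[si]==0) {
+ // Just hit the end of a null-terminated string.
+ ut->a = si; // set string length for this UText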
+ ut->chunk.nativeLimit = si;
+ ut->chunk.length = si;
+ //
+ break;
+ }
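+ if (di<destCapacity) {
+ // only store if there is space.
+ dest[di] = s[si];
+ } else {
+ if (strLength>=0) {
+ // We have filled the destination buffer, and the string length is known.
+ // Cut the loop short. There is no need to scan for the string termination.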
+ di = strLength;
+ break;
+ }
+ }
+ di++;
+ }
+
+ u_terminateUChars(dest, destCapacity, di, pErrorCode);
+ return di;
+ }
+
+
+
+U_CDECL_END
+
+
+U_DRAFT UText * U_EXPORT2
+utext_openUChars(UText *ut, const UChar *s, int32_t length, UErrorCode *status) {
+ if (U_FAILURE(*status)) {
+ return NULL;
+ }
+ if (length < -1) {
+ *status = U_ILLEGAL_ARGUMENT_ERROR;
+ return NULL;
+ }
+ ut = utext_setup(ut, 0, status);
+ if (U_SUCCESS(*status)) {
+ ut->clone = noopTextClone;
+ ut->length = ucstrTextLength;
+ ut->access = ucstrTextAccess;
+ ut->extract = ucstrTextExtract;
+ ut->replace = NULL;
+ ut->copy = NULL;
+
+ ut->context = s;
+ ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
+ if (length==-1) {
+ ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
+ }
+ ut->a = length;
+ ut->chunk.contents = s;
+ ut->chunk.nativeStart = 0;
+ ut->chunk.nativeLimit = length>=0? length : 0;
+ ut->chunk.nonUTF16Indexes = FALSE;
}
return ut;
}
diff --git a/icu4c/source/test/cintltst/utexttst.c b/icu4c/source/test/cintltst/utexttst.c
index 5cb5c8301d6..3413a972822 100644
--- a/icu4c/source/test/cintltst/utexttst.c
+++ b/icu4c/source/test/cintltst/utexttst.c
@@ -15,8 +15,10 @@
#include "unicode/utypes.h"
#include "unicode/utext.h"
+#include "unicode/ustring.h"
#include "cintltst.h"
-
+#include "memory.h"
+#include "string.h"
static void TestAPI(void);
@@ -45,7 +47,7 @@ addUTextTest(TestNode** root)
/*
* TestAPI verify that the UText API is accessible from C programs.
* This is not intended to be a complete test of the API functionality. That is
- * in the C++ intltest program.
+ * in the C++ intltest program.
* This test is intended to check that everything can be accessed and built in
* a pure C enviornment.
*/
@@ -55,19 +57,193 @@ static void TestAPI(void) {
UErrorCode status = U_ZERO_ERROR;
UBool gFailed = FALSE;
- UText utLoc = UTEXT_INITIALIZER;
- const char * cString = "Hello, World";
- UChar uString[] = {0x41, 0x42, 0x43, 0};
- uint8_t *utf8String;
- UText *uta;
- UText *utb;
+ // Open
+ {
+ UText utLoc = UTEXT_INITIALIZER;
+ const char * cString = "Hello, World";
+ UChar uString[] = {0x41, 0x42, 0x43, 0};
+ uint8_t *utf8String;
+ UText *uta;
+ UText *utb;
+ UChar c;
- utf8String = (uint8_t *)cString;
- uta = utext_openUTF8(&utLoc, utf8String, -1, &status);
- TEST_SUCCESS(status);
- TEST_ASSERT(uta == &utLoc);
+ status = U_ZERO_ERROR;
+ uta = utext_openUChars(NULL, uString, -1, &status);
+ TEST_SUCCESS(status);
+ c = utext_next32(uta);
+ TEST_ASSERT(c == 0x41);
+ utb = utext_close(uta);
+ TEST_ASSERT(utb == NULL);
- uta = utext_close(&utLoc);
- TEST_ASSERT(uta == &utLoc);
+ utf8String = (uint8_t *)cString;
+ uta = utext_openUTF8(&utLoc, utf8String, -1, &status);
+ TEST_SUCCESS(status);
+ TEST_ASSERT(uta == &utLoc);
+
+ uta = utext_close(&utLoc);
+ TEST_ASSERT(uta == &utLoc);
+ }
+
+ // utext_clone()
+ {
+ UChar uString[] = {0x41, 0x42, 0x43, 0};
+ int len;
+ UText *uta;
+ UText *utb;
+
+ status = U_ZERO_ERROR;
+ uta = utext_openUChars(NULL, uString, -1, &status);
+ TEST_SUCCESS(status);
+ utb = utext_clone(NULL, uta, FALSE, &status);
+ TEST_SUCCESS(status);
+ TEST_ASSERT(utb != NULL);
+ TEST_ASSERT(utb != uta);
+ len = utext_length(uta);
+ TEST_ASSERT(len == u_strlen(uString));
+ utext_close(uta);
+ utext_close(utb);
+ }
+
+ // basic access functions
+ {
+ UChar uString[] = {0x41, 0x42, 0x43, 0};
+ UText *uta;
+ UChar32 c;
+ int32_t len;
+ UBool b;
+ int32_t i;
+
+ status = U_ZERO_ERROR;
+ uta = utext_openUChars(NULL, uString, -1, &status);
+ TEST_ASSERT(uta!=NULL);
+ TEST_SUCCESS(status);
+ b = utext_isLengthExpensive(uta);
+ TEST_ASSERT(b==TRUE);
+ len = utext_length(uta);
+ TEST_ASSERT(len == u_strlen(uString));
+ b = utext_isLengthExpensive(uta);
+ TEST_ASSERT(b==FALSE);
+
+ c = utext_char32At(uta, 0);
+ TEST_ASSERT(c==uString[0]);
+
+ c = utext_current(uta);
+ TEST_ASSERT(c==uString[0]);
+
+ c = utext_next32(uta);
+ TEST_ASSERT(c==uString[0]);
+ c = utext_current(uta);
+ TEST_ASSERT(c==uString[1]);
+
+ c = utext_previous32(uta);
+ TEST_ASSERT(c==uString[0]);
+ c = utext_current(uta);
+ TEST_ASSERT(c==uString[0]);
+
+ c = utext_next32From(uta, 1);
+ TEST_ASSERT(c==uString[1]);
+ c = utext_next32From(uta, u_strlen(uString));
+ TEST_ASSERT(c==U_SENTINEL);
+
+ c = utext_previous32From(uta, 2);
+ TEST_ASSERT(c==uString[1]);
+ i = utext_getIndex(uta);
+ TEST_ASSERT(i == 1);
+
+ utext_setIndex(uta, 0);
+ b = utext_moveIndex(uta, 1);
+ TEST_ASSERT(b==TRUE);
+ i = utext_getIndex(uta);
+ TEST_ASSERT(i==1);
+
+ b = utext_moveIndex(uta, u_strlen(uString)-1);
+ TEST_ASSERT(b==TRUE);
+ i = utext_getIndex(uta);
+ TEST_ASSERT(i==u_strlen(uString));
+
+ b = utext_moveIndex(uta, 1);
+ TEST_ASSERT(b==FALSE);
+ i = utext_getIndex(uta);
+ TEST_ASSERT(i==u_strlen(uString));
+
+ utext_setIndex(uta, 0);
+ c = UTEXT_NEXT32(uta);
+ TEST_ASSERT(c==uString[0]);
+ c = utext_current(uta);
+ TEST_ASSERT(c==uString[1]);
+
+ c = UTEXT_PREVIOUS32(uta);
+ TEST_ASSERT(c==uString[0]);
+ c = UTEXT_PREVIOUS32(uta);
+ TEST_ASSERT(c==U_SENTINEL);
+
+
+ utext_close(uta);
+ }
+
+ {
+ //
+ // extract
+ //
+ UText *uta;
+ UChar uString[] = {0x41, 0x42, 0x43, 0};
+ UChar buf[100];
+ int32_t i;
+
+ status = U_ZERO_ERROR;
+ uta = utext_openUChars(NULL, uString, -1, &status);
+ TEST_SUCCESS(status);
+
+ status = U_ZERO_ERROR;
+ i = utext_extract(uta, 0, 100, NULL, 0, &status);
+ TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR);
+ TEST_ASSERT(i == u_strlen(uString));
+
+ status = U_ZERO_ERROR;
+ memset(buf, 0, sizeof(buf));
+ i = utext_extract(uta, 0, 100, buf, 100, &status);
+ TEST_SUCCESS(status);
+ TEST_ASSERT(i == u_strlen(uString));
+ i = u_strcmp(uString, buf);
+ TEST_ASSERT(i == 0);
+ utext_close(uta);
+ }
+
+ {
+ //
+ // Copy, Replace, isWritable
+ // Can't create an editable UText from plain C, so all we
+ // can easily do is check that errors are returned.
+ UText *uta;
+ UChar uString[] = {0x41, 0x42, 0x43, 0};
+ UBool b;
+
+ status = U_ZERO_ERROR;
+ uta = utext_openUChars(NULL, uString, -1, &status);
+ TEST_SUCCESS(status);
+
+ b = utext_isWriteble(uta);
+ TEST_ASSERT(b == FALSE);
+
+ b = utext_hasMetaData(uta);
+ TEST_ASSERT(b == FALSE);
+
+ utext_replace(uta,
+ 0, 1, /* start, limit */
+ uString, -1, /* replacement, replacement length */
+ &status);
+ TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
+
+
+ utext_copy(uta,
+ 0, 1, /* start, limit */
+ 2, /* destination index */
+ FALSE, /* move flag */
+ &status);
+ TEST_ASSERT(status == U_NO_WRITE_PERMISSION);
+
+ }
+
+
+}
-}
\ No newline at end of file
diff --git a/icu4c/source/test/intltest/utxttest.cpp b/icu4c/source/test/intltest/utxttest.cpp
index 5a7c88db613..72bb7308cc1 100644
--- a/icu4c/source/test/intltest/utxttest.cpp
+++ b/icu4c/source/test/intltest/utxttest.cpp
@@ -285,31 +285,6 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c
}
}
- cpIndex = 0;
- for (i=0; i