diff --git a/icu4c/source/common/unicode/utext.h b/icu4c/source/common/unicode/utext.h index 96a44e66f17..44d29c3f36c 100644 --- a/icu4c/source/common/unicode/utext.h +++ b/icu4c/source/common/unicode/utext.h @@ -45,6 +45,63 @@ * or system with a unique text storage format can implement a set of * UText provider functions for that format, which will then allow other * ICU services to operate on that format. + * + * + * Iterating over text + * + * Here is sample code for a forward iteration over the contents of a UText + * + * \code + * UChar32 c; + * UText *ut = whatever(); + * + * for (c=utext_next32From(ut, 0); c!=U_SENTINEL; c=utext_next32(ut)) { + * // do whatever with the codepoint c here. + * } + * \endcode + * + * And here is similar code to iterate in the reverse direction, from the end + * of the text towards the beginning. + * + * \code + * UChar32 c; + * UText *ut = whatever(); + * int textLength = utext_length(ut); + * for (c=utext_previous32From(ut, textLength); c!=U_SENTINEL; c=utext_previous32(ut)) { + * // do whatever with the codepoint c here. + * } + * \endcode + * + * Characters and Indexing + * + * Indexing into text by UText functions is nearly always in terms of the native + * indexing of the underlying text storage. The storage format could be utf-8 + * or utf-32, for example. When coding to the UText access API, no assumptions + * can be made regarding the size of characters, or how far an index + * may move when iterating between characters. + * + * All indices supplied to UText functions are pinned to the length of the + * text. An out-of-bounds index is not considered to be an error, but is + * adjusted to be in the range 0 <= index <= length of input text. + * + * + * When an index position is returned from a UText function, it will be + * a native index to the underlying text. In the case of multi-unit characters, + * it will always refer to the first position, never to the interior. 
This + * is essentially the same thing as saying that a returned index will always + * point to a boundary between characters. + * + * When a native index is supplied to a UText function, all indices that + * refer to any part of a multi-unit character representation are considered + * to be equivalent. In the case of multi-unit characters, an incoming index + * will be logically normalized to refer to the start of the character. + * + * It is possible to test whether a native index is on a code point boundary + * by doing a utext_setIndex() followed by a utext_getIndex(). + * If the index is returned unchanged, it was on a code point boundary. If + * an adjusted index is returned, the original index referred to the + * interior of a character. + * */ @@ -102,8 +159,10 @@ utext_close(UText *ut); /** * Open a read-only UText implementation for UTF-8 strings. * - * Any invalid utf-8 sequences in the input will appear on the output side - * of the UText as Unicode Replacement characters, \uFFFD. + * Any invalid utf-8 in the input will be handled in this way: + * a sequence of bytes that has the form of a truncated, but otherwise valid, + * utf-8 sequence will be replaced by a single unicode replacement character, \uFFFD. + * Any other illegal bytes will each be replaced by a \uFFFD. * * @param ut Pointer to a UText struct. If NULL, a new UText will be created. * If non-NULL, must refer to an initialized UText struct, which will then @@ -119,31 +178,6 @@ utext_close(UText *ut); U_DRAFT UText * U_EXPORT2 utext_openUTF8(UText *ut, const uint8_t *s, int32_t length, UErrorCode *status); -/** - * Open a read-only UText implementation for a SBCS strings. - * The implementation converts 1:1 according to the provided mapping table. - * Supplementary code points are not supported. - * - * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 
- * If non-NULL, must refer to an initialized UText struct, which will then - * be reset to reference the specified input string. - * @param toU Mapping table for conversion from SBCS to Unicode (BMP only). - * The mapping table must be available during the lifetime of the - * UText object. - * @param s A byte text string - * @param length The length of the input string in bytes, or -1 if the string is - * zero terminated. - * @param status Errors are returned here. - * @return A pointer to the UText. If a pre-allocated UText was provided, it - * will always be used and returned. - * @draft ICU 3.4 - */ -U_DRAFT UText * U_EXPORT2 -utext_openSBCS(UText *ut, - const UChar toU[256], - const char *s, int32_t length, - UErrorCode *status); - /** * Open a read-only UText for UChar * string. @@ -160,12 +194,12 @@ utext_openSBCS(UText *ut, * @draft ICU 3.4 */ U_DRAFT UText * U_EXPORT2 -utext_openUChar(UText *ut, const UChar *s, int32_t length, UErrorCode *status); +utext_openUChars(UText *ut, const UChar *s, int32_t length, UErrorCode *status); #ifdef XP_CPLUSPLUS /** - * Open a UText for a UnicodeString. + * Open a writable UText for a non-const UnicodeString. * * @param t Pointer to a UText struct. If NULL, a new UText will be created. * If non-NULL, must refer to an initialized UText struct, which will then @@ -181,7 +215,7 @@ utext_openUnicodeString(UText *t, UnicodeString *s, UErrorCode *status); /** - * Open a UText for a const UnicodeString. The resulting UText will not be writeable. + * Open a UText for a const UnicodeString. The resulting UText will not be writable. * * @param t Pointer to a UText struct. If NULL, a new UText will be created. * If non-NULL, must refer to an initialized UText struct, which will then @@ -254,7 +288,7 @@ utext_clone(UText *dest, const UText *src, UBool deep, UErrorCode *status); /** * Get the length of the text. Depending on the characteristics * of the underlying text represenation, this may be expensive. 
- * @see utext_lengthIsExpensive() + * @see utext_isLengthExpensive() * * * @param ut the text to be accessed. @@ -269,28 +303,33 @@ utext_length(UText *ut); * Return TRUE if calculating the length of the text could be expensive. * Finding the length of NUL terminated strings is considered to be expensive. * + * Note that the value of this function may change + * as the result of other operations on a UText. + * Once the length of a string has been discovered, it will no longer + * be expensive to report it. + * * @param ut the text to be accessed. * @return TRUE if determining the lenght of the text could be time consuming. * @draft ICU 3.4 */ U_DRAFT UBool U_EXPORT2 -utext_lengthIsExpensive(const UText *ut); +utext_isLengthExpensive(const UText *ut); /** * Returns the code point at the requested index, * or U_SENTINEL (-1) if it is out of bounds. - * Sets the current iteration position to the specified index. * * If the specified index points to the interior of a multi-unit * character - one of the trail bytes of a utf-8 sequence, for example - - * the complete code point will be returned, and the current - * iteration position will be left at the start of the code point. + * the complete code point will be returned. * - * TODO: drop this function as being dangerous? There is no clean way for applications - * to increment the index, which is in native units. Likely user error to increment - * it by utf-16 units. next32From(index) does same thing, except for where iteration - * position is left. + * The iteration position will be set to the start of the returned code point. * + * This function is roughly equivalent to the the sequence + * utext_setIndex(index); + * utext_current(); + * (There is a difference if the index is out of bounds by being less than zero) + * * @param ut the text to be accessed * @param the native index of the character to be accessed. 
If the index points * to other than the first unit of a multi-unit character, it will be adjusted @@ -299,7 +338,7 @@ utext_lengthIsExpensive(const UText *ut); * @draft ICU 3.4 */ U_DRAFT UChar32 U_EXPORT2 -utext_char32At(UText *ut, int32_t index); +utext_char32At(UText *ut, int32_t nativeIndex); /** @@ -309,7 +348,7 @@ utext_char32At(UText *ut, int32_t index); * the input text. * * @param ut the text to be accessed. - * @return the Unicode code point at the specified index. + * @return the Unicode code point at the current iterator position. * @draft ICU 3.4 */ U_DRAFT UChar32 U_EXPORT2 @@ -358,20 +397,17 @@ utext_previous32(UText *ut); * and return the code point starting at or before that index. * Leave the iteration index at the start of the following code point. * - * An inline macro version of this function, UTEXT_NEXT32FROM(), - * is available for performance critical use. - * This function is the most efficient and convenient way to * begin a forward iteration. * * @param ut the text to be accessed. - * @param index Iteration index. + * @param nativeIndex Iteration index, in the native units of the text provider. * @return Code point which starts at or before index, * or U_SENTINEL (-1) if it is out of bounds. * @draft ICU 3.4 */ U_DRAFT UChar32 U_EXPORT2 -utext_next32From(UText *ut, int32_t index); +utext_next32From(UText *ut, int32_t nativeIndex); @@ -380,21 +416,18 @@ utext_next32From(UText *ut, int32_t index); * one specified by the initial index. Leave the iteration position * at the start of the returned code point. * - * An inline macro version of this function, UTEXT_PREVIOUS32FROM(), - * is available for performance critical use. - * This function is the most efficient and convenient way to * begin a backwards iteration. * * @param ut the text to be accessed. - * @param index Iteration index. + * @param nativeIndex Iteration index, in the native units of the text provider. 
* @return Code point preceding the one at the initial index, * or U_SENTINEL (-1) if it is out of bounds. * * @draft ICU 3.4 */ U_DRAFT UChar32 U_EXPORT2 -utext_previous32From(UText *ut, int32_t index); +utext_previous32From(UText *ut, int32_t nativeIndex); /** * Get the current iterator position, which can range from 0 to @@ -405,33 +438,40 @@ utext_previous32From(UText *ut, int32_t index); * code point boundary * * @param ut the text to be accessed. - * @return the current index position, in native units. + * @return the current index position, in the native units of the text provider. * @draft ICU 3.4 */ U_DRAFT int32_t U_EXPORT2 utext_getIndex(UText *ut); /** - * Set the current iteration position to the specified index. + * Set the current iteration position to the nearest code point + * boundary at or preceding the specified index. * The index is in the native units of the original input text. - * If the index is out of range, it will be trimmed to be witnin + * If the index is out of range, it will be trimmed to be within * the range of the input text. - * If the specifed index does not fall on a code point boundary in - * the input text, it will be adjusted back to do so. *

* It will usually be more efficient to begin an iteration * using the functions utext_next32From() or utext_previous32From() * rather than setIndex(). + *

+ * Moving the index position to an adjacent character is best done + * with utext_next32(), utext_previous32() or utext_moveIndex(). + * Attempting to do direct arithmetic on the index position is + * complicated by the fact that the size (in native units) of a + * character depends on the underlying representation of the character + * (utf-8, utf-16, utf-32, arbitrary codepage), and is not + * easily knowable. * * @param ut the text to be accessed. * @param index the native unit index of the new iteration position. * @draft ICU 3.4 */ U_DRAFT void U_EXPORT2 -utext_setIndex(UText *ut, int32_t index); +utext_setIndex(UText *ut, int32_t nativeIndex); /** - * Move the iterator postion by delta code points. The amount to move + * Move the iterator position by delta code points. The number of code points * is a signed number; a negative delta will move the iterator backwards, * towards the start of the text. *

@@ -439,6 +479,10 @@ utext_setIndex(UText *ut, int32_t index); * forward or backward, but no further backward than to 0 and * no further forward than to length(). * The resulting index value will be in between 0 and length(), inclusive. + *

+ * Because the index is kept in the native units of the text provider, the + * actual numeric amount by which the index moves depends on the + * underlying text storage representation of the text provider. * * @param ut the text to be accessed. * @param delta the signed number of code points to move the iteration position. @@ -467,22 +511,21 @@ utext_moveIndex(UText *ut, int32_t delta); * @param ut the UText from which to extract data. * @param start the native index of the first character to extract. * @param limit the native string index of the position following the last - * character to extract. + * character to extract. If the specified limit is greater than the length + * of the text, the limit will be trimmed back to the text length. * @param dest the UChar (utf-16) buffer into which the extracted text is placed * @param destCapacity The size, in UChars, of the destination buffer. May be zero * for precomputing the required size. * @param status receives any error status. * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the * buffer was too small. Returns number of UChars for preflighting. - * @return Number of UChars in the data. Does not include a trailing NUL. - * - * TODO: how should invalid source data be handled? Corrupt utf-8, for example. + * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. * * @draft ICU 3.4 */ U_DRAFT int32_t U_EXPORT2 utext_extract(UText *ut, - int32_t start, int32_t limit, + int32_t nativeStart, int32_t nativeLimit, UChar *dest, int32_t destCapacity, UErrorCode *status); @@ -533,38 +576,6 @@ utext_extract(UText *ut, (ut)->chunk.contents[--((ut)->chunk.offset)] : utext_previous32(ut)) -/** - * inline version of utext_next32from(), for performance-critical situations. - * - * Set the iteration index, access the text for forward iteration, - * and return the code point starting at or before that index. - * Leave the iteration index at the start of the following code point. 
- * - * @draft ICU 3.4 - */ -#define UTEXT_NEXT32FROM(ut, index) \ - ((index) >= (ut)->chunk.start && \ - (index) < (ut)->chunk.limit && \ - !(ut)->chunk.nonUTF16Indexes && \ - (ut)->chunk.contents[(ut)->chunk.offset=(ut)->chunk.start+(index)] < 0xd800 ? \ - (ut)->chunk.contents[((ut)->chunk.offset)++] : utext_next32From(ut, index)) - -/** - * inline version of utext_previous32from(), for performance-critical situations. - * - * Set the iteration index, and return the code point preceding the - * one specified by the initial index. Leave the iteration position - * at the start of the returned code point. - * - * @draft ICU 3.4 - */ -#define UTEXT_PREVIOUS32FROM(ut, index) \ - ((index) > (ut)->chunk.start && \ - (index) <= (ut)->chunk.limit && \ - !(ut)->chunk.nonUTF16Indexes && \ - (ut)->chunk.contents[(ut)->chunk.offset=(ut)->chunk.start+(index-1)] < 0xd800 ? \ - (ut)->chunk.contents[(ut)->chunk.offset] : utext_previous32From(ut, index)) - /************************************************************************************ @@ -587,7 +598,7 @@ utext_extract(UText *ut, * */ U_DRAFT UBool U_EXPORT2 -utext_isWriteable(const UText *ut); +utext_isWriteble(const UText *ut); /** @@ -609,7 +620,7 @@ utext_hasMetaData(const UText *ut); * newly inserted replacement text. * * This function is only available on UText types that support writing, - * that is, ones where utext_isWriteable() returns TRUE. + * that is, ones where utext_isWritable() returns TRUE. * * When using this function, there should be only a single UText opened onto the * underlying native text string. Behavior after a replace operation @@ -617,8 +628,8 @@ utext_hasMetaData(const UText *ut); * modified string. * * @param ut the UText representing the text to be operated on. - * @param start the native index of the start of the region to be replaced - * @param limit the native index of the character following the region to be replaced. 
+ * @param nativeStart the native index of the start of the region to be replaced + * @param nativeLimit the native index of the character following the region to be replaced. * @param replacementText pointer to the replacement text * @param replacmentLength length of the replacement text, or -1 if the text is NUL terminated. * @param status receives any error status. Possible errors include @@ -631,7 +642,7 @@ utext_hasMetaData(const UText *ut); */ U_DRAFT int32_t U_EXPORT2 utext_replace(UText *ut, - int32_t start, int32_t limit, + int32_t nativeStart, int32_t nativeLimit, const UChar *replacementText, int32_t replacementLength, UErrorCode *status); @@ -648,25 +659,25 @@ utext_replace(UText *ut, * it does not replace or overwrite any existing text. * * This function is only available on UText types that support writing, - * that is, ones where utext_isWriteable() returns TRUE. + * that is, ones where utext_isWritable() returns TRUE. * * When using this function, there should be only a single UText opened onto the * underlying native text string. Behavior after a copy operation * on a UText is undefined in any other additional UTexts that refer to the * modified string. * - * @param ut The UText representing the text to be operated on. - * @param start The native index of the start of the region to be copied or moved - * @param limit The native index of the character following the region to be replaced. - * @param destIndex The native destination index to which the source substring is copied or moved. - * @param move If TRUE, then the substring is moved, not copied/duplicated. - * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION + * @param ut The UText representing the text to be operated on. + * @param nativeStart The native index of the start of the region to be copied or moved + * @param nativeLimit The native index of the character following the region to be replaced. 
+ * @param destIndex The native destination index to which the source substring is copied or moved. + * @param move If TRUE, then the substring is moved, not copied/duplicated. + * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION * * @draft ICU 3.4 */ U_DRAFT void U_EXPORT2 utext_copy(UText *ut, - int32_t start, int32_t limit, + int32_t nativeStart, int32_t nativeLimit, int32_t destIndex, UBool move, UErrorCode *status); @@ -709,10 +720,10 @@ struct UTextChunk { int32_t length; /** (Native) text index corresponding to the start of the chunk. */ - int32_t start; + int32_t nativeStart; /** (Native) text index corresponding to the end of the chunk (contents+length). */ - int32_t limit; + int32_t nativeLimit; /** If TRUE, then non-UTF-16 indexes are used in this chunk. */ UBool nonUTF16Indexes; @@ -739,10 +750,10 @@ enum { */ UTEXT_PROVIDER_NON_UTF16_INDEXES, /** - * The provider can return the text length inexpensively. + * It is potentially time consuming for the provider to determine the length of the text. * @draft ICU 3.4 */ - UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE, + UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE, /** * Text chunks remain valid and usable until the text object is modified or * deleted, not just until the next time the access() function is called @@ -799,18 +810,6 @@ enum { typedef UText * U_CALLCONV UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); -/** - * Function type declaration for UText.GetProperties(). - * - * Gets the provider properties for this UText. - * - * @param ut the UText to get properties for. - * @return Provider properties bit field. - * - * @draft ICU 3.4 - */ -typedef int32_t U_CALLCONV -UTextGetProperties(UText *ut); /** * Function type declaration for UText.length(). @@ -821,7 +820,7 @@ UTextGetProperties(UText *ut); * @draft ICU 3.4 */ typedef int32_t U_CALLCONV -UTextLength(UText *ut); +UTextNativeLength(UText *ut); /** * Function type declaration for UText.access(). 
Get the description of the text chunk @@ -830,22 +829,22 @@ UTextLength(UText *ut); * of bounds, the iteration position will be left at the start or end * of the string, as appropriate. * - * @param ut the UText being accessed. - * @param index Requested (native) index of the text to be accessed. - * @param forward If TRUE, then the returned chunk must contain text - * starting from the index, so that start<=indexlength. - * @return Absolute (native) index corresponding to the UTF-16 offset - * relative to the current text chunk. + * @return Absolute (native) index corresponding to the specified chunk offset. + * The returned native index should always be to a code point boundary. * * @draft ICU 3.4 */ typedef int32_t U_CALLCONV -UTextMapOffsetToNative(UText *ut, UTextChunk *chunk, int32_t offset); +UTextMapOffsetToNative(UText *ut, int32_t offset); /** * Function type declaration for UText.mapIndexToUTF16(). - * This is required only for text providers that do not use native utf-16 indexes. + * Map from a native index to a UChar offset within a text chunk * - * @param ut The UText containing the text chunk. - * @param chunk the text chunk in which the mapping occurs. - * TODO: keep this as a separate parameter, or just imply that the function - * works on the chunk embedded in the UText? - * @param index Absolute (native) text index, chunk->start<=index<=chunk->limit. - * @return Chunk-relative UTF-16 offset corresponding to the absolute (native) - * index. + * This function is required only for text providers that do not use native utf-16 indexes. * - * @see UText + * @param ut The UText containing the text chunk. + * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit. + * @return Chunk-relative UTF-16 offset corresponding to the specified native + * index. + * + * TODO: specify behavior with out-of-bounds index? Shouldn't ever occur. 
* @draft ICU 3.4 */ typedef int32_t U_CALLCONV -UTextMapIndexToUTF16(UText *ut, UTextChunk *chunk, int32_t index); +UTextMapIndexToUTF16(UText *ut, int32_t nativeIndex); /** @@ -1077,6 +1077,15 @@ struct UText { int32_t a, b, c; + /** + * Text provider properties. This set of flags is maintainted by the + * text provider implementation. + * @draft ICU 3.4 + */ + int32_t providerProperties; + + + /** desciptor for the text chunk that includes or is adjacent to * the current iteration position. * @draft ICU 3.4 @@ -1084,14 +1093,6 @@ struct UText { UTextChunk chunk; - /** - * Text provider properties - * @draft ICU 3.4 - */ - int32_t providerProperties; - - - /** * (public) Function pointer for UTextClone * @@ -1100,14 +1101,6 @@ struct UText { */ UTextClone *clone; - /** - * (public) function pointer for UTextGetProperties - * - * @see UTextGetProperties - * @draft ICU 3.4 - */ - UTextGetProperties *properties; - /** * (public) function pointer for UTextLength * May be expensive to compute! @@ -1115,7 +1108,7 @@ struct UText { * @see UTextLength * @draft ICU 3.4 */ - UTextLength *length; + UTextNativeLength *length; /** * (public) Function pointer for UTextAccess. 
@@ -1224,16 +1217,16 @@ enum { * @internal */ #define UTEXT_INITIALZIER_HEAD \ - NULL, /* context */ \ - NULL, NULL, NULL, /* p, q, r */ \ - NULL, /* pExtra */ \ - 0, /* extraSize */ \ - 0, /* flags */ \ - UTEXT_MAGIC, /* magic */ \ - sizeof(UText), /* sizeOfStruct */ \ - 0, 0, 0, /* a, b, c */ \ - UTEXT_CHUNK_INIT, /* UTextChunk */ \ - -1 /* provderProps */ + NULL, /* context */ \ + NULL, NULL, NULL, /* p, q, r */ \ + NULL, /* pExtra */ \ + 0, /* extraSize */ \ + 0, /* flags */ \ + UTEXT_MAGIC, /* magic */ \ + sizeof(UText), /* sizeOfStruct */ \ + 0, 0, 0, /* a, b, c */ \ + 0, /* providerProps */ \ + UTEXT_CHUNK_INIT /* UTextChunk */ @@ -1247,7 +1240,6 @@ enum { #define UTEXT_INITIALIZER { \ UTEXT_INITIALZIER_HEAD, \ NULL, /* clone () */ \ - NULL, /* properties ()*/ \ NULL, /* length () */ \ NULL, /* access () */ \ NULL, /* extract () */ \ diff --git a/icu4c/source/common/utext.cpp b/icu4c/source/common/utext.cpp index a8e872135fc..6751791cbf6 100644 --- a/icu4c/source/common/utext.cpp +++ b/icu4c/source/common/utext.cpp @@ -21,6 +21,7 @@ #include "ustr_imp.h" #include "cmemory.h" #include "cstring.h" +#include "uassert.h" #define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex)) @@ -38,7 +39,7 @@ utext_moveIndex(UText *ut, int32_t delta) { UBool retval = TRUE; if(delta>0) { do { - if(ut->chunk.offset>=ut->chunk.length && !utext_access(ut, ut->chunk.limit, TRUE)) { + if(ut->chunk.offset>=ut->chunk.length && !utext_access(ut, ut->chunk.nativeLimit, TRUE)) { retval = FALSE; break; } @@ -46,7 +47,7 @@ utext_moveIndex(UText *ut, int32_t delta) { } while(--delta>0); } else if (delta<0) { do { - if(ut->chunk.offset<=0 && !utext_access(ut, ut->chunk.start, FALSE)) { + if(ut->chunk.offset<=0 && !utext_access(ut, ut->chunk.nativeStart, FALSE)) { retval = FALSE; break; } @@ -63,12 +64,20 @@ utext_length(UText *ut) { return ut->length(ut); } + +U_DRAFT UBool U_EXPORT2 +utext_isLengthExpensive(const UText *ut) { + UBool r = (ut->providerProperties & 
I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0; + return r; +} + + U_DRAFT int32_t U_EXPORT2 utext_getIndex(UText *ut) { if(!ut->chunk.nonUTF16Indexes || ut->chunk.offset==0) { - return ut->chunk.start+ut->chunk.offset; + return ut->chunk.nativeStart+ut->chunk.offset; } else { - return ut->mapOffsetToNative(ut, &ut->chunk, ut->chunk.offset); + return ut->mapOffsetToNative(ut, ut->chunk.offset); } } @@ -76,23 +85,23 @@ utext_getIndex(UText *ut) { U_DRAFT void U_EXPORT2 utext_setIndex(UText *ut, int32_t index) { - // TODO - revise for keeping index always valid. - if(indexchunk.start || ut->chunk.limitchunk.nativeStart || ut->chunk.nativeLimitaccess(ut, index, TRUE, &ut->chunk); } else if(ut->chunk.nonUTF16Indexes) { - ut->chunk.offset=ut->mapIndexToUTF16(ut, &ut->chunk, index); + ut->chunk.offset=ut->mapIndexToUTF16(ut, index); } else { - ut->chunk.offset=index-ut->chunk.start; + ut->chunk.offset=index-ut->chunk.nativeStart; // Our convention is that the index must always be on a code point boundary. // If we are somewhere in the middle of a utf-16 buffer, check that new index // is not in the middle of a surrogate pair. - if (index>ut->chunk.start && index < ut->chunk.limit) { // TODO: clean up end-of-chunk / end of input handling. Everywhere. + if (index>ut->chunk.nativeStart && index < ut->chunk.nativeLimit) { UChar c = ut->chunk.contents[ut->chunk.offset]; if (U16_TRAIL(c)) { - utext_current(ut); // force index onto a code point boundary. + utext_current(ut); // force index to the start of the curent code point. 
} } } @@ -123,6 +132,18 @@ utext_current(UText *ut) { return c; } + +U_DRAFT UChar32 U_EXPORT2 +utext_char32At(UText *ut, int32_t nativeIndex) { + UChar32 c = U_SENTINEL; + utext_setIndex(ut, nativeIndex); + if (nativeIndex >= 0 && nativeIndex < ut->chunk.nativeLimit) { + c = ut->chunk.contents[ut->chunk.offset]; + } + return c; +} + + U_DRAFT UChar32 U_EXPORT2 utext_next32(UText *ut) { UTextChunk *chunk = &ut->chunk; @@ -130,7 +151,7 @@ utext_next32(UText *ut) { UChar32 c = U_SENTINEL; if (offset >= chunk->length) { - if (ut->access(ut, chunk->limit, TRUE, chunk) == FALSE) { + if (ut->access(ut, chunk->nativeLimit, TRUE, chunk) == FALSE) { goto next32_return; } offset = chunk->offset; @@ -160,7 +181,7 @@ utext_previous32(UText *ut) { UChar32 c = U_SENTINEL; if (offset <= 0) { - if (ut->access(ut, chunk->start, FALSE, chunk) == FALSE) { + if (ut->access(ut, chunk->nativeStart, FALSE, chunk) == FALSE) { goto prev32_return; } offset = chunk->offset; @@ -186,16 +207,16 @@ utext_next32From(UText *ut, int32_t index) { UTextChunk *chunk = &ut->chunk; UChar32 c = U_SENTINEL; - if(indexstart || index>=chunk->limit) { + if(indexnativeStart || index>=chunk->nativeLimit) { if(!ut->access(ut, index, TRUE, chunk)) { // no chunk available here goto next32return; } offset = chunk->offset; } else if(chunk->nonUTF16Indexes) { - offset=ut->mapIndexToUTF16(ut, chunk, index); + offset=ut->mapIndexToUTF16(ut, index); } else { - offset = index - chunk->start; + offset = index - chunk->nativeStart; } c = chunk->contents[offset++]; @@ -220,16 +241,16 @@ utext_previous32From(UText *ut, int32_t index) { UTextChunk *chunk = &ut->chunk; UChar32 c = U_SENTINEL; - if(index<=chunk->start || index>chunk->limit) { + if(index<=chunk->nativeStart || index>chunk->nativeLimit) { if(!ut->access(ut, index, FALSE, chunk)) { // no chunk available here goto prev32return; } offset = chunk->offset; } else if(chunk->nonUTF16Indexes) { - offset=ut->mapIndexToUTF16(ut, chunk, index); + 
offset=ut->mapIndexToUTF16(ut, index); } else { - offset = index - chunk->start; + offset = index - chunk->nativeStart; } offset--; @@ -253,6 +274,66 @@ utext_extract(UText *ut, } + + +U_DRAFT UBool U_EXPORT2 +utext_isWriteble(const UText *ut) +{ + UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0; + return b; +} + + +U_DRAFT UBool U_EXPORT2 +utext_hasMetaData(const UText *ut) +{ + UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0; + return b; +} + + + +U_DRAFT int32_t U_EXPORT2 +utext_replace(UText *ut, + int32_t nativeStart, int32_t nativeLimit, + const UChar *replacementText, int32_t replacementLength, + UErrorCode *status) +{ + if (U_FAILURE(*status)) { + return 0; + } + if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { + *status = U_NO_WRITE_PERMISSION; + return 0; + } + int32_t i = ut->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status); + return i; +} + +U_DRAFT void U_EXPORT2 +utext_copy(UText *ut, + int32_t nativeStart, int32_t nativeLimit, + int32_t destIndex, + UBool move, + UErrorCode *status) +{ + if (U_FAILURE(*status)) { + return; + } + if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { + *status = U_NO_WRITE_PERMISSION; + return; + } + ut->copy(ut, nativeStart, nativeLimit, destIndex, move, status); +} + + + +U_DRAFT UText * U_EXPORT2 +utext_clone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { + return src->clone(dest, src, deep, status); +} + U_DRAFT UBool U_EXPORT2 utext_compare(UText *ut, const UChar *s, int32_t length, UBool codePointOrder) { int32_t segLength, result; @@ -285,7 +366,7 @@ utext_compare(UText *ut, const UChar *s, int32_t length, UBool codePointOrder) { return 0; } - if(!ut->access(ut, ut->chunk.limit, TRUE, &ut->chunk)) { + if(!ut->access(ut, ut->chunk.nativeLimit, TRUE, &ut->chunk)) { // the text ends before the string does return -1; } @@ -321,9 +402,10 @@ enum { // // Extended 
form of a UText. The purpose is to aid in computing the total size required // when a provider asks for a UText to be allocated with extra storage. -// -struct ExtendedUText: public UText { - void *extension; + +struct ExtendedUText { + UText ut; + UAlignedMemory extension; }; static const UText emptyText = UTEXT_INITIALIZER; @@ -338,14 +420,18 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { // We need to heap-allocate storage for the new UText int32_t spaceRequired = sizeof(UText); if (extraSpace > 0) { - spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(void *); + spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory); } ut = (UText *)uprv_malloc(spaceRequired); - *ut = emptyText; - ut->flags |= UTEXT_HEAP_ALLOCATED; - if (spaceRequired>0) { - ut->extraSize = spaceRequired; - ut->pExtra = &((ExtendedUText *)ut)->extension; + if (ut == NULL) { + *status = U_MEMORY_ALLOCATION_ERROR; + } else { + *ut = emptyText; + ut->flags |= UTEXT_HEAP_ALLOCATED; + if (extraSpace>0) { /* an extra area exists only when the caller asked for one */ + ut->extraSize = extraSpace; /* size of the extra area alone, not of the whole allocation */ + ut->pExtra = &((ExtendedUText *)ut)->extension; + } } } else { // We have been supplied with an already existing UText. 
@@ -378,6 +464,9 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { } } } + if (U_SUCCESS(*status)) { + ut->flags |= UTEXT_OPEN; + } return ut; } @@ -429,15 +518,15 @@ utext_close(UText *ut) { // static void resetChunk(UTextChunk *chunk, int32_t index) { - if (index==chunk->limit) { + if (index==chunk->nativeLimit) { chunk->offset = chunk->length; - } else if (index==chunk->start) { + } else if (index==chunk->nativeStart) { chunk->offset = 0; } else { - chunk->length = 0; - chunk->start = index; - chunk->limit = index; - chunk->offset = 0; + chunk->length = 0; + chunk->nativeStart = index; + chunk->nativeLimit = index; + chunk->offset = 0; } } @@ -452,16 +541,52 @@ resetChunk(UTextChunk *chunk, int32_t index) { U_CDECL_BEGIN static UText * U_CALLCONV -noopTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErrorCode * /*status*/) { - return NULL; // not supported +// +// Clone. This is a generic copy-the-utext-by-value clone function that can be +// used as-is with some utext types, and as helper by other clones. +// +noopTextClone(UText * dest, const UText * src, UBool deep, UErrorCode * status) { + if (U_FAILURE(*status)) { + return NULL; + } + int32_t srcExtraSize = src->extraSize; + + // + // Use the generic text_setup to allocate storage if required. + // + dest = utext_setup(dest, srcExtraSize, status); + if (U_FAILURE(*status)) { + return dest; + } + + // + // flags (how the UText was allocated) and the pointer to the + // extra storage must retain the values in the cloned utext that + // were set up by utext_setup. Save them separately before + // copying the whole struct. + // + void *destExtra = dest->pExtra; + int32_t flags = dest->flags; + + + // + // Copy the whole UText struct by value. + // Any "Extra" storage is copied also. 
+ // + int sizeToCopy = src->sizeOfStruct; + if (sizeToCopy > dest->sizeOfStruct) { + sizeToCopy = dest->sizeOfStruct; + } + uprv_memcpy(dest, src, sizeToCopy); + dest->pExtra = destExtra; + dest->flags = flags; + if (srcExtraSize > 0) { + uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize); + } + + return dest; } -static int32_t U_CALLCONV -noopTextGetProperties(UText * /*t*/) { - return - I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE)| - I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); -} static int32_t U_CALLCONV noopTextLength(UText * /* t */) { @@ -483,12 +608,12 @@ noopTextExtract(UText * /* t */, } static int32_t U_CALLCONV -noopTextMapOffsetToNative(UText * /* t */, UTextChunk * /* chunk */, int32_t /* offset */) { +noopTextMapOffsetToNative(UText * /* t */, int32_t /* offset */) { return 0; } static int32_t U_CALLCONV -noopTextMapIndexToUTF16(UText * /* t */, UTextChunk * /* chunk */, int32_t /* index */) { +noopTextMapIndexToUTF16(UText * /* t */, int32_t /* index */) { return 0; } @@ -498,7 +623,6 @@ U_CDECL_END static const UText noopText={ UTEXT_INITIALZIER_HEAD, noopTextClone, - noopTextGetProperties, noopTextLength, noopTextAccess, noopTextExtract, @@ -550,14 +674,6 @@ struct UTF8Extra { U_CDECL_BEGIN -static int32_t U_CALLCONV -utf8TextGetProperties(UText * /*t*/) { - return - I32_FLAG(UTEXT_PROVIDER_NON_UTF16_INDEXES)| - I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE); - // not UTEXT_PROVIDER_STABLE_CHUNKS because chunk-related data is kept - // in UTF8Text, so only one at a time can be active -} static int32_t U_CALLCONV utf8TextLength(UText *ut) { @@ -590,7 +706,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) { return FALSE; } - chunk->start=index; + chunk->nativeStart=index; c=s8[index]; if(c<=0x7f) { // get a run of ASCII characters. 
@@ -621,11 +737,11 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) { map[i]=index; chunk->nonUTF16Indexes=TRUE; } - chunk->contents = u16buf; - chunk->length = i; - chunk->limit = index; - ut->q = map; - chunk->offset = 0; // chunkOffset corresponding to index + chunk->contents = u16buf; + chunk->length = i; + chunk->nativeLimit = index; + ut->q = map; + chunk->offset = 0; // chunkOffset corresponding to index return TRUE; } else { // Reverse Access. The chunk buffer must be filled so as to contain the @@ -635,7 +751,7 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) { return FALSE; } - chunk->limit=index; + chunk->nativeLimit=index; c=s8[index-1]; if(c<=0x7f) { // get a chunk of ASCII characters. Don't build the index map @@ -684,10 +800,10 @@ utf8TextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) { chunk->nonUTF16Indexes=TRUE; } // Common reverse iteration, for both UTF16 and non-UTIF16 indexes. - chunk->contents = u16buf+i; - chunk->length = (UTF8_TEXT_CHUNK_SIZE)-i; - chunk->start = index; - chunk->offset = chunk->length; // chunkOffset corresponding to index + chunk->contents = u16buf+i; + chunk->length = (UTF8_TEXT_CHUNK_SIZE)-i; + chunk->nativeStart = index; + chunk->offset = chunk->length; // chunkOffset corresponding to index return TRUE; } } @@ -717,18 +833,20 @@ utf8TextExtract(UText *ut, // Assume nonUTF16Indexes and 0<=offset<=chunk->length static int32_t U_CALLCONV -utf8TextMapOffsetToNative(UText *ut, UTextChunk * /* chunk */, int32_t offset) { +utf8TextMapOffsetToNative(UText *ut, int32_t offset) { // UText.q points to the index mapping array that is allocated in the extra storage area. 
+ U_ASSERT(offset>=0 && offset<=ut->chunk.length); int32_t *map=(int32_t *)(ut->q); return map[offset]; } // Assume nonUTF16Indexes and chunk->start<=index<=chunk->limit static int32_t U_CALLCONV -utf8TextMapIndexToUTF16(UText *ut, UTextChunk * /*chunk */, int32_t index) { +utf8TextMapIndexToUTF16(UText *ut, int32_t index) { int32_t *map=(int32_t *)(ut->q); int32_t offset=0; + U_ASSERT(index>=ut->chunk.nativeStart && index<=ut->chunk.nativeLimit); while(index>map[offset]) { ++offset; } @@ -752,9 +870,9 @@ utext_openUTF8(UText *ut, const uint8_t *s, int32_t length, UErrorCode *status) if (U_FAILURE(*status)) { return ut; } + ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_NON_UTF16_INDEXES); ut->clone = noopTextClone; - ut->properties = utf8TextGetProperties; ut->length = utf8TextLength; ut->access = utf8TextAccess; ut->extract = utf8TextExtract; @@ -777,190 +895,6 @@ U_CDECL_END -//------------------------------------------------------------------------------ -// -// UText implementation for SBCS strings (read-only) -// -// Use of UText data members: -// context pointer to SBCS string -// -//------------------------------------------------------------------------------ - - -enum { SBCS_TEXT_CHUNK_SIZE=10 }; - -struct SBCSText : public UText { - /* pointer to SBCS-to-BMP mapping table */ - const UChar *toU; - /* length of UTF-8 string (in bytes) */ - int32_t length; - /* chunk UChars */ - UChar s[SBCS_TEXT_CHUNK_SIZE]; -}; - - -U_CDECL_BEGIN - -static int32_t U_CALLCONV -sbcsTextGetProperties(UText * /*t*/) { - return - I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE); - // not UTEXT_PROVIDER_STABLE_CHUNKS because chunk-related data is kept - // in SBCSText, so only one at a time can be active -} - -static int32_t U_CALLCONV -sbcsTextLength(UText *t) { - return ((SBCSText *)t)->length; -} - -static UBool U_CALLCONV -sbcsTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) { - SBCSText *ts=(SBCSText *)ut; - const uint8_t *s8=(const uint8_t 
*)ts->context; - int32_t i, count, length=ts->length; - - chunk->nonUTF16Indexes=FALSE; - - if(forward) { - if(length<=index) { - resetChunk(chunk, length); - return FALSE; - } - - count=length-index; - if(count>SBCS_TEXT_CHUNK_SIZE) { - count=SBCS_TEXT_CHUNK_SIZE; - } - chunk->start=index; - for(i=0; is[i]=ts->toU[s8[index]]; - } - chunk->contents=ts->s; - chunk->length=i; - chunk->limit=index; - chunk->offset = 0; // chunkOffset corresponding to index - return TRUE; - } else { - if(index<=0) { - resetChunk(chunk, 0); - return FALSE; - } - - if(index<=SBCS_TEXT_CHUNK_SIZE) { - count=index; - } else { - count=SBCS_TEXT_CHUNK_SIZE; - } - chunk->limit=index; - for(i=count; i>0;) { - ts->s[--i]=ts->toU[s8[--index]]; - } - chunk->contents=ts->s; - chunk->length=count; - chunk->start=index; - chunk->offset=count; // chunkOffset corresponding to index - return TRUE; - } -} - -static int32_t U_CALLCONV -sbcsTextExtract(UText *t, - int32_t start, int32_t limit, - UChar *dest, int32_t destCapacity, - UErrorCode *pErrorCode) { - SBCSText *ts=(SBCSText *)t; - if(U_FAILURE(*pErrorCode)) { - return 0; - } - if(destCapacity<0 || (dest==NULL && destCapacity>0)) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - } - if(start<0 || start>limit || ts->lengthcontext+start; - UChar *d=dest; - const UChar *destLimit; - int32_t destLength=limit-start; - if(destLength>destCapacity) { - destLength=destCapacity; - } - destLimit=dest+destLength; - while(dtoU[*s8++]; - } - return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); -} - -static const UText sbcsText={ - UTEXT_INITIALZIER_HEAD, - noopTextClone, - sbcsTextGetProperties, - sbcsTextLength, - sbcsTextAccess, - sbcsTextExtract, - NULL, // replace - NULL, // copy - NULL, // mapOffsetToNative - NULL, // mapIndexToUTF16 - NULL // close -}; - -U_DRAFT UText * U_EXPORT2 -utext_openSBCS(UText * /*ut */, - const UChar /* toU*/[256] , - const char *s, int32_t length, - UErrorCode *pErrorCode) { - if(U_FAILURE(*pErrorCode)) { - return 
NULL; - } - if(s==NULL || length<-1) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - SBCSText *ts=(SBCSText *)uprv_malloc(sizeof(SBCSText)); - if(ts==NULL) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return NULL; - } - *((UText *)ts)=sbcsText; - ts->context=s; - if(length>=0) { - ts->length=length; - } else { - ts->length=(int32_t)uprv_strlen(s); - } - return ts; -} - -U_DRAFT void U_EXPORT2 -utext_closeSBCS(UText *t) { - if(t!=NULL) { - uprv_free((SBCSText *)t); - } -} - -U_DRAFT void U_EXPORT2 -utext_resetSBCS(UText *t, const char *s, int32_t length, UErrorCode *pErrorCode) { - if(U_FAILURE(*pErrorCode)) { - return; - } - if(s==NULL || length<-1) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return; - } - SBCSText *ts=(SBCSText *)t; - ts->context=s; - if(length>=0) { - ts->length=length; - } else { - ts->length=(int32_t)uprv_strlen(s); - } -} - -U_CDECL_END /* UText implementation wrapper for Replaceable (read/write) ---------------- */ @@ -1283,7 +1217,6 @@ unistrTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErro static int32_t U_CALLCONV unistrTextGetProperties(UText * /*t*/) { return - I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_INEXPENSIVE)| I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)| I32_FLAG(UTEXT_PROVIDER_WRITABLE); } @@ -1299,13 +1232,13 @@ unistrTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) { const UnicodeString *us = (const UnicodeString *)ut->context; int32_t length = us->length(); - if (chunk->limit != length) { + if (chunk->nativeLimit != length) { // This chunk is not yet set up. Do it now. 
- chunk->contents=us->getBuffer(); - chunk->length=length; - chunk->start=0; - chunk->limit=length; - chunk->nonUTF16Indexes=FALSE; + chunk->contents = us->getBuffer(); + chunk->length = length; + chunk->nativeStart = 0; + chunk->nativeLimit = length; + chunk->nonUTF16Indexes = FALSE; } // pin the requested index to the bounds of the string, @@ -1423,7 +1356,6 @@ utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) { ut = utext_setup(ut, 0, status); if (U_SUCCESS(*status)) { ut->clone = unistrTextClone; - ut->properties = unistrTextGetProperties; ut->length = unistrTextLength; ut->access = unistrTextAccess; ut->extract = unistrTextExtract; @@ -1431,6 +1363,205 @@ utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) { ut->copy = unistrTextCopy; ut->context = s; + ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS)| + I32_FLAG(UTEXT_PROVIDER_WRITABLE); + } + return ut; +} + + +//------------------------------------------------------------------------------ +// +// UText implementation for const UChar * strings +// +// Use of UText data members: +// context pointer to UnicodeString +// a length. -1 if not yet known. +// +//------------------------------------------------------------------------------ + +U_CDECL_BEGIN + + +static UText * U_CALLCONV +ucstrTextClone(UText * /* dest */, const UText * /*src*/, UBool /*deep*/, UErrorCode * /*status*/) { +// TODO: fix this. + return NULL; +} + + +static int32_t U_CALLCONV +ucstrTextLength(UText *ut) { + if (ut->a < 0) { + // null terminated, we don't yet know the length. Scan for it. + // Access is not convenient for doing this + // because the current interation postion can't be changed. 
+ const UChar *str = (const UChar *)ut->context; + for (;;) { + if (str[ut->chunk.nativeLimit] == 0) { + break; + } + ut->chunk.nativeLimit++; + } + ut->a = ut->chunk.nativeLimit; + ut->chunk.length = ut->chunk.nativeLimit; + ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); + } + return ut->a; +} + + +static UBool U_CALLCONV +ucstrTextAccess(UText *ut, int32_t index, UBool forward, UTextChunk *chunk) { + const UChar *str = (const UChar *)ut->context; + + // pin the requested index to the bounds of the string, + // and set current iteration position. + if (index<0) { + index = 0; + } else if (index < ut->chunk.nativeLimit) { + // The request data is within the chunk as it is known so far. + // There is nothing more that needs to be done within this access function. + } else if (ut->a >= 0) { + // We know the length of this string, and the user is requesting something + // at or beyond the length. Trim the requested index to the length. + index = ut->a; + } else { + // Null terminated string, length not yet known. + // Scan down another 32 UChars or to the requested index, whichever is further + int scanLimit = ut->chunk.nativeLimit + 32; + if (scanLimit <= index) { + scanLimit = index+1; // TODO: beware int overflow + } + for (; ut->chunk.nativeLimitchunk.nativeLimit++) { + if (str[ut->chunk.nativeLimit] == 0) { + // We found the end of the string. Remember it, trim the index to it, + // and bail out of here. + ut->a = ut->chunk.nativeLimit; + ut->chunk.length = ut->chunk.nativeLimit; + if (index > ut->chunk.nativeLimit) { + index = ut->chunk.nativeLimit; + } + ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); + goto breakout; + } + } + // We scanned through the next batch of UChars without finding the end. + // The endpoint of a chunk must not be left in the middle of a surrogate pair. + // If the current end is on a lead surrogate, back the end up by one. 
+ // It doesn't matter if the end char happens to be an unpaired surrogate, + // and it's simpler not to worry about it. + if (U16_IS_LEAD(str[ut->chunk.nativeLimit-1])) { + --ut->chunk.nativeLimit; + } + } +breakout: + chunk->offset = index; + + // Check whether request is at the start or end + UBool retVal = (forward && indexchunk.nativeLimit) || (!forward && index>0); + return retVal; +} + + + +static int32_t U_CALLCONV +ucstrTextExtract(UText *ut, + int32_t start, int32_t limit, + UChar *dest, int32_t destCapacity, + UErrorCode *pErrorCode) { + + + if(U_FAILURE(*pErrorCode)) { + return 0; + } + if(destCapacity<0 || (dest==NULL && destCapacity>0)) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return 0; + } + + const UChar *s=(const UChar *)ut->context; + int32_t strLength=ut->a; + int32_t si, di; + + // If text is null terminated and we haven't yet scanned down as far as the starting + // position of the extract, do it now. + if (strLength<0 && limit>=ut->chunk.nativeLimit) { + ucstrTextAccess(ut, start, TRUE, &ut->chunk); + } + + // Raise an error if starting position is outside of the string. + if(start<0 || start>limit) { + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + + if (strLength >= 0 && limit > strLength) { + // String length is known. Trim requested limit to be no more than the length + limit = strLength; + } + + di = 0; + for (si=start; sia = si; // set string length for this UText + ut->chunk.nativeLimit = si; + ut->chunk.length = si; + // + break; + } + if (di=0) { + // We have filled the destination buffer, and the string is known. + // Cut the loop short. There is no need to scan string termination. 
+ di = strLength; + break; + } + } + di++; + } + + u_terminateUChars(dest, destCapacity, di, pErrorCode); + return di; + } + + + +U_CDECL_END + + +U_DRAFT UText * U_EXPORT2 +utext_openUChars(UText *ut, const UChar *s, int32_t length, UErrorCode *status) { + if (U_FAILURE(*status)) { + return NULL; + } + if (length < -1) { + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + ut = utext_setup(ut, 0, status); + if (U_SUCCESS(*status)) { + ut->clone = noopTextClone; + ut->length = ucstrTextLength; + ut->access = ucstrTextAccess; + ut->extract = ucstrTextExtract; + ut->replace = NULL; + ut->copy = NULL; + + ut->context = s; + ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); + if (length==-1) { + ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); + } + ut->a = length; + ut->chunk.contents = s; + ut->chunk.nativeStart = 0; + ut->chunk.nativeLimit = length>=0? length : 0; + ut->chunk.nonUTF16Indexes = FALSE; } return ut; } diff --git a/icu4c/source/test/cintltst/utexttst.c b/icu4c/source/test/cintltst/utexttst.c index 5cb5c8301d6..3413a972822 100644 --- a/icu4c/source/test/cintltst/utexttst.c +++ b/icu4c/source/test/cintltst/utexttst.c @@ -15,8 +15,10 @@ #include "unicode/utypes.h" #include "unicode/utext.h" +#include "unicode/ustring.h" #include "cintltst.h" - +#include "memory.h" +#include "string.h" static void TestAPI(void); @@ -45,7 +47,7 @@ addUTextTest(TestNode** root) /* * TestAPI verify that the UText API is accessible from C programs. * This is not intended to be a complete test of the API functionality. That is - * in the C++ intltest program. + * in the C++ intltest program. * This test is intended to check that everything can be accessed and built in * a pure C enviornment. 
*/ @@ -55,19 +57,193 @@ static void TestAPI(void) { UErrorCode status = U_ZERO_ERROR; UBool gFailed = FALSE; - UText utLoc = UTEXT_INITIALIZER; - const char * cString = "Hello, World"; - UChar uString[] = {0x41, 0x42, 0x43, 0}; - uint8_t *utf8String; - UText *uta; - UText *utb; + // Open + { + UText utLoc = UTEXT_INITIALIZER; + const char * cString = "Hello, World"; + UChar uString[] = {0x41, 0x42, 0x43, 0}; + uint8_t *utf8String; + UText *uta; + UText *utb; + UChar c; - utf8String = (uint8_t *)cString; - uta = utext_openUTF8(&utLoc, utf8String, -1, &status); - TEST_SUCCESS(status); - TEST_ASSERT(uta == &utLoc); + status = U_ZERO_ERROR; + uta = utext_openUChars(NULL, uString, -1, &status); + TEST_SUCCESS(status); + c = utext_next32(uta); + TEST_ASSERT(c == 0x41); + utb = utext_close(uta); + TEST_ASSERT(utb == NULL); - uta = utext_close(&utLoc); - TEST_ASSERT(uta == &utLoc); + utf8String = (uint8_t *)cString; + uta = utext_openUTF8(&utLoc, utf8String, -1, &status); + TEST_SUCCESS(status); + TEST_ASSERT(uta == &utLoc); + + uta = utext_close(&utLoc); + TEST_ASSERT(uta == &utLoc); + } + + // utext_clone() + { + UChar uString[] = {0x41, 0x42, 0x43, 0}; + int len; + UText *uta; + UText *utb; + + status = U_ZERO_ERROR; + uta = utext_openUChars(NULL, uString, -1, &status); + TEST_SUCCESS(status); + utb = utext_clone(NULL, uta, FALSE, &status); + TEST_SUCCESS(status); + TEST_ASSERT(utb != NULL); + TEST_ASSERT(utb != uta); + len = utext_length(uta); + TEST_ASSERT(len == u_strlen(uString)); + utext_close(uta); + utext_close(utb); + } + + // basic access functions + { + UChar uString[] = {0x41, 0x42, 0x43, 0}; + UText *uta; + UChar32 c; + int32_t len; + UBool b; + int32_t i; + + status = U_ZERO_ERROR; + uta = utext_openUChars(NULL, uString, -1, &status); + TEST_ASSERT(uta!=NULL); + TEST_SUCCESS(status); + b = utext_isLengthExpensive(uta); + TEST_ASSERT(b==TRUE); + len = utext_length(uta); + TEST_ASSERT(len == u_strlen(uString)); + b = utext_isLengthExpensive(uta); + 
TEST_ASSERT(b==FALSE); + + c = utext_char32At(uta, 0); + TEST_ASSERT(c==uString[0]); + + c = utext_current(uta); + TEST_ASSERT(c==uString[0]); + + c = utext_next32(uta); + TEST_ASSERT(c==uString[0]); + c = utext_current(uta); + TEST_ASSERT(c==uString[1]); + + c = utext_previous32(uta); + TEST_ASSERT(c==uString[0]); + c = utext_current(uta); + TEST_ASSERT(c==uString[0]); + + c = utext_next32From(uta, 1); + TEST_ASSERT(c==uString[1]); + c = utext_next32From(uta, u_strlen(uString)); + TEST_ASSERT(c==U_SENTINEL); + + c = utext_previous32From(uta, 2); + TEST_ASSERT(c==uString[1]); + i = utext_getIndex(uta); + TEST_ASSERT(i == 1); + + utext_setIndex(uta, 0); + b = utext_moveIndex(uta, 1); + TEST_ASSERT(b==TRUE); + i = utext_getIndex(uta); + TEST_ASSERT(i==1); + + b = utext_moveIndex(uta, u_strlen(uString)-1); + TEST_ASSERT(b==TRUE); + i = utext_getIndex(uta); + TEST_ASSERT(i==u_strlen(uString)); + + b = utext_moveIndex(uta, 1); + TEST_ASSERT(b==FALSE); + i = utext_getIndex(uta); + TEST_ASSERT(i==u_strlen(uString)); + + utext_setIndex(uta, 0); + c = UTEXT_NEXT32(uta); + TEST_ASSERT(c==uString[0]); + c = utext_current(uta); + TEST_ASSERT(c==uString[1]); + + c = UTEXT_PREVIOUS32(uta); + TEST_ASSERT(c==uString[0]); + c = UTEXT_PREVIOUS32(uta); + TEST_ASSERT(c==U_SENTINEL); + + + utext_close(uta); + } + + { + // + // extract + // + UText *uta; + UChar uString[] = {0x41, 0x42, 0x43, 0}; + UChar buf[100]; + int32_t i; + + status = U_ZERO_ERROR; + uta = utext_openUChars(NULL, uString, -1, &status); + TEST_SUCCESS(status); + + status = U_ZERO_ERROR; + i = utext_extract(uta, 0, 100, NULL, 0, &status); + TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR); + TEST_ASSERT(i == u_strlen(uString)); + + status = U_ZERO_ERROR; + memset(buf, 0, sizeof(buf)); + i = utext_extract(uta, 0, 100, buf, 100, &status); + TEST_SUCCESS(status); + TEST_ASSERT(i == u_strlen(uString)); + i = u_strcmp(uString, buf); + TEST_ASSERT(i == 0); + utext_close(uta); + } + + { + // + // Copy, Replace, isWritable + // 
Can't create an editable UText from plain C, so all we + // can easily do is check that errors returned. + UText *uta; + UChar uString[] = {0x41, 0x42, 0x43, 0}; + UBool b; + + status = U_ZERO_ERROR; + uta = utext_openUChars(NULL, uString, -1, &status); + TEST_SUCCESS(status); + + b = utext_isWriteble(uta); + TEST_ASSERT(b == FALSE); + + b = utext_hasMetaData(uta); + TEST_ASSERT(b == FALSE); + + utext_replace(uta, + 0, 1, /* start, limit */ + uString, -1, /* replacement, replacement length */ + &status); + TEST_ASSERT(status == U_NO_WRITE_PERMISSION); + + + utext_copy(uta, + 0, 1, /* start, limit */ + 2, /* destination index */ + FALSE, /* move flag */ + &status); + TEST_ASSERT(status == U_NO_WRITE_PERMISSION); + + } + + +} -} \ No newline at end of file diff --git a/icu4c/source/test/intltest/utxttest.cpp b/icu4c/source/test/intltest/utxttest.cpp index 5a7c88db613..72bb7308cc1 100644 --- a/icu4c/source/test/intltest/utxttest.cpp +++ b/icu4c/source/test/intltest/utxttest.cpp @@ -285,31 +285,6 @@ void UTextTest::TestAccess(const UnicodeString &us, UText *ut, int cpCount, m *c } } - cpIndex = 0; - for (i=0; i