diff --git a/icu4c/source/common/caniter.cpp b/icu4c/source/common/caniter.cpp index fdfa0a53e92..83148e2cc04 100644 --- a/icu4c/source/common/caniter.cpp +++ b/icu4c/source/common/caniter.cpp @@ -582,7 +582,7 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_ int32_t inputLen = 0; UChar decomp[decompSize]; - UTF_APPEND_CHAR(temp, inputLen, bufSize, comp); + U16_APPEND_UNSAFE(temp, inputLen, comp); int32_t decompLen = unorm_getDecomposition(comp, FALSE, decomp, decompSize); if(decompLen < 0) { decompLen = -decompLen; @@ -597,7 +597,9 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_ UChar32 decompCp; UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp); - int32_t i = 0; + int32_t i; + UBool overflow = FALSE; + i = segmentPos; while(i < segLen) { UTF_NEXT_CHAR(segment, i, segLen, cp); @@ -620,7 +622,19 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_ // brute force approach - UTF_APPEND_CHAR(buff, bufLen, bufSize, cp); + U16_APPEND(buff, bufLen, bufSize, cp, overflow); + + if(overflow) { + /* + * ### TODO handle buffer overflow + * The buffer is large, but an overflow may still happen with + * unusual input (many combining marks?). + * Reallocate buffer and continue. 
+ * markus 20020929 + */ + + overflow = FALSE; + } /* TODO: optimize // since we know that the classes are monotonically increasing, after zero diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp index 548c4470506..9785298ad1a 100644 --- a/icu4c/source/common/common.dsp +++ b/icu4c/source/common/common.dsp @@ -3100,6 +3100,10 @@ InputPath=.\unicode\utf8.h # End Source File # Begin Source File +SOURCE=.\unicode\utf_old.h +# End Source File +# Begin Source File + SOURCE=.\util.h # End Source File # Begin Source File diff --git a/icu4c/source/common/ucnv_cnv.c b/icu4c/source/common/ucnv_cnv.c index 3180ca99cc0..34b8ca95482 100644 --- a/icu4c/source/common/ucnv_cnv.c +++ b/icu4c/source/common/ucnv_cnv.c @@ -141,7 +141,7 @@ ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length /* get the first code point in the buffer */ i=0; - UTF_NEXT_CHAR_SAFE(buffer, i, length, c, FALSE); + UTF_NEXT_CHAR(buffer, i, length, c); if(iUCharErrorBuffer; diff --git a/icu4c/source/common/unicode/ustring.h b/icu4c/source/common/unicode/ustring.h index 8b25a9a2005..32e95cd212f 100644 --- a/icu4c/source/common/unicode/ustring.h +++ b/icu4c/source/common/unicode/ustring.h @@ -156,7 +156,7 @@ u_strstr(const UChar *s, const UChar *substring); * but u_strchr32() will find neither because they * combine to the code point U+10000. * Either function will find U+d800 in "a\ud800b". - * This behavior ensures that UTF_GET_CHAR(u_strchr32(c))==c. + * This behavior ensures that U16_GET(u_strchr32(c))==c. * * @param s The string to search. * @param c The code point (0..0x10ffff) to find. @@ -628,7 +628,7 @@ u_memchr(const UChar *src, UChar ch, int32_t count); * but u_memchr32() will find neither because they * combine to the code point U+10000. * Either function will find U+d800 in "a\ud800b". - * This behavior ensures that UTF_GET_CHAR(u_memchr32(c))==c. + * This behavior ensures that U16_GET(u_memchr32(c))==c. 
* * @param src string to search in * @param ch character to find diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h index fd352add9e7..bdad0aeefae 100644 --- a/icu4c/source/common/unicode/utf.h +++ b/icu4c/source/common/unicode/utf.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2001, International Business Machines +* Copyright (C) 1999-2002, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -15,77 +15,89 @@ */ /** -* \file -* \brief C API: UChar and UChar32 data types and UTF macros for C Unicode string handling -* -*

This file defines the UChar and UChar32 data types for Unicode code units -* and code points, as well as macros for efficiently getting code points -* in and out of a string.

-* -*

utf.h is included by utypes.h and itself includes the utfXX.h after some -* common definitions. Those files define the macros for each UTF-size.

-* -*

The original concept for these files was for ICU to allow -* in principle to set which UTF (UTF-8/16/32) is used internally -* by defining UTF_SIZE to either 8, 16, or 32. utf.h would then define the UChar type -* accordingly. UTF-16 was the default.

-* -*

This concept has been abandoned. -* A lot of the ICU source code — especially low-level code like -* conversion, normalization, and collation — assumes UTF-16, -* utf.h enforces the default of UTF-16. -* The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.

-* -*

Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then -* UChar is defined to be exactly wchar_t, otherwise uint16_t.

-* -*

UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit -* Unicode code point (Unicode scalar value, 0..0x10ffff). -* Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as -* the definition of UChar. For details see the documentation for UChar32 itself.

-* -*

utf.h also defines a number of C macros for handling single Unicode code points and -* for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual -* implementations of those macros and then aliases one set of them (for UTF-16) for general use. -* The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while -* the general alias macros always begin with UTF_...

-* -*

Many string operations can be done with or without error checking. -* Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe" -* ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause -* program failures if the strings are not well-formed. The safe macros have an additional, boolean -* parameter "strict". If strict is FALSE, then only illegal sequences are detected. -* Otherwise, irregular sequences and non-characters are detected as well (like single surrogates). -* Safe macros return special error code points for illegal/irregular sequences: -* Typically, U+ffff, or values that would result in a code unit sequence of the same length -* as the erroneous input sequence.
-* Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and -* they do not have start/length parameters for boundary checking.

-* -*

Here, the macros are aliased in two steps: -* In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are -* aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures. -* Then, in a second step, the default, general alias macros are set to use either the unsafe or -* the safe/not strict (default) or the safe/strict macro; -* these general macros do not have a strictness parameter.

-* -*

It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict. -* The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for -* Unicode string handling in ICU! To select this, define UTF_SAFE, UTF_STRICT, or UTF_UNSAFE.

-* -*

For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix. -* Only in some cases it may be necessary to control the choice of macro directly and use a less generic alias. -* For example, if it can be assumed that a string is well-formed and the index will stay within the bounds, -* then the _UNSAFE version may be used. -* If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.

-*

Usage: ICU coding guidelines for if() statements should be followed when using these macros. -* Compound statements (curly braces {}) must be used for if-else-while... -* bodies and all macro statements should be terminated with semicolon.

-*/ + * \file + * \brief C API: UChar and UChar32 data types and code point macros + * + * This file defines the UChar and UChar32 data types for Unicode code units + * and code points, as well as macros for checking whether a code point is + * a surrogate or a non-character. + * + * utf.h is included by utypes.h and itself includes utf8.h and utf16.h after some + * common definitions. Those files define macros for efficiently getting code points + * in and out of UTF-8/16 strings. + * utf16.h macros have "U16_" prefixes. + * utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling. + * + * ICU processes 16-bit Unicode strings. + * Most of the time, such strings are well-formed UTF-16. + * Single, unpaired surrogates must be handled as well, and are treated in ICU + * like regular code points where possible. + * (Pairs of surrogate code points are indistinguishable from supplementary + * code points encoded as pairs of supplementary code units.) + * + * In fact, almost all Unicode code points in normal text (>99%) + * are on the BMP (<=U+ffff) and even <=U+d7ff. + * ICU functions handle supplementary code points (U+10000..U+10ffff) + * but are optimized for the much more frequently occurring BMP code points. + * + * utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then + * UChar is defined to be exactly wchar_t, otherwise uint16_t. + * + * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit + * Unicode code point (Unicode scalar value, 0..0x10ffff). + * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as + * the definition of UChar. For details see the documentation for UChar32 itself. + * + * utf.h also defines a small number of C macros for single Unicode code points. + * These are simple checks for surrogates and non-characters. + * For actual Unicode character properties see uchar.h. 
+ * + * By default, string operations must be done with error checking in case + * a string is not well-formed UTF-16. + * The macros will detect if a surrogate code unit is unpaired + * (lead unit without trail unit or vice versa) and just return the unit itself + * as the code point. + * (It is an accidental property of Unicode and UTF-16 that all + * malformed sequences can be expressed unambiguously with a distinct subrange + * of Unicode code points.) + * + * When it is safe to assume that text is well-formed UTF-16 + * (does not contain single, unpaired surrogates), then one can use + * U16_..._UNSAFE macros. + * These do not check for proper code unit sequences or truncated text and may + * yield wrong results or even cause a crash if they are used with "malformed" + * text. + * In practice, U16_..._UNSAFE macros will produce slightly less code but + * should not be faster because the processing is only different when a + * surrogate code unit is detected, which will be rare. + * + * Similarly for UTF-8, there are "safe" macros without a suffix, + * and U8_..._UNSAFE versions. + * The performance differences are much larger here because UTF-8 provides so + * many opportunities for malformed sequences. + * The unsafe UTF-8 macros are entirely implemented inside the macro definitions + * and are fast, while the safe UTF-8 macros call functions for all but the + * trivial (ASCII) cases. + * + * Unlike with UTF-16, malformed sequences cannot be expressed with distinct + * code point values (0..U+10ffff). They are indicated with negative values instead. + * + * For more information see the ICU User Guide Strings chapter + * (http://oss.software.ibm.com/icu/userguide/). + * + * Usage: + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. 
+ * + * @draft ICU 2.4 + */ #ifndef __UTF_H__ #define __UTF_H__ +/* wchar_t-related definitions ---------------------------------------------- */ + /* * ANSI C headers: * stddef.h defines wchar_t @@ -94,18 +106,11 @@ #include /* include the utfXX.h after the following definitions */ -/* If there is no compiler option for the preferred UTF size, then default to UTF-16. */ -#ifndef UTF_SIZE - /** Number of bits in a Unicode string code unit, same as x in UTF-x (8, 16, or 32). */ -# define UTF_SIZE 16 -#endif - -/** Number of bytes in a UChar (sizeof(UChar)). */ -#define U_SIZEOF_UCHAR (UTF_SIZE>>3) - /*! * \def U_SIZEOF_WCHAR_T * U_SIZEOF_WCHAR_T==sizeof(wchar_t). + * + * @stable */ #ifndef U_HAVE_WCHAR_H # define U_HAVE_WCHAR_H 1 @@ -120,10 +125,14 @@ /*! * \def U_WCHAR_IS_UTF16 * Defined if wchar_t uses UTF-16. + * + * @stable */ /*! * \def U_WCHAR_IS_UTF32 * Defined if wchar_t uses UTF-32. + * + * @stable */ #if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32) # ifdef __STDC_ISO_10646__ @@ -145,139 +154,10 @@ # endif #endif -/** - * Define UChar32 as a type for single Unicode code points. - * UChar32 is a signed 32-bit integer. - * - * The Unicode code point range is 0..0x10ffff. - * All other values (negative or >=0x110000) are illegal as Unicode code points. - * They may be used as sentinel values to indicate "done", "error" - * or similar non-code point conditions. - * - * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined - * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) - * or else to be uint32_t. - * That is, the definition of UChar32 was platform-dependent. - * - * @see UTF_SENTINEL - * @draft ICU 2.4 - */ -typedef int32_t UChar32; +/* UChar and UChar32 definitions -------------------------------------------- */ -/** - * Unicode string and array offset and index type. - * ICU always counts Unicode code units (UChars) for - * string offsets, indexes, and lengths, not Unicode code points. 
- * - * @deprecated Use int32_t directly. UTextOffset to be removed after 2003-mar. - */ -typedef int32_t UTextOffset; - -/* Specify which macro versions are the default ones - safe or fast. */ -#if !defined(UTF_SAFE) && !defined(UTF_STRICT) && !defined(UTF_UNSAFE) - /** - * The default choice for general Unicode string macros is to use the ..._SAFE macro implementations - * with strict=FALSE. See the utf.h file description. - */ -# define UTF_SAFE -#endif - -/* internal definitions ----------------------------------------------------- */ - -/** - *

UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8, - * which need 1 or 2 bytes in UTF-8:
- * U+0015 = NAK = Negative Acknowledge, C0 control character
- * U+009f = highest C1 control character

- * - *

These are used by ("safe") UTF-8 macros so that they can return an error value - * that needs the same number of code units (bytes) as were seen by - * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().

- * - * @internal - */ -#define UTF8_ERROR_VALUE_1 0x15 -/** - * See documentation on UTF8_ERROR_VALUE_1 for details. - */ -#define UTF8_ERROR_VALUE_2 0x9f - -/** - * Error value for all UTFs. This code point value will be set by macros with error - * checking if an error is detected. - */ -#define UTF_ERROR_VALUE 0xffff - -/* single-code point definitions -------------------------------------------- */ - -/** - * This value is intended for sentinel values for APIs that - * (take or) return single code points (UChar32). - * It is outside of the Unicode code point range 0..0x10ffff. - * - * For example, a "done" or "error" value in a new API - * could be indicated with UTF_SENTINEL. - * - * ICU APIs designed before ICU 2.4 usually define service-specific "done" - * values, mostly 0xffff. - * Those may need to be distinguished from - * actual U+ffff text contents by calling functions like - * CharacterIterator::hasNext() or UnicodeString::length(). - * - * @see UChar32 - * @draft ICU 2.4 - */ -#define UTF_SENTINEL (-1) - -/** Is this code unit or code point a surrogate (U+d800..U+dfff)? */ -#define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) - -/** - * Is a given 32-bit code point a Unicode noncharacter? - */ -#define UTF_IS_UNICODE_NONCHAR(c) \ - ((c)>=0xfdd0 && \ - ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ - (uint32_t)(c)<=0x10ffff) - -/** - * Is a given 32-bit code point/Unicode scalar value - * actually a valid Unicode (abstract) character? - * - * Code points that are not characters include: - * - single surrogate code points (U+d800..U+dfff, 2048 code points) - * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) - * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) - * - the highest Unicode code point value is U+10ffff - * - * This means that all code points below U+d800 are character code points, - * and that boundary is tested first for performance. 
- */ -#define UTF_IS_UNICODE_CHAR(c) \ - ((uint32_t)(c)<0xd800 || \ - ((uint32_t)(c)>0xdfff && \ - (uint32_t)(c)<=0x10ffff && \ - !UTF_IS_UNICODE_NONCHAR(c))) - -/** - * Is a given 32-bit code an error value - * as returned by one of the macros for any UTF? - */ -#define UTF_IS_ERROR(c) \ - (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) - -/** This is a combined macro: Is c a valid Unicode value _and_ not an error code? */ -#define UTF_IS_VALID(c) \ - (UTF_IS_UNICODE_CHAR(c) && \ - (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2) - -/* include the utfXX.h ------------------------------------------------------ */ - -#include "unicode/utf8.h" -#include "unicode/utf16.h" -#include "unicode/utf32.h" - -/* Define types and macros according to the selected UTF size. -------------- */ +/** Number of bytes in a UChar. @stable */ +#define U_SIZEOF_UCHAR 2 /*! * \var UChar @@ -290,262 +170,127 @@ typedef int32_t UTextOffset; * @stable */ -#if UTF_SIZE==8 - -# error UTF-8 is not implemented, undefine UTF_SIZE or define it to 16 - -/* - * ANSI C header: - * limits.h defines CHAR_MAX - */ -# include - - /* Define UChar to be compatible with char if possible. */ -# if CHAR_MAX>=255 - typedef char UChar; -# else - typedef uint8_t UChar; -# endif - -#elif UTF_SIZE==16 - - /* Define UChar to be compatible with wchar_t if possible. */ -# if U_SIZEOF_WCHAR_T==2 - typedef wchar_t UChar; -# else - typedef uint16_t UChar; -# endif - - /** Does this code unit alone encode a code point? */ -# define UTF_IS_SINGLE(uchar) UTF16_IS_SINGLE(uchar) - /** Is this code unit the first one of several? */ -# define UTF_IS_LEAD(uchar) UTF16_IS_LEAD(uchar) - /** Is this code unit one of several but not the first one? */ -# define UTF_IS_TRAIL(uchar) UTF16_IS_TRAIL(uchar) - - /** Does this code point require multiple code units? */ -# define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c) - /** How many code units are used to encode this code point? 
*/ -# define UTF_CHAR_LENGTH(c) UTF16_CHAR_LENGTH(c) - /** How many code units are used at most for any Unicode code point? */ -# define UTF_MAX_CHAR_LENGTH UTF16_MAX_CHAR_LENGTH - /** Estimate the number of code units for a string based on the number of UTF-16 code units. */ -# define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) - - /** See file documentation and UTF_GET_CHAR. */ -# define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) - /** See file documentation and UTF_GET_CHAR. */ -# define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) - - /** See file documentation and UTF_NEXT_CHAR. */ -# define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) - /** See file documentation and UTF_NEXT_CHAR. */ -# define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) - - /** See file documentation and UTF_APPEND_CHAR. */ -# define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c) - /** See file documentation and UTF_APPEND_CHAR. */ -# define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) - - /** See file documentation and UTF_FWD_1. */ -# define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i) - /** See file documentation and UTF_FWD_1. */ -# define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length) - - /** See file documentation and UTF_FWD_N. */ -# define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n) - /** See file documentation and UTF_FWD_N. */ -# define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) - - /** See file documentation and UTF_SET_CHAR_START. */ -# define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) - /** See file documentation and UTF_SET_CHAR_START. */ -# define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) - - /** See file documentation and UTF_PREV_CHAR. 
*/ -# define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) - /** See file documentation and UTF_PREV_CHAR. */ -# define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) - - /** See file documentation and UTF_BACK_1. */ -# define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) - /** See file documentation and UTF_BACK_1. */ -# define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) - - /** See file documentation and UTF_BACK_N. */ -# define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) - /** See file documentation and UTF_BACK_N. */ -# define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) - - /** See file documentation and UTF_SET_CHAR_LIMIT. */ -# define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) - /** See file documentation and UTF_SET_CHAR_LIMIT. */ -# define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) - -#elif UTF_SIZE==32 - -# error UTF-32 is not implemented, undefine UTF_SIZE or define it to 16 - - typedef UChar32 UChar; - +/* Define UChar to be compatible with wchar_t if possible. */ +#if U_SIZEOF_WCHAR_T==2 + typedef wchar_t UChar; #else -# error UTF_SIZE must be undefined or one of { 8, 16, 32 } - only 16 is implemented + typedef uint16_t UChar; #endif -/* Define the default macros for handling UTF characters. ------------------- */ +/** + * Define UChar32 as a type for single Unicode code points. + * UChar32 is a signed 32-bit integer (same as int32_t). + * + * The Unicode code point range is 0..0x10ffff. + * All other values (negative or >=0x110000) are illegal as Unicode code points. + * They may be used as sentinel values to indicate "done", "error" + * or similar non-code point conditions. + * + * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined + * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned) + * or else to be uint32_t. 
+ * That is, the definition of UChar32 was platform-dependent. + * + * @see U_SENTINEL + * @draft ICU 2.4 + */ +typedef int32_t UChar32; + +/* single-code point definitions -------------------------------------------- */ /** - * \def UTF_GET_CHAR(s, start, i, length, c) + * This value is intended for sentinel values for APIs that + * (take or) return single code points (UChar32). + * It is outside of the Unicode code point range 0..0x10ffff. + * + * For example, a "done" or "error" value in a new API + * could be indicated with U_SENTINEL. * - * Set c to the code point that contains the code unit i. - * i could point to the first, the last, or an intermediate code unit. - * i is not modified. - * \pre 0<=i=0xfdd0 && \ + ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ + (uint32_t)(c)<=0x10ffff) /** - * \def UTF_APPEND_CHAR(s, i, length, c) + * Is c a Unicode code point value (0..U+10ffff) + * that can be assigned a character? * - * Append the code units of code point c to the string at index i - * and advance i to beyond the new code units (post-increment). - * The code units beginning at index i will be overwritten. - * \pre 0<=c<=0x10ffff - * \pre 0<=i0xdfff && \ + (uint32_t)(c)<=0x10ffff && \ + !U_IS_UNICODE_NONCHAR(c))) /** - * \def UTF_FWD_1(s, i, length) - * - * Advance i to beyond the code units of the code point that begins at i. - * I.e., advance i by one code point. - * i must point to the first code unit of a code point. - * \pre 0<=i -*

Usage: ICU coding guidelines for if() statements should be followed when using these macros. -* Compound statements (curly braces {}) must be used for if-else-while... -* bodies and all macro statements should be terminated with semicolon.

-*/ + * \file + * \brief C API: 16-bit Unicode handling macros + * + * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings. + * utf16.h is included by utf.h after unicode/umachine.h + * and some common definitions. + * + * For more information see utf.h and the ICU User Guide Strings chapter + * (http://oss.software.ibm.com/icu/userguide/). + * + * Usage: + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. + */ + +/* utf.h must be included first. */ +#ifndef __UTF_H__ +# include "unicode/utf.h" +#endif #ifndef __UTF16_H__ #define __UTF16_H__ /* single-code point definitions -------------------------------------------- */ -/* handle surrogate pairs */ -#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800) -#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00) - -#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0) - -/** Get the UTF-32 value directly from the surrogate pseudo-characters */ -#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) - -#define UTF16_GET_PAIR_VALUE(first, second) \ - (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) - -/* get the first and second surrogates for a supplementary code point */ /** - * Takes a supplementary code point (0x10000..0x10ffff) - * and computes the first surrogate (0xd800..0xdbff) - * for UTF-16 encoding. + * Does this code unit alone encode a code point (BMP, not a surrogate)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @draft ICU 2.4 */ -#define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) +#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c) /** - * Takes a supplementary code point (0x10000..0x10ffff) - * and computes the second surrogate (0xdc00..0xdfff) - * for UTF-16 encoding. 
+ * Is this code unit a lead surrogate (U+d800..U+dbff)? + * @param c 16-bit code unit + * @return TRUE or FALSE + * @draft ICU 2.4 */ -#define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) - -/** alias for UTF_FIRST_SURROGATE */ -#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary) - -/** alias for UTF_SECOND_SURROGATE */ -#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary) - -/* classes of code unit values */ -#define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar) -#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar) -#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar) - -/* number of code units per code point */ -#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff) -#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) -#define UTF16_MAX_CHAR_LENGTH 2 - -/* average number of code units compared to UTF-16 */ -#define UTF16_ARRAY_SIZE(size) (size) +#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800) /** - * Get a single code point from an offset that points to any - * of the code units that belong to that code point. - * Assume 0<=i>10)+0xd7c0) + +/** + * Get the trail surrogate (0xdc00..0xdfff) for a + * supplementary code point (0x10000..0x10ffff). + * @param c 32-bit code point (U+10000..U+10ffff) + * @return trail surrogate (U+dc00..U+dfff) for c + * @draft ICU 2.4 + */ +#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) + +/** + * How many 16-bit code units are used to encode this Unicode code point? (1 or 2) + * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff). + * @param c 32-bit code point + * @return 1 or 2 + * @draft ICU 2.4 + */ +#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) + +/** + * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff). 
+ * @return 2 + * @draft ICU 2.4 + */ +#define U16_MAX_LENGTH 2 + +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * The result is undefined if the offset points to a single, unpaired surrogate. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U16_GET + * @draft ICU 2.4 + */ +#define U16_GET_UNSAFE(s, i, c) { \ (c)=(s)[i]; \ - if(UTF_IS_SURROGATE(c)) { \ - if(UTF_IS_SURROGATE_FIRST(c)) { \ - (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \ + if(U16_IS_SURROGATE(c)) { \ + if(U16_IS_SURROGATE_LEAD(c)) { \ + (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \ } else { \ - (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \ + (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \ } \ } \ } -#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The offset may point to either the lead or trail surrogate unit + * for a supplementary code point, in which case the macro will read + * the adjacent matching surrogate as well. + * If the offset points to a single, unpaired surrogate, then that itself + * will be returned as the code point. + * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT. 
+ * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ - (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ - /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ - } else if(strict) {\ - /* unmatched second surrogate */ \ - (c)=UTF_ERROR_VALUE; \ + if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ } \ } \ - } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ - (c)=UTF_ERROR_VALUE; \ } \ } /* definitions with forward iteration --------------------------------------- */ -/* - * all the macros that go forward assume that - * the initial offset is 0<=i>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* c>0x10ffff or not enough space */ { \ + (isError)=TRUE; \ + } \ +} + +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_FWD_1 + * @draft ICU 2.4 + */ +#define U16_FWD_1_UNSAFE(s, i) { \ + if(U16_IS_LEAD((s)[(i)++])) { \ ++(i); \ } \ } -#define UTF16_FWD_N_UNSAFE(s, i, n) { \ +/** + * Advance the string offset from one code point boundary to the next. + * (Post-incrementing iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param i string offset, i0) { \ - UTF16_FWD_1_UNSAFE(s, i); \ + U16_FWD_1_UNSAFE(s, i); \ --__N; \ } \ } /** - * Set a random-access offset and adjust it so that - * it points to the beginning of a Unicode character. - * The offset that is passed in points to - * any code unit of a code point - * and will point to the first code unit after - * the macro invocation. - * Never increments the offset. 
+ * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param i string offset, i>10)+0xd7c0); \ - (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ - } else /* not enough space */ { \ - (s)[(i)++]=UTF_ERROR_VALUE; \ - } \ - } else /* c>0x10ffff, write error value */ { \ - (s)[(i)++]=UTF_ERROR_VALUE; \ - } \ -} - -#define UTF16_FWD_1_SAFE(s, i, length) { \ - if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \ - ++(i); \ - } \ -} - -#define UTF16_FWD_N_SAFE(s, i, length, n) { \ +#define U16_FWD_N(s, i, length, n) { \ int32_t __N=(n); \ while(__N>0 && (i)<(length)) { \ - UTF16_FWD_1_SAFE(s, i, length); \ + U16_FWD_1(s, i, length); \ --__N; \ } \ } -#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \ - if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to the trail surrogate of a surrogate pair, + * then the offset is decremented. + * Otherwise, it is not modified. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_SET_CP_START + * @draft ICU 2.4 + */ +#define U16_SET_CP_START_UNSAFE(s, i) { \ + if(U16_IS_TRAIL((s)[i])) { \ + --(i); \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to the trail surrogate of a surrogate pair, + * then the offset is decremented. + * Otherwise, it is not modified. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. 
+ * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i + * @see U16_SET_CP_START_UNSAFE + * @draft ICU 2.4 + */ +#define U16_SET_CP_START(s, start, i) { \ + if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ --(i); \ } \ } /* definitions with backward iteration -------------------------------------- */ -/* - * all the macros that go backward assume that - * the valid buffer range starts at offset 0 - * and that the initial offset is 0(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \ + } \ + } \ +} + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. + * + * @param s const UChar * string + * @param i string offset + * @see U16_BACK_1 + * @draft ICU 2.4 + */ +#define U16_BACK_1_UNSAFE(s, i) { \ + if(U16_IS_TRAIL((s)[--(i)])) { \ --(i); \ } \ } -#define UTF16_BACK_N_UNSAFE(s, i, n) { \ +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i + * @see U16_BACK_1_UNSAFE + * @draft ICU 2.4 + */ +#define U16_BACK_1(s, start, i) { \ + if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \ + --(i); \ + } \ +} + +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. 
+ * + * @param s const UChar * string + * @param i string offset + * @param n number of code points to skip + * @see U16_BACK_N + * @draft ICU 2.4 + */ +#define U16_BACK_N_UNSAFE(s, i, n) { \ int32_t __N=(n); \ while(__N>0) { \ - UTF16_BACK_1_UNSAFE(s, i); \ + U16_BACK_1_UNSAFE(s, i); \ --__N; \ } \ } /** - * Set a random-access offset and adjust it so that - * it points after the end of a Unicode character. - * The offset that is passed in points behind - * any code unit of a code point - * and will point behind the last code unit after - * the macro invocation. - * Never decrements the offset. + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * @param s const UChar * string + * @param i string offset, i0 && (i)>(start)) { \ + U16_BACK_1(s, start, i); \ + --__N; \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind the lead surrogate of a surrogate pair, + * then the offset is incremented. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-16. 
+ * + * @param s const UChar * string + * @param i string offset + * @see U16_SET_CP_LIMIT + * @draft ICU 2.4 + */ +#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \ + if(U16_IS_LEAD((s)[(i)-1])) { \ ++(i); \ } \ } -/* safe versions with error-checking and optional regularity-checking */ - -#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \ - (c)=(s)[--(i)]; \ - if(UTF_IS_SECOND_SURROGATE(c)) { \ - uint16_t __c2; \ - if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ - --(i); \ - (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ - /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ - } else if(strict) {\ - /* unmatched second surrogate */ \ - (c)=UTF_ERROR_VALUE; \ - } \ - } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ - /* unmatched first surrogate or other non-character */ \ - (c)=UTF_ERROR_VALUE; \ - } \ -} - -#define UTF16_BACK_1_SAFE(s, start, i) { \ - if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ - --(i); \ - } \ -} - -#define UTF16_BACK_N_SAFE(s, start, i, n) { \ - int32_t __N=(n); \ - while(__N>0 && (i)>(start)) { \ - UTF16_BACK_1_SAFE(s, start, i); \ - --__N; \ - } \ -} - -#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \ - if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \ +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind the lead surrogate of a surrogate pair, + * then the offset is incremented. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. 
+ * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i<=length + * @param length string length + * @see U16_SET_CP_LIMIT_UNSAFE + * @draft ICU 2.4 + */ +#define U16_SET_CP_LIMIT(s, start, i, length) { \ + if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \ ++(i); \ } \ } diff --git a/icu4c/source/common/unicode/utf32.h b/icu4c/source/common/unicode/utf32.h index 7b3a1da2ce7..eaafb2b3d6e 100644 --- a/icu4c/source/common/unicode/utf32.h +++ b/icu4c/source/common/unicode/utf32.h @@ -14,146 +14,10 @@ * created by: Markus W. Scherer */ /** -* \file -* \brief C API: UTF-32 macros -* -* This file defines macros to deal with UTF-32 code units and code points. -* Signatures and semantics are the same as for the similarly named macros -* in utf16.h. -* utf32.h is included by utf.h after unicode/umachine.h

-* and some common definitions. -*

Usage: ICU coding guidelines for if() statements should be followed when using these macros. -* Compound statements (curly braces {}) must be used for if-else-while... -* bodies and all macro statements should be terminated with semicolon.

-*/ - -#ifndef __UTF32_H__ -#define __UTF32_H__ - -/* internal definitions ----------------------------------------------------- */ - -#define UTF32_IS_SAFE(c, strict) \ - (!(strict) ? \ - (uint32_t)(c)<=0x10ffff : \ - UTF_IS_UNICODE_CHAR(c)) - -/* - * For the semantics of all of these macros, see utf16.h. - * The UTF-32 versions are trivial because any code point is - * encoded using exactly one code unit. + * \file + * \brief C API: UTF-32 macros + * + * This file is deprecated and its contents moved to utf_old.h. + * See utf_old.h and Jitterbug 2150 and its discussion on the ICU mailing list + * in September 2002. */ - -/* single-code point definitions -------------------------------------------- */ - -/* classes of code unit values */ -#define UTF32_IS_SINGLE(uchar) 1 -#define UTF32_IS_LEAD(uchar) 0 -#define UTF32_IS_TRAIL(uchar) 0 - -/* number of code units per code point */ -#define UTF32_NEED_MULTIPLE_UCHAR(c) 0 -#define UTF32_CHAR_LENGTH(c) 1 -#define UTF32_MAX_CHAR_LENGTH 1 - -/* average number of code units compared to UTF-16 */ -#define UTF32_ARRAY_SIZE(size) (size) - -#define UTF32_GET_CHAR_UNSAFE(s, i, c) { \ - (c)=(s)[i]; \ -} - -#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ - (c)=(s)[i]; \ - if(!UTF32_IS_SAFE(c, strict)) { \ - (c)=UTF_ERROR_VALUE; \ - } \ -} - -/* definitions with forward iteration --------------------------------------- */ - -#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) { \ - (c)=(s)[(i)++]; \ -} - -#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) { \ - (s)[(i)++]=(c); \ -} - -#define UTF32_FWD_1_UNSAFE(s, i) { \ - ++(i); \ -} - -#define UTF32_FWD_N_UNSAFE(s, i, n) { \ - (i)+=(n); \ -} - -#define UTF32_SET_CHAR_START_UNSAFE(s, i) { \ -} - -#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ - (c)=(s)[(i)++]; \ - if(!UTF32_IS_SAFE(c, strict)) { \ - (c)=UTF_ERROR_VALUE; \ - } \ -} - -#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) { \ - if((uint32_t)(c)<=0x10ffff) { \ - (s)[(i)++]=(c); \ - } else /* c>0x10ffff, write 
0xfffd */ { \ - (s)[(i)++]=0xfffd; \ - } \ -} - -#define UTF32_FWD_1_SAFE(s, i, length) { \ - ++(i); \ -} - -#define UTF32_FWD_N_SAFE(s, i, length, n) { \ - if(((i)+=(n))>(length)) { \ - (i)=(length); \ - } \ -} - -#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \ -} - -/* definitions with backward iteration -------------------------------------- */ - -#define UTF32_PREV_CHAR_UNSAFE(s, i, c) { \ - (c)=(s)[--(i)]; \ -} - -#define UTF32_BACK_1_UNSAFE(s, i) { \ - --(i); \ -} - -#define UTF32_BACK_N_UNSAFE(s, i, n) { \ - (i)-=(n); \ -} - -#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \ -} - -#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \ - (c)=(s)[--(i)]; \ - if(!UTF32_IS_SAFE(c, strict)) { \ - (c)=UTF_ERROR_VALUE; \ - } \ -} - -#define UTF32_BACK_1_SAFE(s, start, i) { \ - --(i); \ -} - -#define UTF32_BACK_N_SAFE(s, start, i, n) { \ - (i)-=(n); \ - if((i)<(start)) { \ - (i)=(start); \ - } \ -} - -#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) { \ -} - -#endif diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h index be6644fa6d3..13f5d618ed1 100644 --- a/icu4c/source/common/unicode/utf8.h +++ b/icu4c/source/common/unicode/utf8.h @@ -15,23 +15,25 @@ */ /** -* \file -* \brief C API: UTF-8 macros -* -* This file defines macros to deal with UTF-8 code units and code points. -* Signatures and semantics are the same as for the similarly named macros -* in utf16.h. -* utf8.h is included by utf.h after unicode/umachine.h -* and some common definitions.

-*

Usage: ICU coding guidelines for if() statements should be followed when using these macros. -* Compound statements (curly braces {}) must be used for if-else-while... -* bodies and all macro statements should be terminated with semicolon.

-*/ - + * \file + * \brief C API: 8-bit Unicode handling macros + * + * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. + * utf8.h is included by utf.h after unicode/umachine.h + * and some common definitions. + * + * For more information see utf.h and the ICU User Guide Strings chapter + * (http://oss.software.ibm.com/icu/userguide/). + * + * Usage: + * ICU coding guidelines for if() statements should be followed when using these macros. + * Compound statements (curly braces {}) must be used for if-else-while... + * bodies and all macro statements should be terminated with semicolon. + */ /* utf.h must be included first. */ #ifndef __UTF_H__ -# include "unicode/utf.h" +# include "unicode/utf.h" #endif #ifndef __UTF8_H__ @@ -39,6 +41,12 @@ /* internal definitions ----------------------------------------------------- */ +/** + * \var utf8_countTrailBytes + * Internal array with numbers of trail bytes for any given byte used in + * lead byte position. + * @internal + */ #ifdef U_UTF8_IMPL U_CAPI const uint8_t utf8_countTrailBytes[256]; @@ -48,114 +56,166 @@ utf8_countTrailBytes[256]; #endif /** - * Count the trail bytes for a lead byte - - * this macro should be used so that the assembler code - * that is mentioned in utf_impl.c could be used here. + * Count the trail bytes for a UTF-8 lead byte. + * @internal */ -#define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) +#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte]) -/* use a macro here, too - there may be a simpler way with some machines */ -#define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) +/** + * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. 
+ * @internal + */ +#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) +/** + * Function for handling "next code point" with error-checking. + * @internal + */ U_CAPI UChar32 U_EXPORT2 -utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError); +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); +/** + * Function for handling "append code point" with error-checking. + * @internal + */ U_CAPI int32_t U_EXPORT2 -utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c); +utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); +/** + * Function for handling "previous code point" with error-checking. + * @internal + */ U_CAPI UChar32 U_EXPORT2 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); +/** + * Function for handling "skip backward one code point" with error-checking. + * @internal + */ U_CAPI int32_t U_EXPORT2 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); -/* - * For the semantics of all of these macros, see utf16.h. - * The UTF-8 macros favor sequences more the shorter they are. - * Sometimes, only the single-byte case is covered by a macro, - * while longer sequences are handled by a function call. - */ - /* single-code point definitions -------------------------------------------- */ -/** Is this this code point a single code unit (byte)? */ -#define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0) -/** Is this this code unit the lead code unit (byte) of a code point? */ -#define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e) -/** Is this this code unit a trailing code unit (byte) of a code point? */ -#define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80) - -/** Does this scalar Unicode value need multiple code units for storage? 
*/ -#define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f) +/** + * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @draft ICU 2.4 + */ +#define U8_IS_SINGLE(c) (((c)&0x80)==0) /** - * Given the lead character, how many bytes are taken by this code point. - * ICU does not deal with code points >0x10ffff - * unless necessary for advancing in the byte stream. - * - * These length macros take into account that for values >0x10ffff - * the "safe" append macros would write the error code point 0xffff - * with 3 bytes. - * Code point comparisons need to be in uint32_t because UChar32 - * may be a signed type, and negative values must be recognized. + * Is this code unit (byte) a UTF-8 lead byte? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @draft ICU 2.4 */ -#if 1 -# define UTF8_CHAR_LENGTH(c) \ - ((uint32_t)(c)<=0x7f ? 1 : \ - ((uint32_t)(c)<=0x7ff ? 2 : \ - ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \ - ) \ - ) -#else -# define UTF8_CHAR_LENGTH(c) \ - ((uint32_t)(c)<=0x7f ? 1 : \ - ((uint32_t)(c)<=0x7ff ? 2 : \ - ((uint32_t)(c)<=0xffff ? 3 : \ - ((uint32_t)(c)<=0x10ffff ? 4 : \ - ((uint32_t)(c)<=0x3ffffff ? 5 : \ - ((uint32_t)(c)<=0x7fffffff ? 6 : 3) \ - ) \ - ) \ +#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) + +/** + * Is this code unit (byte) a UTF-8 trail byte? + * @param c 8-bit code unit (byte) + * @return TRUE or FALSE + * @draft ICU 2.4 + */ +#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) + +/** + * How many code units (bytes) are used for the UTF-8 encoding + * of this Unicode code point? + * @param c 32-bit code point + * @return 1..4, or 0 if c is a surrogate or not a Unicode code point + * @draft ICU 2.4 + */ +#define U8_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)(c)<=0xd7ff ? 3 : \ + ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ + ((uint32_t)(c)<=0xffff ? 
3 : 4)\ ) \ ) \ - ) -#endif + ) \ + ) -/** The maximum number of bytes per code point */ -#define UTF8_MAX_CHAR_LENGTH 4 +/** + * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). + * @return 4 + * @draft ICU 2.4 + */ +#define U8_MAX_LENGTH 4 -/** Average number of code units compared to UTF-16 */ -#define UTF8_ARRAY_SIZE(size) ((5*(size))/2) - -#define UTF8_GET_CHAR_UNSAFE(s, i, c) { \ +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * The result is undefined if the offset points to an illegal UTF-8 + * byte sequence. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_GET + * @draft ICU 2.4 + */ +#define U8_GET_UNSAFE(s, i, c) { \ int32_t __I=(int32_t)(i); \ - UTF8_SET_CHAR_START_UNSAFE(s, __I); \ - UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \ + U8_SET_CP_START_UNSAFE(s, __I); \ + U8_NEXT_UNSAFE(s, __I, c); \ } -#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ +/** + * Get a code point from a string at a random-access offset, + * without changing the offset. + * The offset may point to either the lead byte or one of the trail bytes + * for a code point, in which case the macro will read all of the bytes + * for the code point. + * If the offset points to an illegal UTF-8 byte sequence, then + * c is set to a negative value. + * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. + * + * @param s const UChar * string + * @param start starting string offset + * @param i string offset, start<=i instead of <0>. - * The strict checks also check for non-characters. 
+ * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * The offset may point to the lead byte of a multi-byte sequence, + * in which case the macro will read the whole sequence. + * The result is undefined if the offset points to a trail byte + * or an illegal UTF-8 sequence. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_NEXT + * @draft ICU 2.4 */ -#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \ +#define U8_NEXT_UNSAFE(s, i, c) { \ (c)=(s)[(i)++]; \ if((uint8_t)((c)-0xc0)<0x35) { \ - uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \ - UTF8_MASK_LEAD_BYTE(c, __count); \ + uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \ + U8_MASK_LEAD_BYTE(c, __count); \ switch(__count) { \ /* each following branch falls through to the next one */ \ case 3: \ @@ -170,7 +230,49 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); } \ } -#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \ +/** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The offset may point to the lead byte of a multi-byte sequence, + * in which case the macro will read the whole sequence. + * If the offset points to a trail byte or an illegal UTF-8 sequence, then + * c is set to a negative value. + * + * @param s const UChar * string + * @param i string offset, i=0x80) { \ + if(U8_IS_LEAD(c)) { \ + (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, -1); \ + } else { \ + (c)=U_SENTINEL; \ + } \ + } \ +} + +/** + * Append a code point to a string, overwriting 1 to 4 bytes. + * The offset points to the current end of the string contents + * and is advanced (post-increment). 
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string. + * Otherwise, the result is undefined. + * + * @param s const UChar * string buffer + * @param i string offset + * @param c code point to append + * @see U8_APPEND + * @draft ICU 2.4 + */ +#define U8_APPEND_UNSAFE(s, i, c) { \ if((uint32_t)(c)<=0x7f) { \ (s)[(i)++]=(uint8_t)(c); \ } else { \ @@ -189,74 +291,172 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); } \ } -#define UTF8_FWD_1_UNSAFE(s, i) { \ - (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \ -} - -#define UTF8_FWD_N_UNSAFE(s, i, n) { \ - int32_t __N=(n); \ - while(__N>0) { \ - UTF8_FWD_1_UNSAFE(s, i); \ - --__N; \ - } \ -} - -#define UTF8_SET_CHAR_START_UNSAFE(s, i) { \ - while(UTF8_IS_TRAIL((s)[i])) { --(i); } \ -} - -#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ - (c)=(s)[(i)++]; \ - if((c)>=0x80) { \ - if(UTF8_IS_LEAD(c)) { \ - (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict, NULL); \ - } else { \ - (c)=UTF8_ERROR_VALUE_1; \ - } \ - } \ -} - -#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \ +/** + * Append a code point to a string, overwriting 1 or 2 code units. + * The offset points to the current end of the string contents + * and is advanced (post-increment). + * "Safe" macro, checks for a valid code point. + * If a non-ASCII code point is written, checks for sufficient space in the string. + * If the code point is not valid or trail bytes do not fit, + * then isError is set to TRUE. + * + * @param s const UChar * string buffer + * @param i string offset, i(length)) { \ __count=(uint8_t)((length)-(i)); \ } \ - while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \ + while(__count>0 && U8_IS_TRAIL((s)[i])) { \ ++(i); \ --__count; \ } \ } \ } -#define UTF8_FWD_N_SAFE(s, i, length, n) { \ +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) 
+ * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const UChar * string + * @param i string offset + * @param n number of code points to skip + * @see U8_FWD_N + * @draft ICU 2.4 + */ +#define U8_FWD_N_UNSAFE(s, i, n) { \ int32_t __N=(n); \ - while(__N>0 && (i)<(length)) { \ - UTF8_FWD_1_SAFE(s, i, length); \ + while(__N>0) { \ + U8_FWD_1_UNSAFE(s, i); \ --__N; \ } \ } -#define UTF8_SET_CHAR_START_SAFE(s, start, i) { \ - if(UTF8_IS_TRAIL((s)[(i)])) { \ +/** + * Advance the string offset from one code point boundary to the n-th next one, + * i.e., move forward by n code points. + * (Post-incrementing iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * @param s const UChar * string + * @param i string offset, i0 && (i)<(length)) { \ + U8_FWD_1(s, i, length); \ + --__N; \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to a UTF-8 trail byte, + * then the offset is moved backward to the corresponding lead byte. + * Otherwise, it is not modified. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const UChar * string + * @param i string offset + * @see U8_SET_CP_START + * @draft ICU 2.4 + */ +#define U8_SET_CP_START_UNSAFE(s, i) { \ + while(U8_IS_TRAIL((s)[i])) { --(i); } \ +} + +/** + * Adjust a random-access offset to a code point boundary + * at the start of a code point. + * If the offset points to a UTF-8 trail byte, + * then the offset is moved backward to the corresponding lead byte. + * Otherwise, it is not modified. + * "Safe" macro, checks for illegal sequences and for string boundaries. 
+ * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i + * @see U8_SET_CP_START_UNSAFE + * @draft ICU 2.4 + */ +#define U8_SET_CP_START(s, start, i) { \ + if(U8_IS_TRAIL((s)[(i)])) { \ (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ } \ } /* definitions with backward iteration -------------------------------------- */ -#define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \ +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Unsafe" macro, assumes well-formed UTF-8. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * The result is undefined if the offset is behind an illegal UTF-8 sequence. + * + * @param s const UChar * string + * @param i string offset + * @param c output UChar32 variable + * @see U8_PREV + * @draft ICU 2.4 + */ +#define U8_PREV_UNSAFE(s, i, c) { \ (c)=(s)[--(i)]; \ - if(UTF8_IS_TRAIL(c)) { \ + if(U8_IS_TRAIL(c)) { \ uint8_t __b, __count=1, __shift=6; \ \ /* c is a trail byte */ \ @@ -264,7 +464,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); for(;;) { \ __b=(s)[--(i)]; \ if(__b>=0xc0) { \ - UTF8_MASK_LEAD_BYTE(__b, __count); \ + U8_MASK_LEAD_BYTE(__b, __count); \ (c)|=(UChar32)__b<<__shift; \ break; \ } else { \ @@ -276,57 +476,151 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); } \ } -#define UTF8_BACK_1_UNSAFE(s, i) { \ - while(UTF8_IS_TRAIL((s)[--(i)])) {} \ -} - -#define UTF8_BACK_N_UNSAFE(s, i, n) { \ - int32_t __N=(n); \ - while(__N>0) { \ - UTF8_BACK_1_UNSAFE(s, i); \ - --__N; \ - } \ -} - -#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \ - UTF8_BACK_1_UNSAFE(s, i); \ - UTF8_FWD_1_UNSAFE(s, i); \ -} - -#define 
UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \ +/** + * Move the string offset from one code point boundary to the previous one + * and get the code point between them. + * (Pre-decrementing backward iteration.) + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * The input offset may be the same as the string length. + * If the offset is behind a multi-byte sequence, then the macro will read + * the whole sequence. + * If the offset is behind a lead byte, then that itself + * will be returned as the code point. + * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i + * @param length string length + * @param c output UChar32 variable, set to <0 in case of an error + * @see U8_PREV_UNSAFE + * @draft ICU 2.4 + */ +#define U8_PREV(s, start, i, c) { \ (c)=(s)[--(i)]; \ if((c)>=0x80) { \ if((c)<=0xbf) { \ - (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \ + (c)=utf8_prevCharSafeBody(s, start, &(i), c, -1); \ } else { \ - (c)=UTF8_ERROR_VALUE_1; \ + (c)=U_SENTINEL; \ } \ } \ } -#define UTF8_BACK_1_SAFE(s, start, i) { \ - if(UTF8_IS_TRAIL((s)[--(i)])) { \ +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const UChar * string + * @param i string offset + * @see U8_BACK_1 + * @draft ICU 2.4 + */ +#define U8_BACK_1_UNSAFE(s, i) { \ + while(U8_IS_TRAIL((s)[--(i)])) {} \ +} + +/** + * Move the string offset from one code point boundary to the previous one. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. 
+ * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i + * @see U8_BACK_1_UNSAFE + * @draft ICU 2.4 + */ +#define U8_BACK_1(s, start, i) { \ + if(U8_IS_TRAIL((s)[--(i)])) { \ (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \ } \ } -#define UTF8_BACK_N_SAFE(s, start, i, n) { \ +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-8. + * + * @param s const UChar * string + * @param i string offset + * @param n number of code points to skip + * @see U8_BACK_N + * @draft ICU 2.4 + */ +#define U8_BACK_N_UNSAFE(s, i, n) { \ int32_t __N=(n); \ - while(__N>0 && (i)>(start)) { \ - UTF8_BACK_1_SAFE(s, start, i); \ + while(__N>0) { \ + U8_BACK_1_UNSAFE(s, i); \ --__N; \ } \ } -/* - * Need to use UTF8_FWD_1_SAFE() because UTF8_BACK_1_SAFE() - * may have started from the middle of the sequence and not checked - * all trail bytes. +/** + * Move the string offset from one code point boundary to the n-th one before it, + * i.e., move backward by n code points. + * (Pre-decrementing backward iteration.) + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * @param s const UChar * string + * @param i string offset, i0 && (i)>(start)) { \ + U8_BACK_1(s, start, i); \ + --__N; \ + } \ +} + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind a partial multi-byte sequence, + * then the offset is incremented to behind the whole sequence. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Unsafe" macro, assumes well-formed UTF-8. 
+ * + * @param s const UChar * string + * @param i string offset + * @see U8_SET_CP_LIMIT + * @draft ICU 2.4 + */ +#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \ + U8_BACK_1_UNSAFE(s, i); \ + U8_FWD_1_UNSAFE(s, i); \ +} + +/** + * Adjust a random-access offset to a code point boundary after a code point. + * If the offset is behind a partial multi-byte sequence, + * then the offset is incremented to behind the whole sequence. + * Otherwise, it is not modified. + * The input offset may be the same as the string length. + * "Safe" macro, checks for illegal sequences and for string boundaries. + * + * @param s const UChar * string + * @param start starting string offset (usually 0) + * @param i string offset, start<=i<=length + * @param length string length + * @see U8_SET_CP_LIMIT_UNSAFE + * @draft ICU 2.4 + */ +#define U8_SET_CP_LIMIT(s, start, i, length) { \ if((start)<(i) && (i)<(length)) { \ - UTF8_BACK_1_SAFE(s, start, i); \ - UTF8_FWD_1_SAFE(s, i, length); \ + U8_BACK_1(s, start, i); \ + U8_FWD_1(s, i, length); \ } \ } diff --git a/icu4c/source/common/unicode/utf_old.h b/icu4c/source/common/unicode/utf_old.h new file mode 100644 index 00000000000..d859a54a0db --- /dev/null +++ b/icu4c/source/common/unicode/utf_old.h @@ -0,0 +1,1153 @@ +/* +******************************************************************************* +* +* Copyright (C) 2002, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: utf.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2002sep21 +* created by: Markus W. Scherer +*/ + +/** + * \file + * The macros in utf_old.h are all deprecated and their use discouraged. + * Some of the design principles behind the set of UTF macros + * have changed or proved impractical. + * Almost all of the old "UTF macros" are at least renamed. 
+ * If you are looking for a new equivalent to an old macro, please see the + * comment at the old one. + * + * utf_old.h is included by utf.h after unicode/umachine.h + * and some common definitions, so as not to break old code. + * + * Brief summary of reasons for deprecation: + * - Switch on UTF_SIZE (selection of UTF-8/16/32 default string processing) + * was impractical. + * - Switch on UTF_SAFE etc. (selection of unsafe/safe/strict default string processing) + * was of little use and impractical. + * - Whole classes of macros became obsolete outside of the UTF_SIZE/UTF_SAFE + * selection framework: UTF32_ macros (all trivial) + * and UTF_ default and intermediate macros (all aliases). + * - The selection framework also caused many macro aliases. + * - Change in Unicode standard: "irregular" sequences (3.0) became illegal (3.2). + * - Change of language in Unicode standard: + * Growing distinction between internal x-bit Unicode strings and external UTF-x + * forms, with the former more lenient. + * Suggests renaming of UTF16_ macros to U16_. + * - The prefix "UTF_" without a width number confused some users. + * - "Safe" append macros needed the addition of an error indicator output. + * - "Safe" UTF-8 macros used legitimate (if rarely used) code point values + * to indicate error conditions. + * - The use of the "_CHAR" infix for code point operations confused some users. + * + * More details: + * + * Until ICU 2.2, utf.h theoretically allowed choosing among UTF-8/16/32 + * for string processing, and among unsafe/safe/strict default macros for that. + * + * It proved nearly impossible to write non-trivial, high-performance code + * that is UTF-generic. + * Unsafe default macros would be dangerous for default string processing, + * and the main reason for the "strict" versions disappeared: + * Between Unicode 3.0 and 3.2 all "irregular" UTF-8 sequences became illegal.
+ * The only other conditions that "strict" checked for were non-characters, + * which are valid during processing. Only during text input/output should they + * be checked, and at that time other well-formedness checks may be + * necessary or useful as well. + * This can still be done by using U16_NEXT and U_IS_UNICODE_NONCHAR + * or U_IS_UNICODE_CHAR. + * + * The old UTF8_..._SAFE macros also used some normal Unicode code points + * to indicate malformed sequences. + * The new U8_ macros without suffix use negative values instead. + * + * The entire contents of utf32.h were moved here without replacement + * because all those macros were trivial and + * were meaningful only in the framework of choosing the UTF size. + * + * See Jitterbug 2150 and its discussion on the ICU mailing list + * in September 2002. + * + *
+ * + * Obsolete part of pre-ICU 2.4 utf.h file documentation: + * + *

The original concept for these files was for ICU to allow, + * in principle, selecting which UTF (UTF-8/16/32) is used internally + * by defining UTF_SIZE as either 8, 16, or 32. utf.h would then define the UChar type + * accordingly. UTF-16 was the default.

+ * + *

This concept has been abandoned. + * A lot of the ICU source code — especially low-level code like + * conversion, normalization, and collation — assumes UTF-16, + * and utf.h enforces the default of UTF-16. + * The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.

+ * + *

Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then + * UChar is defined to be exactly wchar_t, otherwise uint16_t.

+ * + *

UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit + * Unicode code point (Unicode scalar value, 0..0x10ffff). + * Before ICU 2.4, the definition of UChar32 was platform-dependent, similar to + * the definition of UChar. For details see the documentation for UChar32 itself.

+ * + *

utf.h also defines a number of C macros for handling single Unicode code points and + * for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual + * implementations of those macros and then aliases one set of them (for UTF-16) for general use. + * The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while + * the general alias macros always begin with UTF_...

+ * + *

Many string operations can be done with or without error checking. + * Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe" + * ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause + * program failures if the strings are not well-formed. The safe macros have an additional, boolean + * parameter "strict". If strict is FALSE, then only illegal sequences are detected. + * Otherwise, irregular sequences and non-characters are detected as well (like single surrogates). + * Safe macros return special error code points for illegal/irregular sequences: + * Typically, U+ffff, or values that would result in a code unit sequence of the same length + * as the erroneous input sequence.
+ * Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and + * they do not have start/length parameters for boundary checking.

+ * + *

Here, the macros are aliased in two steps: + * In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are + * aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures. + * Then, in a second step, the default, general alias macros are set to use either the unsafe or + * the safe/not strict (default) or the safe/strict macro; + * these general macros do not have a strictness parameter.

+ * + *

It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict. + * The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for + * Unicode string handling in ICU! To select one of these choices, define UTF_UNSAFE, UTF_SAFE, or UTF_STRICT, respectively.

+ * + *

For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix. + * Only in some cases may it be necessary to control the choice of macro directly and use a less generic alias. + * For example, if it can be assumed that a string is well-formed and the index will stay within the bounds, + * then the _UNSAFE version may be used. + * If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.

+ * + *
+ * + * @deprecated since ICU 2.4. Use the macros in utf.h, utf16.h, utf8.h instead. + */ + +#ifndef __UTF_OLD_H__ +#define __UTF_OLD_H__ + +/* Formerly utf.h, part 1 --------------------------------------------------- */ + +/** + * Unicode string and array offset and index type. + * ICU always counts Unicode code units (UChars) for + * string offsets, indexes, and lengths, not Unicode code points. + * + * @deprecated Use int32_t directly. UTextOffset to be removed after 2003-mar. + */ +typedef int32_t UTextOffset; + +/** Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode. @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF_SIZE 16 + +/** + * The default choice for general Unicode string macros is to use the ..._SAFE macro implementations + * with strict=FALSE. + * + * @deprecated since ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_SAFE +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#undef UTF_UNSAFE +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#undef UTF_STRICT + +/** + *

UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8, + * which need 1 or 2 bytes in UTF-8:
+ * U+0015 = NAK = Negative Acknowledge, C0 control character
+ * U+009f = highest C1 control character

+ * + *

These are used by UTF8_..._SAFE macros so that they can return an error value + * that needs the same number of code units (bytes) as were seen by + * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().

+ * + * @deprecated since ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF8_ERROR_VALUE_1 0x15 + +/** + * See documentation on UTF8_ERROR_VALUE_1 for details. + * + * @deprecated since ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF8_ERROR_VALUE_2 0x9f + +/** + * Error value for all UTFs. This code point value will be set by macros with error + * checking if an error is detected. + * + * @deprecated since ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_ERROR_VALUE 0xffff + +/** + * Is a given 32-bit code an error value + * as returned by one of the macros for any UTF? + * + * @deprecated since ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_IS_ERROR(c) \ + (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2) + +/** + * This is a combined macro: Is c a valid Unicode value _and_ not an error code? + * + * @deprecated since ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_IS_VALID(c) \ + (UTF_IS_UNICODE_CHAR(c) && \ + (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2) + +/** + * Is this code unit or code point a surrogate (U+d800..U+dfff)? + * @deprecated since ICU 2.4. Renamed to U_IS_SURROGATE and U16_IS_SURROGATE, see utf_old.h. + */ +#define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800) + +/** + * Is a given 32-bit code point a Unicode noncharacter? + * + * @deprecated since ICU 2.4. Renamed to U_IS_UNICODE_NONCHAR, see utf_old.h. + */ +#define UTF_IS_UNICODE_NONCHAR(c) \ + ((c)>=0xfdd0 && \ + ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \ + (uint32_t)(c)<=0x10ffff) + +/** + * Is a given 32-bit value a Unicode code point value (0..U+10ffff) + * that can be assigned a character? 
+ * + * Code points that are not characters include: + * - single surrogate code points (U+d800..U+dfff, 2048 code points) + * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points) + * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points) + * - the highest Unicode code point value is U+10ffff + * + * This means that all code points below U+d800 are character code points, + * and that boundary is tested first for performance. + * + * @deprecated since ICU 2.4. Renamed to U_IS_UNICODE_CHAR, see utf_old.h. + */ +#define UTF_IS_UNICODE_CHAR(c) \ + ((uint32_t)(c)<0xd800 || \ + ((uint32_t)(c)>0xdfff && \ + (uint32_t)(c)<=0x10ffff && \ + !UTF_IS_UNICODE_NONCHAR(c))) + +/* Formerly utf8.h ---------------------------------------------------------- */ + +/** + * Count the trail bytes for a UTF-8 lead byte. + * The argument is parenthesized before the uint8_t cast so that an expression argument is converted as a whole. + * @deprecated since ICU 2.4. Renamed to U8_COUNT_TRAIL_BYTES, see utf_old.h. + */ +#define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)(leadByte)]) + +/** + * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. + * @deprecated since ICU 2.4. Renamed to U8_MASK_LEAD_BYTE, see utf_old.h. + */ +#define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) + +/** Is this code point a single code unit (byte)? @deprecated since ICU 2.4. Renamed to U8_IS_SINGLE, see utf_old.h. */ +#define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0) +/** Is this code unit the lead code unit (byte) of a code point? @deprecated since ICU 2.4. Renamed to U8_IS_LEAD, see utf_old.h. */ +#define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e) +/** Is this code unit a trailing code unit (byte) of a code point? @deprecated since ICU 2.4. Renamed to U8_IS_TRAIL, see utf_old.h. */ +#define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80) + +/** Does this scalar Unicode value need multiple code units for storage? @deprecated since ICU 2.4.
Use U8_LENGTH or test ((uint32_t)(c)>0x7f) instead, see utf_old.h. */ +#define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f) + +/** + * Given the lead character, how many bytes are taken by this code point. + * ICU does not deal with code points >0x10ffff + * unless necessary for advancing in the byte stream. + * + * These length macros take into account that for values >0x10ffff + * the UTF8_APPEND_CHAR_SAFE macros would write the error code point 0xffff + * with 3 bytes. + * Code point comparisons need to be in uint32_t because UChar32 + * may be a signed type, and negative values must be recognized. + * + * @deprecated since ICU 2.4. Use U8_LENGTH instead, see utf_old.h. + */ +#if 1 +# define UTF8_CHAR_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \ + ) \ + ) +#else +# define UTF8_CHAR_LENGTH(c) \ + ((uint32_t)(c)<=0x7f ? 1 : \ + ((uint32_t)(c)<=0x7ff ? 2 : \ + ((uint32_t)(c)<=0xffff ? 3 : \ + ((uint32_t)(c)<=0x10ffff ? 4 : \ + ((uint32_t)(c)<=0x3ffffff ? 5 : \ + ((uint32_t)(c)<=0x7fffffff ? 6 : 3) \ + ) \ + ) \ + ) \ + ) \ + ) +#endif + +/** The maximum number of bytes per code point. @deprecated since ICU 2.4. Renamed to U8_MAX_LENGTH, see utf_old.h. */ +#define UTF8_MAX_CHAR_LENGTH 4 + +/** Average number of code units compared to UTF-16. @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF8_ARRAY_SIZE(size) ((5*(size))/2) + +/** @deprecated since ICU 2.4. Renamed to U8_GET_UNSAFE, see utf_old.h. */ +#define UTF8_GET_CHAR_UNSAFE(s, i, c) { \ + int32_t __I=(int32_t)(i); \ + UTF8_SET_CHAR_START_UNSAFE(s, __I); \ + UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \ +} + +/** @deprecated since ICU 2.4. Use U8_GET instead, see utf_old.h. */ +#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ + int32_t __I=(int32_t)(i); \ + UTF8_SET_CHAR_START_SAFE(s, start, __I); \ + UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \ +} + +/** @deprecated since ICU 2.4. 
Renamed to U8_NEXT_UNSAFE, see utf_old.h. */ +#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[(i)++]; \ + if((uint8_t)((c)-0xc0)<0x35) { \ + uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \ + UTF8_MASK_LEAD_BYTE(c, __count); \ + switch(__count) { \ + /* each following branch falls through to the next one */ \ + case 3: \ + (c)=((c)<<6)|((s)[(i)++]&0x3f); \ + case 2: \ + (c)=((c)<<6)|((s)[(i)++]&0x3f); \ + case 1: \ + (c)=((c)<<6)|((s)[(i)++]&0x3f); \ + /* no other branches to optimize switch() */ \ + break; \ + } \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_APPEND_UNSAFE, see utf_old.h. */ +#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else { \ + if((uint32_t)(c)<=0x7ff) { \ + (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ + } else { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ + } else { \ + (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ + (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ + } \ + (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_FWD_1_UNSAFE, see utf_old.h. */ +#define UTF8_FWD_1_UNSAFE(s, i) { \ + (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_FWD_N_UNSAFE, see utf_old.h. */ +#define UTF8_FWD_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF8_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_START_UNSAFE, see utf_old.h. */ +#define UTF8_SET_CHAR_START_UNSAFE(s, i) { \ + while(UTF8_IS_TRAIL((s)[i])) { --(i); } \ +} + +/** @deprecated since ICU 2.4. Use U8_NEXT instead, see utf_old.h. 
*/ +#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ + (c)=(s)[(i)++]; \ + if((c)>=0x80) { \ + if(UTF8_IS_LEAD(c)) { \ + (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict); \ + } else { \ + (c)=UTF8_ERROR_VALUE_1; \ + } \ + } \ +} + +/** @deprecated since ICU 2.4. Use U8_APPEND instead, see utf_old.h. */ +#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \ + if((uint32_t)(c)<=0x7f) { \ + (s)[(i)++]=(uint8_t)(c); \ + } else { \ + (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, NULL); \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_FWD_1, see utf_old.h. */ +#define UTF8_FWD_1_SAFE(s, i, length) U8_FWD_1(s, i, length) + +/** @deprecated since ICU 2.4. Renamed to U8_FWD_N, see utf_old.h. */ +#define UTF8_FWD_N_SAFE(s, i, length, n) U8_FWD_N(s, i, length, n) + +/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_START, see utf_old.h. */ +#define UTF8_SET_CHAR_START_SAFE(s, start, i) U8_SET_CP_START(s, start, i) + +/** @deprecated since ICU 2.4. Renamed to U8_PREV_UNSAFE, see utf_old.h. */ +#define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[--(i)]; \ + if(UTF8_IS_TRAIL(c)) { \ + uint8_t __b, __count=1, __shift=6; \ +\ + /* c is a trail byte */ \ + (c)&=0x3f; \ + for(;;) { \ + __b=(s)[--(i)]; \ + if(__b>=0xc0) { \ + UTF8_MASK_LEAD_BYTE(__b, __count); \ + (c)|=(UChar32)__b<<__shift; \ + break; \ + } else { \ + (c)|=(UChar32)(__b&0x3f)<<__shift; \ + ++__count; \ + __shift+=6; \ + } \ + } \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_BACK_1_UNSAFE, see utf_old.h. */ +#define UTF8_BACK_1_UNSAFE(s, i) { \ + while(UTF8_IS_TRAIL((s)[--(i)])) {} \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_BACK_N_UNSAFE, see utf_old.h. */ +#define UTF8_BACK_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF8_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_LIMIT_UNSAFE, see utf_old.h. 
*/ +#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \ + UTF8_BACK_1_UNSAFE(s, i); \ + UTF8_FWD_1_UNSAFE(s, i); \ +} + +/** @deprecated since ICU 2.4. Use U8_PREV instead, see utf_old.h. */ +#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \ + (c)=(s)[--(i)]; \ + if((c)>=0x80) { \ + if((c)<=0xbf) { \ + (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \ + } else { \ + (c)=UTF8_ERROR_VALUE_1; \ + } \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U8_BACK_1, see utf_old.h. */ +#define UTF8_BACK_1_SAFE(s, start, i) U8_BACK_1(s, start, i) + +/** @deprecated since ICU 2.4. Renamed to U8_BACK_N, see utf_old.h. */ +#define UTF8_BACK_N_SAFE(s, start, i, n) U8_BACK_N(s, start, i, n) + +/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_LIMIT, see utf_old.h. */ +#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) U8_SET_CP_LIMIT(s, start, i, length) + +/* Formerly utf16.h --------------------------------------------------------- */ + +/** Is uchar a first/lead surrogate? @deprecated since ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h. */ +#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800) + +/** Is uchar a second/trail surrogate? @deprecated since ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h. */ +#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00) + +/** Assuming c is a surrogate, is it a first/lead surrogate? @deprecated since ICU 2.4. Renamed to U_IS_SURROGATE_LEAD and U16_IS_SURROGATE_LEAD, see utf_old.h. */ +#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0) + +/** Helper constant for UTF16_GET_PAIR_VALUE. @deprecated since ICU 2.4. Renamed to U16_SURROGATE_OFFSET, see utf_old.h. */ +#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000) + +/** Get the UTF-32 value from the surrogate code units. @deprecated since ICU 2.4. Renamed to U16_GET_SUPPLEMENTARY, see utf_old.h. 
*/ +#define UTF16_GET_PAIR_VALUE(first, second) \ + (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET) + +/** @deprecated since ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */ +#define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0) + +/** @deprecated since ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */ +#define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00) + +/** @deprecated since ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */ +#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary) + +/** @deprecated since ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */ +#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary) + +/** @deprecated since ICU 2.4. Renamed to U16_IS_SINGLE, see utf_old.h. */ +#define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar) + +/** @deprecated since ICU 2.4. Renamed to U16_IS_LEAD, see utf_old.h. */ +#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar) + +/** @deprecated since ICU 2.4. Renamed to U16_IS_TRAIL, see utf_old.h. */ +#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar) + +/** Does this scalar Unicode value need multiple code units for storage? @deprecated since ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead, see utf_old.h. */ +#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff) + +/** @deprecated since ICU 2.4. Renamed to U16_LENGTH, see utf_old.h. */ +#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2) + +/** @deprecated since ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h. */ +#define UTF16_MAX_CHAR_LENGTH 2 + +/** Average number of code units compared to UTF-16. @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF16_ARRAY_SIZE(size) (size) + +/** + * Get a single code point from an offset that points to any + * of the code units that belong to that code point. 
+ * Assume 0<=i=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ + (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ + /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ + } else if(strict) {\ + /* unmatched second surrogate */ \ + (c)=UTF_ERROR_VALUE; \ + } \ + } \ + } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */ +#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[(i)++]; \ + if(UTF_IS_FIRST_SURROGATE(c)) { \ + (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h. */ +#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h. */ +#define UTF16_FWD_1_UNSAFE(s, i) { \ + if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \ + ++(i); \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */ +#define UTF16_FWD_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF16_FWD_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */ +#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \ + if(UTF_IS_SECOND_SURROGATE((s)[i])) { \ + --(i); \ + } \ +} + +/** @deprecated since ICU 2.4. Use U16_NEXT instead, see utf_old.h. 
*/ +#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ + (c)=(s)[(i)++]; \ + if(UTF_IS_FIRST_SURROGATE(c)) { \ + uint16_t __c2; \ + if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=UTF16_GET_PAIR_VALUE((c), __c2); \ + /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ + } else if(strict) {\ + /* unmatched first surrogate */ \ + (c)=UTF_ERROR_VALUE; \ + } \ + } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ + /* unmatched second surrogate or other non-character */ \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated since ICU 2.4. Use U16_APPEND instead, see utf_old.h. */ +#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \ + if((uint32_t)(c)<=0xffff) { \ + (s)[(i)++]=(uint16_t)(c); \ + } else if((uint32_t)(c)<=0x10ffff) { \ + if((i)+1<(length)) { \ + (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \ + (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \ + } else /* not enough space */ { \ + (s)[(i)++]=UTF_ERROR_VALUE; \ + } \ + } else /* c>0x10ffff, write error value */ { \ + (s)[(i)++]=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */ +#define UTF16_FWD_1_SAFE(s, i, length) U16_FWD_1(s, i, length) + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */ +#define UTF16_FWD_N_SAFE(s, i, length, n) U16_FWD_N(s, i, length, n) + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */ +#define UTF16_SET_CHAR_START_SAFE(s, start, i) U16_SET_CP_START(s, start, i) + +/** @deprecated since ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */ +#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[--(i)]; \ + if(UTF_IS_SECOND_SURROGATE(c)) { \ + (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h. */ +#define UTF16_BACK_1_UNSAFE(s, i) { \ + if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \ + --(i); \ + } \ +} + +/** @deprecated since ICU 2.4. 
Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */ +#define UTF16_BACK_N_UNSAFE(s, i, n) { \ + int32_t __N=(n); \ + while(__N>0) { \ + UTF16_BACK_1_UNSAFE(s, i); \ + --__N; \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h. */ +#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \ + if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \ + ++(i); \ + } \ +} + +/** @deprecated since ICU 2.4. Use U16_PREV instead, see utf_old.h. */ +#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \ + (c)=(s)[--(i)]; \ + if(UTF_IS_SECOND_SURROGATE(c)) { \ + uint16_t __c2; \ + if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \ + --(i); \ + (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \ + /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \ + } else if(strict) {\ + /* unmatched second surrogate */ \ + (c)=UTF_ERROR_VALUE; \ + } \ + } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \ + /* unmatched first surrogate or other non-character */ \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated since ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */ +#define UTF16_BACK_1_SAFE(s, start, i) U16_BACK_1(s, start, i) + +/** @deprecated since ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */ +#define UTF16_BACK_N_SAFE(s, start, i, n) U16_BACK_N(s, start, i, n) + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. */ +#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length) + +/* Formerly utf32.h --------------------------------------------------------- */ + +/* +* Old documentation: +* +* This file defines macros to deal with UTF-32 code units and code points. +* Signatures and semantics are the same as for the similarly named macros +* in utf16.h. +* utf32.h is included by utf.h after unicode/umachine.h

+* and some common definitions. +*

Usage: ICU coding guidelines for if() statements should be followed when using these macros. +* Compound statements (curly braces {}) must be used for if-else-while... +* bodies and all macro statements should be terminated with semicolon.

+*/ + +/* internal definitions ----------------------------------------------------- */ + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_SAFE(c, strict) \ + (!(strict) ? \ + (uint32_t)(c)<=0x10ffff : \ + UTF_IS_UNICODE_CHAR(c)) + +/* + * For the semantics of all of these macros, see utf16.h. + * The UTF-32 versions are trivial because any code point is + * encoded using exactly one code unit. + */ + +/* single-code point definitions -------------------------------------------- */ + +/* classes of code unit values */ + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_SINGLE(uchar) 1 +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_LEAD(uchar) 0 +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_IS_TRAIL(uchar) 0 + +/* number of code units per code point */ + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_NEED_MULTIPLE_UCHAR(c) 0 +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_CHAR_LENGTH(c) 1 +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_MAX_CHAR_LENGTH 1 + +/* average number of code units compared to UTF-16 */ + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_ARRAY_SIZE(size) (size) + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_GET_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[i]; \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \ + (c)=(s)[i]; \ + if(!UTF32_IS_SAFE(c, strict)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/* definitions with forward iteration --------------------------------------- */ + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[(i)++]; \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. 
*/ +#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) { \ + (s)[(i)++]=(c); \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_1_UNSAFE(s, i) { \ + ++(i); \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_N_UNSAFE(s, i, n) { \ + (i)+=(n); \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_START_UNSAFE(s, i) { \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) { \ + (c)=(s)[(i)++]; \ + if(!UTF32_IS_SAFE(c, strict)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) { \ + if((uint32_t)(c)<=0x10ffff) { \ + (s)[(i)++]=(c); \ + } else /* c>0x10ffff, write 0xfffd */ { \ + (s)[(i)++]=0xfffd; \ + } \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_1_SAFE(s, i, length) { \ + ++(i); \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_FWD_N_SAFE(s, i, length, n) { \ + if(((i)+=(n))>(length)) { \ + (i)=(length); \ + } \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \ +} + +/* definitions with backward iteration -------------------------------------- */ + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_PREV_CHAR_UNSAFE(s, i, c) { \ + (c)=(s)[--(i)]; \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_1_UNSAFE(s, i) { \ + --(i); \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_N_UNSAFE(s, i, n) { \ + (i)-=(n); \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. 
*/ +#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \ + (c)=(s)[--(i)]; \ + if(!UTF32_IS_SAFE(c, strict)) { \ + (c)=UTF_ERROR_VALUE; \ + } \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_1_SAFE(s, start, i) { \ + --(i); \ +} + +/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_BACK_N_SAFE(s, start, i, n) { \ + (i)-=(n); \ + if((i)<(start)) { \ + (i)=(start); \ + } \ +} + +/** Note: takes (s, i, length), without the start parameter of the UTF8_/UTF16_ counterparts. @deprecated since ICU 2.4. Obsolete, see utf_old.h. */ +#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) { \ +} + +/* Formerly utf.h, part 2 --------------------------------------------------- */ + +/** + * Estimate the number of code units for a string based on the number of UTF-16 code units. + * + * @deprecated since ICU 2.4. Obsolete, see utf_old.h. + */ +#define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size) + +/** @deprecated since ICU 2.4. Renamed to U16_GET_UNSAFE, see utf_old.h. */ +#define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c) + +/** @deprecated since ICU 2.4. Use U16_GET instead, see utf_old.h. */ +#define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) + + +/** @deprecated since ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */ +#define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c) + +/** @deprecated since ICU 2.4. Use U16_NEXT instead, see utf_old.h. */ +#define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) + + +/** @deprecated since ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h. */ +#define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c) + +/** @deprecated since ICU 2.4. Use U16_APPEND instead, see utf_old.h. */ +#define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c) + + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h.
*/ +#define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i) + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */ +#define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length) + + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */ +#define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n) + +/** @deprecated since ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */ +#define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n) + + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */ +#define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i) + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */ +#define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i) + + +/** @deprecated since ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */ +#define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c) + +/** @deprecated since ICU 2.4. Use U16_PREV instead, see utf_old.h. */ +#define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) + + +/** @deprecated since ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h. */ +#define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i) + +/** @deprecated since ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */ +#define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i) + + +/** @deprecated since ICU 2.4. Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */ +#define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n) + +/** @deprecated since ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */ +#define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n) + + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h. */ +#define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) + +/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. 
*/ +#define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) + +/* Define default macros (UTF-16 "safe") ------------------------------------ */ + +/** + * Does this code unit alone encode a code point (BMP, not a surrogate)? + * Same as UTF16_IS_SINGLE. + * @deprecated since ICU 2.4. Renamed to U_IS_SINGLE and U16_IS_SINGLE, see utf_old.h. + */ +#define UTF_IS_SINGLE(uchar) U16_IS_SINGLE(uchar) + +/** + * Is this code unit the first one of several (a lead surrogate)? + * Same as UTF16_IS_LEAD. + * @deprecated since ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h. + */ +#define UTF_IS_LEAD(uchar) U16_IS_LEAD(uchar) + +/** + * Is this code unit one of several but not the first one (a trail surrogate)? + * Same as UTF16_IS_TRAIL. + * @deprecated since ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h. + */ +#define UTF_IS_TRAIL(uchar) U16_IS_TRAIL(uchar) + +/** + * Does this code point require multiple code units (is it a supplementary code point)? + * Same as UTF16_NEED_MULTIPLE_UCHAR. + * @deprecated since ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead. + */ +#define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c) + +/** + * How many code units are used to encode this code point (1 or 2)? + * Same as UTF16_CHAR_LENGTH. + * @deprecated since ICU 2.4. Renamed to U16_LENGTH, see utf_old.h. + */ +#define UTF_CHAR_LENGTH(c) U16_LENGTH(c) + +/** + * How many code units are used at most for any Unicode code point (2)? + * Same as UTF16_MAX_CHAR_LENGTH. + * @deprecated since ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h. + */ +#define UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH + +/** + * Set c to the code point that contains the code unit i. + * i could point to the lead or the trail surrogate for the code point. + * i is not modified. + * Same as UTF16_GET_CHAR. 
+ * \pre 0<=i= 0) { + return (UChar *)string + index; + } else { + return NULL; } - - if (single) - { - const UChar *matchItr; - const UChar *strItr; - - for (strItr = string; *strItr; strItr++) - { - for (matchItr = matchSet; *matchItr; matchItr++) - { - if (*matchItr == *strItr) - { - return (UChar *)strItr; - } - } - } - } - else - { - int32_t matchItr; - int32_t strItr; - UChar32 stringCh, matchSetCh; - int32_t stringLen = u_strlen(string); - - for (strItr = 0; strItr < stringLen; strItr++) - { - UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE); - for (matchItr = 0; matchItr < matchLen; matchItr++) - { - UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE); - if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE - || string[strItr] == UTF_ERROR_VALUE - || (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr])))) - { - return (UChar *)string + strItr; - } - } - } - } - - /* Didn't find it. */ - return NULL; } /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ U_CAPI int32_t U_EXPORT2 u_strcspn(const UChar *string, const UChar *matchSet) { - const UChar *foundStr = u_strpbrk(string, matchSet); - if (foundStr == NULL) - { - return u_strlen(string); + int32_t index = _matchFromSet(string, matchSet, TRUE); + if(index >= 0) { + return index; + } else { + return -index - 1; /* == u_strlen(string) */ } - return foundStr - string; } /* Search for a codepoint in a string that does not match one of the matchSet codepoints. 
*/ U_CAPI int32_t U_EXPORT2 u_strspn(const UChar *string, const UChar *matchSet) { - UBool single = TRUE; - UBool match = TRUE; - int32_t matchLen; - int32_t retValue; - - for (matchLen = 0; matchSet[matchLen]; matchLen++) - { - if (!UTF_IS_SINGLE(matchSet[matchLen])) - { - single = FALSE; - } + int32_t index = _matchFromSet(string, matchSet, FALSE); + if(index >= 0) { + return index; + } else { + return -index - 1; /* == u_strlen(string) */ } - - if (single) - { - const UChar *matchItr; - const UChar *strItr; - - for (strItr = string; *strItr && match; strItr++) - { - match = FALSE; - for (matchItr = matchSet; *matchItr; matchItr++) - { - if (*matchItr == *strItr) - { - match = TRUE; - break; - } - } - } - retValue = strItr - string - (match == FALSE); - } - else - { - int32_t matchItr; - int32_t strItr; - UChar32 stringCh, matchSetCh; - int32_t stringLen = u_strlen(string); - - for (strItr = 0; strItr < stringLen && match; strItr++) - { - match = FALSE; - UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE); - for (matchItr = 0; matchItr < matchLen; matchItr++) - { - UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE); - if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE - || string[strItr] == UTF_ERROR_VALUE - || (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr])))) - { - match = TRUE; - break; - } - } - } - retValue = strItr - (match == FALSE); - } - - /* Found a mismatch or didn't find it. 
*/ - return retValue; } /* ----- Text manipulation functions --- */ diff --git a/icu4c/source/common/ustrtrns.c b/icu4c/source/common/ustrtrns.c index 15b9576ecc5..df6da4263cc 100644 --- a/icu4c/source/common/ustrtrns.c +++ b/icu4c/source/common/ustrtrns.c @@ -228,7 +228,6 @@ u_strFromUTF8(UChar *dest, int32_t index = 0; int32_t reqLength = 0; uint8_t* pSrc = (uint8_t*) src; - UBool isError; /* args check */ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ @@ -249,8 +248,8 @@ u_strFromUTF8(UChar *dest, if(ch <=0x7f){ *pDest++=(UChar)ch; }else{ - ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError); - if(isError){ + ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1); + if(ch<0){ *pErrorCode = U_INVALID_CHAR_FOUND; return NULL; }else if(ch<=0xFFFF){ @@ -272,8 +271,8 @@ u_strFromUTF8(UChar *dest, if(ch <= 0x7f){ reqLength++; }else{ - ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError); - if(isError){ + ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1); + if(ch<0){ *pErrorCode = U_INVALID_CHAR_FOUND; return NULL; } diff --git a/icu4c/source/common/utf_impl.c b/icu4c/source/common/utf_impl.c index ce3eb9bc0ef..5ace9404c38 100644 --- a/icu4c/source/common/utf_impl.c +++ b/icu4c/source/common/utf_impl.c @@ -83,7 +83,7 @@ utf8_errorValue[6]={ }; U_CAPI UChar32 U_EXPORT2 -utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError) { +utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) { int32_t i=*pi; uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); if((i)+count<=(length)) { @@ -118,10 +118,11 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, illegal|=(trail&0xc0)^0x80; break; case 0: - if(pIsError!=NULL) { - *pIsError=TRUE; + if(strict>=0) { + return UTF8_ERROR_VALUE_1; + } else { + return U_SENTINEL; } - return UTF8_ERROR_VALUE_1; /* no default branch to optimize switch() - all values are covered */ } @@ 
-132,6 +133,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, * Starting with Unicode 3.0.1, non-shortest forms are illegal. * Starting with Unicode 3.2, surrogate code points must not be * encoded in UTF-8, and there are no irregular sequences any more. + * + * U8_ macros (new in ICU 2.4) return negative values for error conditions. */ /* correct sequence - all trail bytes have (b7..b6)==(10)? */ @@ -145,21 +148,14 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, ++(i); --count; } - c=utf8_errorValue[errorCount-count]; - if(pIsError!=NULL) { - *pIsError=TRUE; + if(strict>=0) { + c=utf8_errorValue[errorCount-count]; + } else { + c=U_SENTINEL; } - } else if((strict) && UTF_IS_UNICODE_NONCHAR(c)) { + } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) { /* strict: forbid non-characters like U+fffe */ c=utf8_errorValue[count]; - if(pIsError!=NULL) { - *pIsError=TRUE; - } - } else { - /* good result */ - if(pIsError!=NULL) { - *pIsError=FALSE; - } } } else /* too few bytes left */ { /* error handling */ @@ -168,9 +164,10 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, while((i)<(length) && UTF8_IS_TRAIL(s[i])) { ++(i); } - c=utf8_errorValue[i-i0]; - if(pIsError!=NULL) { - *pIsError=TRUE; + if(strict>=0) { + c=utf8_errorValue[i-i0]; + } else { + c=U_SENTINEL; } } *pi=i; @@ -178,8 +175,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, } U_CAPI int32_t U_EXPORT2 -utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) { - if((c)<=0x7ff) { +utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) { + if((uint32_t)(c)<=0x7ff) { if((i)+1<(length)) { (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); @@ -187,7 +184,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) { } } else if((uint32_t)(c)<=0xffff) { /* Starting with Unicode 3.2, 
surrogate code points must not be encoded in UTF-8. */ - if((i)+2<(length) && !UTF_IS_SURROGATE(c)) { + if((i)+2<(length) && !U_IS_SURROGATE(c)) { (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); @@ -203,18 +200,22 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) { } } /* c>0x10ffff or not enough space, write an error value */ - length-=i; - if(length>0) { - int32_t offset; - if(length>3) { - length=3; + if(pIsError!=NULL) { + *pIsError=TRUE; + } else { + length-=i; + if(length>0) { + int32_t offset; + if(length>3) { + length=3; + } + s+=i; + offset=0; + c=utf8_errorValue[length-1]; + UTF8_APPEND_CHAR_UNSAFE(s, offset, c); + i=i+offset; } - s+=i; - offset=0; - c=utf8_errorValue[length-1]; - UTF8_APPEND_CHAR_UNSAFE(s, offset, c); - i=i+offset; - } + } return i; } @@ -229,7 +230,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U for(;;) { if(i<=start) { /* no lead byte at all */ - c=UTF8_ERROR_VALUE_1; + if(strict>=0) { + return UTF8_ERROR_VALUE_1; + } else { + return U_SENTINEL; + } break; } @@ -250,7 +255,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U if(count>=4) { count=3; } - c=utf8_errorValue[count]; + if(strict>=0) { + c=utf8_errorValue[count]; + } else { + c=U_SENTINEL; + } } else { /* exit with correct c */ } @@ -260,9 +269,17 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U include the trail byte that we started with */ if(count=0) { + c=utf8_errorValue[count]; + } else { + c=U_SENTINEL; + } } else { - c=UTF8_ERROR_VALUE_1; + if(strict>=0) { + c=UTF8_ERROR_VALUE_1; + } else { + c=U_SENTINEL; + } } } break; @@ -273,12 +290,20 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U shift+=6; } else { /* more than 5 trail bytes is illegal */ - c=UTF8_ERROR_VALUE_1; + if(strict>=0) { + c=UTF8_ERROR_VALUE_1; + } else { + 
c=U_SENTINEL; + } break; } } else { /* single-byte character precedes trailing bytes */ - c=UTF8_ERROR_VALUE_1; + if(strict>=0) { + c=UTF8_ERROR_VALUE_1; + } else { + c=U_SENTINEL; + } break; } } diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c index ee5bfed94ac..33bb27ea664 100644 --- a/icu4c/source/test/cintltst/cucdtst.c +++ b/icu4c/source/test/cintltst/cucdtst.c @@ -942,39 +942,39 @@ static void TestCodePoint(){ UChar32 c=codePoint[i]; log_verbose("Testing code unit value of \\u%4X\n", c); if(i<6){ - if(!UTF_IS_SURROGATE(c)){ + if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){ log_err("ERROR: isSurrogate() failed for \\u%4X\n", c); } if(UTF_IS_VALID(c)){ log_err("ERROR: isValid() failed for \\u%4X\n", c); } - if(UTF_IS_UNICODE_CHAR(c)){ + if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c); } if(UTF_IS_ERROR(c)){ log_err("ERROR: isError() failed for \\u%4X\n", c); } }else if(i >=6 && i<18){ - if(UTF_IS_SURROGATE(c)){ + if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ log_err("ERROR: isSurrogate() failed for \\u%4X\n", c); } if(!UTF_IS_VALID(c)){ log_err("ERROR: isValid() failed for \\u%4X\n", c); } - if(!UTF_IS_UNICODE_CHAR(c)){ + if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c); } if(UTF_IS_ERROR(c)){ log_err("ERROR: isError() failed for \\u%4X\n", c); } }else if(i >=18 && i<20){ - if(UTF_IS_SURROGATE(c)){ + if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ log_err("ERROR: isSurrogate() failed for \\u%4X\n", c); } if(UTF_IS_VALID(c)){ log_err("ERROR: isValid() failed for \\u%4X\n", c); } - if(!UTF_IS_UNICODE_CHAR(c)){ + if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c); } if(!UTF_IS_ERROR(c)){ @@ -982,13 +982,13 @@ static void TestCodePoint(){ } } else if(i >=18 && 
i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){ - if(UTF_IS_SURROGATE(c)){ + if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ log_err("ERROR: isSurrogate() failed for \\u%4X\n", c); } if(UTF_IS_VALID(c)){ log_err("ERROR: isValid() failed for \\u%4X\n", c); } - if(UTF_IS_UNICODE_CHAR(c)){ + if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c); } if(!UTF_IS_ERROR(c)){ @@ -1018,7 +1018,7 @@ static void TestCharLength() UBool multiple; for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){ UChar32 c=codepoint[i+1]; - if(UTF_CHAR_LENGTH(c) != codepoint[i]){ + if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){ log_err("The no: of code units for \\u%4X:- Expected: %d Got: %d", c, codepoint[i], UTF_CHAR_LENGTH(c)); }else{ log_verbose("The no: of code units for \\u%4X is %d", c, UTF_CHAR_LENGTH(c)); @@ -1457,7 +1457,6 @@ static void TestStringFunctions() static void TestStringSearching() { - UChar ucharBuf[255]; const UChar testString[] = {0x0061, 0x0062, 0x0063, 0x0064, 0x0064, 0x0061, 0}; const UChar testSurrogateString[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0x0063, 0x0064, 0x0064, 0xdbff, 0xdfff, 0xdb00, 0xdf00, 0x0061, 0}; const UChar surrMatchSet1[] = {0xdbff, 0xdfff, 0}; @@ -1467,55 +1466,67 @@ static void TestStringSearching() const UChar surrMatchSetBad[] = {0xdbff, 0x0061, 0}; const UChar surrMatchSetBad2[] = {0x0061, 0xdbff, 0}; const UChar surrMatchSetBad3[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0}; /* has partial surrogate */ + const UChar + empty[] = { 0 }, + a[] = { 0x61, 0 }, + ab[] = { 0x61, 0x62, 0 }, + ba[] = { 0x62, 0x61, 0 }, + abcd[] = { 0x61, 0x62, 0x63, 0x64, 0 }, + cd[] = { 0x63, 0x64, 0 }, + dc[] = { 0x64, 0x63, 0 }, + cdh[] = { 0x63, 0x64, 0x68, 0 }, + f[] = { 0x66, 0 }, + fg[] = { 0x66, 0x67, 0 }, + gf[] = { 0x67, 0x66, 0 }; log_verbose("Testing u_strpbrk()"); - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, 
"a")) != &testString[0]) { + if (u_strpbrk(testString, a) != &testString[0]) { log_err("u_strpbrk couldn't find first letter a.\n"); } - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "dc")) != &testString[2]) { + if (u_strpbrk(testString, dc) != &testString[2]) { log_err("u_strpbrk couldn't find d or c.\n"); } - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cd")) != &testString[2]) { + if (u_strpbrk(testString, cd) != &testString[2]) { log_err("u_strpbrk couldn't find c or d.\n"); } - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cdh")) != &testString[2]) { + if (u_strpbrk(testString, cdh) != &testString[2]) { log_err("u_strpbrk couldn't find c, d or h.\n"); } - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "f")) != NULL) { + if (u_strpbrk(testString, f) != NULL) { log_err("u_strpbrk didn't return NULL for \"f\".\n"); } - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "fg")) != NULL) { + if (u_strpbrk(testString, fg) != NULL) { log_err("u_strpbrk didn't return NULL for \"fg\".\n"); } - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "gf")) != NULL) { + if (u_strpbrk(testString, gf) != NULL) { log_err("u_strpbrk didn't return NULL for \"gf\".\n"); } - if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "")) != NULL) { + if (u_strpbrk(testString, empty) != NULL) { log_err("u_strpbrk didn't return NULL for \"\".\n"); } log_verbose("Testing u_strpbrk() with surrogates"); - if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != &testSurrogateString[1]) { + if (u_strpbrk(testSurrogateString, a) != &testSurrogateString[1]) { log_err("u_strpbrk couldn't find first letter a.\n"); } - if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != &testSurrogateString[5]) { + if (u_strpbrk(testSurrogateString, dc) != &testSurrogateString[5]) { log_err("u_strpbrk couldn't find d or c.\n"); } - if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != &testSurrogateString[5]) { + if (u_strpbrk(testSurrogateString, cd) != &testSurrogateString[5]) { 
log_err("u_strpbrk couldn't find c or d.\n"); } - if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != &testSurrogateString[5]) { + if (u_strpbrk(testSurrogateString, cdh) != &testSurrogateString[5]) { log_err("u_strpbrk couldn't find c, d or h.\n"); } - if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != NULL) { + if (u_strpbrk(testSurrogateString, f) != NULL) { log_err("u_strpbrk didn't return NULL for \"f\".\n"); } - if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != NULL) { + if (u_strpbrk(testSurrogateString, fg) != NULL) { log_err("u_strpbrk didn't return NULL for \"fg\".\n"); } - if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != NULL) { + if (u_strpbrk(testSurrogateString, gf) != NULL) { log_err("u_strpbrk didn't return NULL for \"gf\".\n"); } if (u_strpbrk(testSurrogateString, surrMatchSet1) != &testSurrogateString[3]) { @@ -1536,49 +1547,49 @@ static void TestStringSearching() log_verbose("Testing u_strcspn()"); - if (u_strcspn(testString, u_uastrcpy(ucharBuf, "a")) != 0) { + if (u_strcspn(testString, a) != 0) { log_err("u_strcspn couldn't find first letter a.\n"); } - if (u_strcspn(testString, u_uastrcpy(ucharBuf, "dc")) != 2) { + if (u_strcspn(testString, dc) != 2) { log_err("u_strcspn couldn't find d or c.\n"); } - if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cd")) != 2) { + if (u_strcspn(testString, cd) != 2) { log_err("u_strcspn couldn't find c or d.\n"); } - if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cdh")) != 2) { + if (u_strcspn(testString, cdh) != 2) { log_err("u_strcspn couldn't find c, d or h.\n"); } - if (u_strcspn(testString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testString)) { + if (u_strcspn(testString, f) != u_strlen(testString)) { log_err("u_strcspn didn't return NULL for \"f\".\n"); } - if (u_strcspn(testString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testString)) { + if (u_strcspn(testString, fg) != u_strlen(testString)) { log_err("u_strcspn didn't return NULL for 
\"fg\".\n"); } - if (u_strcspn(testString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testString)) { + if (u_strcspn(testString, gf) != u_strlen(testString)) { log_err("u_strcspn didn't return NULL for \"gf\".\n"); } log_verbose("Testing u_strcspn() with surrogates"); - if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != 1) { + if (u_strcspn(testSurrogateString, a) != 1) { log_err("u_strcspn couldn't find first letter a.\n"); } - if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 5) { + if (u_strcspn(testSurrogateString, dc) != 5) { log_err("u_strcspn couldn't find d or c.\n"); } - if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 5) { + if (u_strcspn(testSurrogateString, cd) != 5) { log_err("u_strcspn couldn't find c or d.\n"); } - if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != 5) { + if (u_strcspn(testSurrogateString, cdh) != 5) { log_err("u_strcspn couldn't find c, d or h.\n"); } - if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testSurrogateString)) { + if (u_strcspn(testSurrogateString, f) != u_strlen(testSurrogateString)) { log_err("u_strcspn didn't return NULL for \"f\".\n"); } - if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testSurrogateString)) { + if (u_strcspn(testSurrogateString, fg) != u_strlen(testSurrogateString)) { log_err("u_strcspn didn't return NULL for \"fg\".\n"); } - if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testSurrogateString)) { + if (u_strcspn(testSurrogateString, gf) != u_strlen(testSurrogateString)) { log_err("u_strcspn didn't return NULL for \"gf\".\n"); } if (u_strcspn(testSurrogateString, surrMatchSet1) != 3) { @@ -1597,25 +1608,25 @@ static void TestStringSearching() log_verbose("Testing u_strspn()"); - if (u_strspn(testString, u_uastrcpy(ucharBuf, "a")) != 1) { + if (u_strspn(testString, a) != 1) { log_err("u_strspn couldn't skip first letter a.\n"); } - if (u_strspn(testString, 
u_uastrcpy(ucharBuf, "ab")) != 2) { + if (u_strspn(testString, ab) != 2) { log_err("u_strspn couldn't skip a or b.\n"); } - if (u_strspn(testString, u_uastrcpy(ucharBuf, "ba")) != 2) { + if (u_strspn(testString, ba) != 2) { log_err("u_strspn couldn't skip a or b.\n"); } - if (u_strspn(testString, u_uastrcpy(ucharBuf, "f")) != 0) { + if (u_strspn(testString, f) != 0) { log_err("u_strspn didn't return 0 for \"f\".\n"); } - if (u_strspn(testString, u_uastrcpy(ucharBuf, "dc")) != 0) { + if (u_strspn(testString, dc) != 0) { log_err("u_strspn couldn't find first letter a (skip d or c).\n"); } - if (u_strspn(testString, u_uastrcpy(ucharBuf, "abcd")) != u_strlen(testString)) { + if (u_strspn(testString, abcd) != u_strlen(testString)) { log_err("u_strspn couldn't skip over the whole string.\n"); } - if (u_strspn(testString, u_uastrcpy(ucharBuf, "")) != 0) { + if (u_strspn(testString, empty) != 0) { log_err("u_strspn should have returned 0 for empty string.\n"); } @@ -1626,13 +1637,13 @@ static void TestStringSearching() if (u_strspn(testSurrogateString, surrMatchSetBad2) != 2) { log_err("u_strspn couldn't skip 0xdbff or a.\n"); } - if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != 0) { + if (u_strspn(testSurrogateString, f) != 0) { log_err("u_strspn couldn't skip d or c (skip first letter).\n"); } - if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 0) { + if (u_strspn(testSurrogateString, dc) != 0) { log_err("u_strspn couldn't skip d or c (skip first letter).\n"); } - if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 0) { + if (u_strspn(testSurrogateString, cd) != 0) { log_err("u_strspn couldn't skip d or c (skip first letter).\n"); } if (u_strspn(testSurrogateString, testSurrogateString) != u_strlen(testSurrogateString)) { diff --git a/icu4c/source/test/cintltst/utf16tst.c b/icu4c/source/test/cintltst/utf16tst.c index e5367367b22..9dd395a9145 100644 --- a/icu4c/source/test/cintltst/utf16tst.c +++ 
b/icu4c/source/test/cintltst/utf16tst.c @@ -21,6 +21,7 @@ #include "cintltst.h" #include +#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) static void printUChars(const UChar *uchars); @@ -31,6 +32,7 @@ static void TestNextPrevChar(void); static void TestFwdBack(void); static void TestSetChar(void); static void TestAppendChar(void); +static void TestAppend(void); static void TestSurrogate(void); void addUTF16Test(TestNode** root); @@ -45,6 +47,7 @@ addUTF16Test(TestNode** root) addTest(root, &TestFwdBack, "utf16tst/TestFwdBack" ); addTest(root, &TestSetChar, "utf16tst/TestSetChar" ); addTest(root, &TestAppendChar, "utf16tst/TestAppendChar" ); + addTest(root, &TestAppend, "utf16tst/TestAppend" ); /* same utf16tst/ tree as the sibling registrations */ addTest(root, &TestSurrogate, "utf16tst/TestSurrogate" ); } @@ -57,17 +60,17 @@ static void TestCodeUnitValues() UChar c=codeunit[i]; log_verbose("Testing code unit value of %x\n", c); if(i<4){ - if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c)){ + if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c) || !U16_IS_SINGLE(c) || U16_IS_LEAD(c) || U16_IS_TRAIL(c)){ log_err("ERROR: %x is a single character\n", c); } } if(i >= 4 && i< 8){ - if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c)){ + if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c) || !U16_IS_LEAD(c) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c)){ log_err("ERROR: %x is a first surrogate\n", c); } } if(i >= 8 && i< 12){ - if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c)){ + if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || !U16_IS_TRAIL(c) || U16_IS_SINGLE(c) || U16_IS_LEAD(c)){ log_err("ERROR: %x is a second surrogate\n", c); } } @@ -93,7 +96,7 @@ static void TestCharLength() UBool multiple; for(i=0; i 0){ + U16_BACK_1_UNSAFE(input, offunsafe); + if(offunsafe != back_unsafe[i]){ + log_err("ERROR: U16_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); + } + i++; + } + i=0; while(offsafe > 0){
UTF16_BACK_1_SAFE(input,0, offsafe); @@ -321,6 +408,16 @@ static void TestFwdBack(){ } i++; } + + i=0; + while(offsafe > 0){ + U16_BACK_1(input,0, offsafe); + if(offsafe != back_safe[i]){ + log_err("ERROR: U16_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe); /* print the back_safe[] entry that was compared */ + } + i++; + } + offunsafe=0; offsafe=0; for(i=0; i 0) { setOffset=offset; UTF16_SET_CHAR_LIMIT_UNSAFE(input, setOffset); if(setOffset != limit_unsafe[i]){ log_err("ERROR: UTF16_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset); } + + setOffset=offset; + U16_SET_CP_LIMIT_UNSAFE(input, setOffset); + if(setOffset != limit_unsafe[i]){ + log_err("ERROR: U16_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset); + } } + setOffset=offset; - UTF16_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR); + U16_SET_CP_LIMIT(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR); if(setOffset != limit_safe[i]){ - log_err("ERROR: UTF16_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset); + log_err("ERROR: U16_SET_CP_LIMIT failed for offset=%ld.
Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset); } + i++; } } @@ -487,6 +643,67 @@ static void TestAppendChar(){ } +static void TestAppend() { + static const UChar32 codePoints[]={ + 0x61, 0xdf, 0x901, 0x3040, + 0xac00, 0xd800, 0xdbff, 0xdcde, + 0xdffd, 0xe000, 0xffff, 0x10000, + 0x12345, 0xe0021, 0x10ffff, 0x110000, + 0x234567, 0x7fffffff, -1, -1000, + 0, 0x400 + }; + static const UChar expectUnsafe[]={ + 0x61, 0xdf, 0x901, 0x3040, + 0xac00, 0xd800, 0xdbff, 0xdcde, + 0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00, + 0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */ + /* none from this line */ + 0, 0x400 + }, expectSafe[]={ + 0x61, 0xdf, 0x901, 0x3040, + 0xac00, 0xd800, 0xdbff, 0xdcde, + 0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00, + 0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */ + /* none from this line */ + 0, 0x400 + }; + + UChar buffer[100]; + UChar32 c; + int32_t i, length; + UBool isError, expectIsError, wrongIsError; + + length=0; + for(i=0; i= 0 : c != result[i+1]){ + log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); + } + UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE); if(c != result[i+1]){ log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } + UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE); if(c != result[i+2]){ log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c); @@ -225,7 +241,18 @@ static void TestNextPrevChar(){ if(c != result[i]){ log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. 
Expected:%lx Got:%lx\n", offset, result[i], c); } + + setOffset=offset; + U8_NEXT_UNSAFE(input, setOffset, c); + if(setOffset != movedOffset[i]){ + log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[i], setOffset); + } + if(c != result[i]){ + log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); + } } + setOffset=offset; UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE); if(setOffset != movedOffset[i+1]){ @@ -235,6 +262,17 @@ static void TestNextPrevChar(){ if(c != result[i+1]){ log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } + + setOffset=offset; + U8_NEXT(input, setOffset, sizeof(input), c); + if(setOffset != movedOffset[i+1]){ + log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[i+1], setOffset); + } + if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){ + log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); + } + setOffset=offset; UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE); if(setOffset != movedOffset[i+1]){ @@ -244,8 +282,10 @@ static void TestNextPrevChar(){ if(c != result[i+2]){ log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c); } + i=i+6; } + i=0; for(offset=sizeof(input); offset > 0; --offset){ setOffset=offset; @@ -257,6 +297,7 @@ static void TestNextPrevChar(){ if(c != result[i+3]){ log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } + setOffset=offset; UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); if(setOffset != movedOffset[i+4]){ @@ -266,6 +307,17 @@ static void TestNextPrevChar(){ if(c != result[i+4]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+4], c); } + + setOffset=offset; + U8_PREV(input, 0, setOffset, c); + if(setOffset != movedOffset[i+4]){ + log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", + offset, movedOffset[i+4], setOffset); + } + if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){ + log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); + } + setOffset=offset; UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); if(setOffset != movedOffset[i+5]){ @@ -275,6 +327,7 @@ static void TestNextPrevChar(){ if(c != result[i+5]){ log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c); } + i=i+6; } @@ -295,6 +348,7 @@ static void TestFwdBack(){ uint32_t offunsafe=0, offsafe=0; + uint32_t i=0; while(offunsafe < sizeof(input)){ UTF8_FWD_1_UNSAFE(input, offunsafe); @@ -303,6 +357,16 @@ static void TestFwdBack(){ } i++; } + + i=0; + while(offunsafe < sizeof(input)){ + U8_FWD_1_UNSAFE(input, offunsafe); + if(offunsafe != fwd_unsafe[i]){ + log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe); + } + i++; + } + i=0; while(offsafe < sizeof(input)){ UTF8_FWD_1_SAFE(input, offsafe, sizeof(input)); @@ -311,6 +375,16 @@ static void TestFwdBack(){ } i++; } + + i=0; + while(offsafe < sizeof(input)){ + U8_FWD_1(input, offsafe, sizeof(input)); + if(offsafe != fwd_safe[i]){ + log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe); + } + i++; + } + offunsafe=sizeof(input); i=0; while(offunsafe > 0){ @@ -320,6 +394,17 @@ static void TestFwdBack(){ } i++; } + + offunsafe=sizeof(input); + i=0; + while(offunsafe > 0){ + U8_BACK_1_UNSAFE(input, offunsafe); + if(offunsafe != back_unsafe[i]){ + log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe); + } + i++; + } + i=0; offsafe=sizeof(input); while(offsafe > 0){ @@ -329,14 +414,34 @@ static 
void TestFwdBack(){ } i++; } + + i=0; + offsafe=sizeof(input); + while(offsafe > 0){ + U8_BACK_1(input, 0, offsafe); + if(offsafe != back_safe[i]){ + log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe); /* print the back_safe[] entry that was compared */ + } + i++; + } + offunsafe=0; - offsafe=0; for(i=0; i