diff --git a/icu4c/source/common/caniter.cpp b/icu4c/source/common/caniter.cpp
index fdfa0a53e92..83148e2cc04 100644
--- a/icu4c/source/common/caniter.cpp
+++ b/icu4c/source/common/caniter.cpp
@@ -582,7 +582,7 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
int32_t inputLen = 0;
UChar decomp[decompSize];
- UTF_APPEND_CHAR(temp, inputLen, bufSize, comp);
+ U16_APPEND_UNSAFE(temp, inputLen, comp);
int32_t decompLen = unorm_getDecomposition(comp, FALSE, decomp, decompSize);
if(decompLen < 0) {
decompLen = -decompLen;
@@ -597,7 +597,9 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
UChar32 decompCp;
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
- int32_t i = 0;
+ int32_t i;
+ UBool overflow = FALSE;
+
i = segmentPos;
while(i < segLen) {
UTF_NEXT_CHAR(segment, i, segLen, cp);
@@ -620,7 +622,19 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
// brute force approach
- UTF_APPEND_CHAR(buff, bufLen, bufSize, cp);
+ U16_APPEND(buff, bufLen, bufSize, cp, overflow);
+
+ if(overflow) {
+ /*
+ * ### TODO handle buffer overflow
+ * The buffer is large, but an overflow may still happen with
+ * unusual input (many combining marks?).
+ * Reallocate buffer and continue.
+ * markus 20020929
+ */
+
+ overflow = FALSE;
+ }
/* TODO: optimize
// since we know that the classes are monotonically increasing, after zero
diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp
index 548c4470506..9785298ad1a 100644
--- a/icu4c/source/common/common.dsp
+++ b/icu4c/source/common/common.dsp
@@ -3100,6 +3100,10 @@ InputPath=.\unicode\utf8.h
# End Source File
# Begin Source File
+SOURCE=.\unicode\utf_old.h
+# End Source File
+# Begin Source File
+
SOURCE=.\util.h
# End Source File
# Begin Source File
diff --git a/icu4c/source/common/ucnv_cnv.c b/icu4c/source/common/ucnv_cnv.c
index 3180ca99cc0..34b8ca95482 100644
--- a/icu4c/source/common/ucnv_cnv.c
+++ b/icu4c/source/common/ucnv_cnv.c
@@ -141,7 +141,7 @@ ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length
/* get the first code point in the buffer */
i=0;
- UTF_NEXT_CHAR_SAFE(buffer, i, length, c, FALSE);
+ UTF_NEXT_CHAR(buffer, i, length, c);
if(iUCharErrorBuffer;
diff --git a/icu4c/source/common/unicode/ustring.h b/icu4c/source/common/unicode/ustring.h
index 8b25a9a2005..32e95cd212f 100644
--- a/icu4c/source/common/unicode/ustring.h
+++ b/icu4c/source/common/unicode/ustring.h
@@ -156,7 +156,7 @@ u_strstr(const UChar *s, const UChar *substring);
* but u_strchr32() will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
- * This behavior ensures that UTF_GET_CHAR(u_strchr32(c))==c.
+ * This behavior ensures that U16_GET(u_strchr32(c))==c.
*
* @param s The string to search.
* @param c The code point (0..0x10ffff) to find.
@@ -628,7 +628,7 @@ u_memchr(const UChar *src, UChar ch, int32_t count);
* but u_memchr32() will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
- * This behavior ensures that UTF_GET_CHAR(u_memchr32(c))==c.
+ * This behavior ensures that U16_GET(u_memchr32(c))==c.
*
* @param src string to search in
* @param ch character to find
diff --git a/icu4c/source/common/unicode/utf.h b/icu4c/source/common/unicode/utf.h
index fd352add9e7..bdad0aeefae 100644
--- a/icu4c/source/common/unicode/utf.h
+++ b/icu4c/source/common/unicode/utf.h
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 1999-2001, International Business Machines
+* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -15,77 +15,89 @@
*/
/**
-* \file
-* \brief C API: UChar and UChar32 data types and UTF macros for C Unicode string handling
-*
-* This file defines the UChar and UChar32 data types for Unicode code units
-* and code points, as well as macros for efficiently getting code points
-* in and out of a string.
-*
-* utf.h is included by utypes.h and itself includes the utfXX.h after some
-* common definitions. Those files define the macros for each UTF-size.
-*
-* The original concept for these files was for ICU to allow
-* in principle to set which UTF (UTF-8/16/32) is used internally
-* by defining UTF_SIZE to either 8, 16, or 32. utf.h would then define the UChar type
-* accordingly. UTF-16 was the default.
-*
-* This concept has been abandoned.
-* A lot of the ICU source code — especially low-level code like
-* conversion, normalization, and collation — assumes UTF-16,
-* utf.h enforces the default of UTF-16.
-* The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.
-*
-* Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
-* UChar is defined to be exactly wchar_t, otherwise uint16_t.
-*
-* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
-* Unicode code point (Unicode scalar value, 0..0x10ffff).
-* Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
-* the definition of UChar. For details see the documentation for UChar32 itself.
-*
-* utf.h also defines a number of C macros for handling single Unicode code points and
-* for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual
-* implementations of those macros and then aliases one set of them (for UTF-16) for general use.
-* The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while
-* the general alias macros always begin with UTF_...
-*
-* Many string operations can be done with or without error checking.
-* Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe"
-* ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause
-* program failures if the strings are not well-formed. The safe macros have an additional, boolean
-* parameter "strict". If strict is FALSE, then only illegal sequences are detected.
-* Otherwise, irregular sequences and non-characters are detected as well (like single surrogates).
-* Safe macros return special error code points for illegal/irregular sequences:
-* Typically, U+ffff, or values that would result in a code unit sequence of the same length
-* as the erroneous input sequence.
-* Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and
-* they do not have start/length parameters for boundary checking.
-*
-* Here, the macros are aliased in two steps:
-* In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are
-* aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures.
-* Then, in a second step, the default, general alias macros are set to use either the unsafe or
-* the safe/not strict (default) or the safe/strict macro;
-* these general macros do not have a strictness parameter.
-*
-* It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict.
-* The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for
-* Unicode string handling in ICU! To select this, define UTF_SAFE, UTF_STRICT, or UTF_UNSAFE.
-*
-* For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix.
-* Only in some cases it may be necessary to control the choice of macro directly and use a less generic alias.
-* For example, if it can be assumed that a string is well-formed and the index will stay within the bounds,
-* then the _UNSAFE version may be used.
-* If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.
-* Usage: ICU coding guidelines for if() statements should be followed when using these macros.
-* Compound statements (curly braces {}) must be used for if-else-while...
-* bodies and all macro statements should be terminated with semicolon.
-*/
+ * \file
+ * \brief C API: UChar and UChar32 data types and code point macros
+ *
+ * This file defines the UChar and UChar32 data types for Unicode code units
+ * and code points, as well as macros for checking whether a code point is
+ * a surrogate or a non-character.
+ *
+ * utf.h is included by utypes.h and itself includes utf8.h and utf16.h after some
+ * common definitions. Those files define macros for efficiently getting code points
+ * in and out of UTF-8/16 strings.
+ * utf16.h macros have "U16_" prefixes.
+ * utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling.
+ *
+ * ICU processes 16-bit Unicode strings.
+ * Most of the time, such strings are well-formed UTF-16.
+ * Single, unpaired surrogates must be handled as well, and are treated in ICU
+ * like regular code points where possible.
+ * (Pairs of surrogate code points are indistinguishable from supplementary
+ * code points encoded as pairs of surrogate code units.)
+ *
+ * In fact, almost all Unicode code points in normal text (>99%)
+ * are on the BMP (<=U+ffff) and even <=U+d7ff.
+ * ICU functions handle supplementary code points (U+10000..U+10ffff)
+ * but are optimized for the much more frequently occurring BMP code points.
+ *
+ * utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
+ * UChar is defined to be exactly wchar_t, otherwise uint16_t.
+ *
+ * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
+ * Unicode code point (Unicode scalar value, 0..0x10ffff).
+ * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
+ * the definition of UChar. For details see the documentation for UChar32 itself.
+ *
+ * utf.h also defines a small number of C macros for single Unicode code points.
+ * These are simple checks for surrogates and non-characters.
+ * For actual Unicode character properties see uchar.h.
+ *
+ * By default, string operations must be done with error checking in case
+ * a string is not well-formed UTF-16.
+ * The macros will detect if a surrogate code unit is unpaired
+ * (lead unit without trail unit or vice versa) and just return the unit itself
+ * as the code point.
+ * (It is an accidental property of Unicode and UTF-16 that all
+ * malformed sequences can be expressed unambiguously with a distinct subrange
+ * of Unicode code points.)
+ *
+ * When it is safe to assume that text is well-formed UTF-16
+ * (does not contain single, unpaired surrogates), then one can use
+ * U16_..._UNSAFE macros.
+ * These do not check for proper code unit sequences or truncated text and may
+ * yield wrong results or even cause a crash if they are used with "malformed"
+ * text.
+ * In practice, U16_..._UNSAFE macros will produce slightly less code but
+ * should not be faster because the processing is only different when a
+ * surrogate code unit is detected, which will be rare.
+ *
+ * Similarly for UTF-8, there are "safe" macros without a suffix,
+ * and U8_..._UNSAFE versions.
+ * The performance differences are much larger here because UTF-8 provides so
+ * many opportunities for malformed sequences.
+ * The unsafe UTF-8 macros are entirely implemented inside the macro definitions
+ * and are fast, while the safe UTF-8 macros call functions for all but the
+ * trivial (ASCII) cases.
+ *
+ * Unlike with UTF-16, malformed sequences cannot be expressed with distinct
+ * code point values (0..U+10ffff). They are indicated with negative values instead.
+ *
+ * For more information see the ICU User Guide Strings chapter
+ * (http://oss.software.ibm.com/icu/userguide/).
+ *
+ * Usage:
+ * ICU coding guidelines for if() statements should be followed when using these macros.
+ * Compound statements (curly braces {}) must be used for if-else-while...
+ * bodies and all macro statements should be terminated with semicolon.
+ *
+ * @draft ICU 2.4
+ */
#ifndef __UTF_H__
#define __UTF_H__
+/* wchar_t-related definitions ---------------------------------------------- */
+
/*
* ANSI C headers:
* stddef.h defines wchar_t
@@ -94,18 +106,11 @@
#include
/* include the utfXX.h after the following definitions */
-/* If there is no compiler option for the preferred UTF size, then default to UTF-16. */
-#ifndef UTF_SIZE
- /** Number of bits in a Unicode string code unit, same as x in UTF-x (8, 16, or 32). */
-# define UTF_SIZE 16
-#endif
-
-/** Number of bytes in a UChar (sizeof(UChar)). */
-#define U_SIZEOF_UCHAR (UTF_SIZE>>3)
-
/*!
* \def U_SIZEOF_WCHAR_T
* U_SIZEOF_WCHAR_T==sizeof(wchar_t).
+ *
+ * @stable
*/
#ifndef U_HAVE_WCHAR_H
# define U_HAVE_WCHAR_H 1
@@ -120,10 +125,14 @@
/*!
* \def U_WCHAR_IS_UTF16
* Defined if wchar_t uses UTF-16.
+ *
+ * @stable
*/
/*!
* \def U_WCHAR_IS_UTF32
* Defined if wchar_t uses UTF-32.
+ *
+ * @stable
*/
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
# ifdef __STDC_ISO_10646__
@@ -145,139 +154,10 @@
# endif
#endif
-/**
- * Define UChar32 as a type for single Unicode code points.
- * UChar32 is a signed 32-bit integer.
- *
- * The Unicode code point range is 0..0x10ffff.
- * All other values (negative or >=0x110000) are illegal as Unicode code points.
- * They may be used as sentinel values to indicate "done", "error"
- * or similar non-code point conditions.
- *
- * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
- * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
- * or else to be uint32_t.
- * That is, the definition of UChar32 was platform-dependent.
- *
- * @see UTF_SENTINEL
- * @draft ICU 2.4
- */
-typedef int32_t UChar32;
+/* UChar and UChar32 definitions -------------------------------------------- */
-/**
- * Unicode string and array offset and index type.
- * ICU always counts Unicode code units (UChars) for
- * string offsets, indexes, and lengths, not Unicode code points.
- *
- * @deprecated Use int32_t directly. UTextOffset to be removed after 2003-mar.
- */
-typedef int32_t UTextOffset;
-
-/* Specify which macro versions are the default ones - safe or fast. */
-#if !defined(UTF_SAFE) && !defined(UTF_STRICT) && !defined(UTF_UNSAFE)
- /**
- * The default choice for general Unicode string macros is to use the ..._SAFE macro implementations
- * with strict=FALSE. See the utf.h file description.
- */
-# define UTF_SAFE
-#endif
-
-/* internal definitions ----------------------------------------------------- */
-
-/**
- * UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8,
- * which need 1 or 2 bytes in UTF-8:
- * U+0015 = NAK = Negative Acknowledge, C0 control character
- * U+009f = highest C1 control character
- *
- * These are used by ("safe") UTF-8 macros so that they can return an error value
- * that needs the same number of code units (bytes) as were seen by
- * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().
- *
- * @internal
- */
-#define UTF8_ERROR_VALUE_1 0x15
-/**
- * See documentation on UTF8_ERROR_VALUE_1 for details.
- */
-#define UTF8_ERROR_VALUE_2 0x9f
-
-/**
- * Error value for all UTFs. This code point value will be set by macros with error
- * checking if an error is detected.
- */
-#define UTF_ERROR_VALUE 0xffff
-
-/* single-code point definitions -------------------------------------------- */
-
-/**
- * This value is intended for sentinel values for APIs that
- * (take or) return single code points (UChar32).
- * It is outside of the Unicode code point range 0..0x10ffff.
- *
- * For example, a "done" or "error" value in a new API
- * could be indicated with UTF_SENTINEL.
- *
- * ICU APIs designed before ICU 2.4 usually define service-specific "done"
- * values, mostly 0xffff.
- * Those may need to be distinguished from
- * actual U+ffff text contents by calling functions like
- * CharacterIterator::hasNext() or UnicodeString::length().
- *
- * @see UChar32
- * @draft ICU 2.4
- */
-#define UTF_SENTINEL (-1)
-
-/** Is this code unit or code point a surrogate (U+d800..U+dfff)? */
-#define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800)
-
-/**
- * Is a given 32-bit code point a Unicode noncharacter?
- */
-#define UTF_IS_UNICODE_NONCHAR(c) \
- ((c)>=0xfdd0 && \
- ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
- (uint32_t)(c)<=0x10ffff)
-
-/**
- * Is a given 32-bit code point/Unicode scalar value
- * actually a valid Unicode (abstract) character?
- *
- * Code points that are not characters include:
- * - single surrogate code points (U+d800..U+dfff, 2048 code points)
- * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
- * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
- * - the highest Unicode code point value is U+10ffff
- *
- * This means that all code points below U+d800 are character code points,
- * and that boundary is tested first for performance.
- */
-#define UTF_IS_UNICODE_CHAR(c) \
- ((uint32_t)(c)<0xd800 || \
- ((uint32_t)(c)>0xdfff && \
- (uint32_t)(c)<=0x10ffff && \
- !UTF_IS_UNICODE_NONCHAR(c)))
-
-/**
- * Is a given 32-bit code an error value
- * as returned by one of the macros for any UTF?
- */
-#define UTF_IS_ERROR(c) \
- (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
-
-/** This is a combined macro: Is c a valid Unicode value _and_ not an error code? */
-#define UTF_IS_VALID(c) \
- (UTF_IS_UNICODE_CHAR(c) && \
- (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2)
-
-/* include the utfXX.h ------------------------------------------------------ */
-
-#include "unicode/utf8.h"
-#include "unicode/utf16.h"
-#include "unicode/utf32.h"
-
-/* Define types and macros according to the selected UTF size. -------------- */
+/** Number of bytes in a UChar. @stable */
+#define U_SIZEOF_UCHAR 2
/*!
* \var UChar
@@ -290,262 +170,127 @@ typedef int32_t UTextOffset;
* @stable
*/
-#if UTF_SIZE==8
-
-# error UTF-8 is not implemented, undefine UTF_SIZE or define it to 16
-
-/*
- * ANSI C header:
- * limits.h defines CHAR_MAX
- */
-# include
-
- /* Define UChar to be compatible with char if possible. */
-# if CHAR_MAX>=255
- typedef char UChar;
-# else
- typedef uint8_t UChar;
-# endif
-
-#elif UTF_SIZE==16
-
- /* Define UChar to be compatible with wchar_t if possible. */
-# if U_SIZEOF_WCHAR_T==2
- typedef wchar_t UChar;
-# else
- typedef uint16_t UChar;
-# endif
-
- /** Does this code unit alone encode a code point? */
-# define UTF_IS_SINGLE(uchar) UTF16_IS_SINGLE(uchar)
- /** Is this code unit the first one of several? */
-# define UTF_IS_LEAD(uchar) UTF16_IS_LEAD(uchar)
- /** Is this code unit one of several but not the first one? */
-# define UTF_IS_TRAIL(uchar) UTF16_IS_TRAIL(uchar)
-
- /** Does this code point require multiple code units? */
-# define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c)
- /** How many code units are used to encode this code point? */
-# define UTF_CHAR_LENGTH(c) UTF16_CHAR_LENGTH(c)
- /** How many code units are used at most for any Unicode code point? */
-# define UTF_MAX_CHAR_LENGTH UTF16_MAX_CHAR_LENGTH
- /** Estimate the number of code units for a string based on the number of UTF-16 code units. */
-# define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size)
-
- /** See file documentation and UTF_GET_CHAR. */
-# define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c)
- /** See file documentation and UTF_GET_CHAR. */
-# define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)
-
- /** See file documentation and UTF_NEXT_CHAR. */
-# define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c)
- /** See file documentation and UTF_NEXT_CHAR. */
-# define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
-
- /** See file documentation and UTF_APPEND_CHAR. */
-# define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c)
- /** See file documentation and UTF_APPEND_CHAR. */
-# define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c)
-
- /** See file documentation and UTF_FWD_1. */
-# define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i)
- /** See file documentation and UTF_FWD_1. */
-# define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length)
-
- /** See file documentation and UTF_FWD_N. */
-# define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n)
- /** See file documentation and UTF_FWD_N. */
-# define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n)
-
- /** See file documentation and UTF_SET_CHAR_START. */
-# define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i)
- /** See file documentation and UTF_SET_CHAR_START. */
-# define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i)
-
- /** See file documentation and UTF_PREV_CHAR. */
-# define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c)
- /** See file documentation and UTF_PREV_CHAR. */
-# define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)
-
- /** See file documentation and UTF_BACK_1. */
-# define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i)
- /** See file documentation and UTF_BACK_1. */
-# define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i)
-
- /** See file documentation and UTF_BACK_N. */
-# define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n)
- /** See file documentation and UTF_BACK_N. */
-# define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n)
-
- /** See file documentation and UTF_SET_CHAR_LIMIT. */
-# define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
- /** See file documentation and UTF_SET_CHAR_LIMIT. */
-# define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)
-
-#elif UTF_SIZE==32
-
-# error UTF-32 is not implemented, undefine UTF_SIZE or define it to 16
-
- typedef UChar32 UChar;
-
+/* Define UChar to be compatible with wchar_t if possible. */
+#if U_SIZEOF_WCHAR_T==2
+ typedef wchar_t UChar;
#else
-# error UTF_SIZE must be undefined or one of { 8, 16, 32 } - only 16 is implemented
+ typedef uint16_t UChar;
#endif
-/* Define the default macros for handling UTF characters. ------------------- */
+/**
+ * Define UChar32 as a type for single Unicode code points.
+ * UChar32 is a signed 32-bit integer (same as int32_t).
+ *
+ * The Unicode code point range is 0..0x10ffff.
+ * All other values (negative or >=0x110000) are illegal as Unicode code points.
+ * They may be used as sentinel values to indicate "done", "error"
+ * or similar non-code point conditions.
+ *
+ * Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
+ * to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
+ * or else to be uint32_t.
+ * That is, the definition of UChar32 was platform-dependent.
+ *
+ * @see U_SENTINEL
+ * @draft ICU 2.4
+ */
+typedef int32_t UChar32;
+
+/* single-code point definitions -------------------------------------------- */
/**
- * \def UTF_GET_CHAR(s, start, i, length, c)
+ * This value is intended for sentinel values for APIs that
+ * (take or) return single code points (UChar32).
+ * It is outside of the Unicode code point range 0..0x10ffff.
+ *
+ * For example, a "done" or "error" value in a new API
+ * could be indicated with U_SENTINEL.
*
- * Set c to the code point that contains the code unit i.
- * i could point to the first, the last, or an intermediate code unit.
- * i is not modified.
- * \pre 0<=i=0xfdd0 && \
+ ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
+ (uint32_t)(c)<=0x10ffff)
/**
- * \def UTF_APPEND_CHAR(s, i, length, c)
+ * Is c a Unicode code point value (0..U+10ffff)
+ * that can be assigned a character?
*
- * Append the code units of code point c to the string at index i
- * and advance i to beyond the new code units (post-increment).
- * The code units beginning at index i will be overwritten.
- * \pre 0<=c<=0x10ffff
- * \pre 0<=i0xdfff && \
+ (uint32_t)(c)<=0x10ffff && \
+ !U_IS_UNICODE_NONCHAR(c)))
/**
- * \def UTF_FWD_1(s, i, length)
- *
- * Advance i to beyond the code units of the code point that begins at i.
- * I.e., advance i by one code point.
- * i must point to the first code unit of a code point.
- * \pre 0<=i
-* Usage: ICU coding guidelines for if() statements should be followed when using these macros.
-* Compound statements (curly braces {}) must be used for if-else-while...
-* bodies and all macro statements should be terminated with semicolon.
-*/
+ * \file
+ * \brief C API: 16-bit Unicode handling macros
+ *
+ * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
+ * utf16.h is included by utf.h after unicode/umachine.h
+ * and some common definitions.
+ *
+ * For more information see utf.h and the ICU User Guide Strings chapter
+ * (http://oss.software.ibm.com/icu/userguide/).
+ *
+ * Usage:
+ * ICU coding guidelines for if() statements should be followed when using these macros.
+ * Compound statements (curly braces {}) must be used for if-else-while...
+ * bodies and all macro statements should be terminated with semicolon.
+ */
+
+/* utf.h must be included first. */
+#ifndef __UTF_H__
+# include "unicode/utf.h"
+#endif
#ifndef __UTF16_H__
#define __UTF16_H__
/* single-code point definitions -------------------------------------------- */
-/* handle surrogate pairs */
-#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)
-#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)
-
-#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)
-
-/** Get the UTF-32 value directly from the surrogate pseudo-characters */
-#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
-
-#define UTF16_GET_PAIR_VALUE(first, second) \
- (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)
-
-/* get the first and second surrogates for a supplementary code point */
/**
- * Takes a supplementary code point (0x10000..0x10ffff)
- * and computes the first surrogate (0xd800..0xdbff)
- * for UTF-16 encoding.
+ * Does this code unit alone encode a code point (BMP, not a surrogate)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @draft ICU 2.4
*/
-#define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
+#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
/**
- * Takes a supplementary code point (0x10000..0x10ffff)
- * and computes the second surrogate (0xdc00..0xdfff)
- * for UTF-16 encoding.
+ * Is this code unit a lead surrogate (U+d800..U+dbff)?
+ * @param c 16-bit code unit
+ * @return TRUE or FALSE
+ * @draft ICU 2.4
*/
-#define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
-
-/** alias for UTF_FIRST_SURROGATE */
-#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary)
-
-/** alias for UTF_SECOND_SURROGATE */
-#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary)
-
-/* classes of code unit values */
-#define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar)
-#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
-#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)
-
-/* number of code units per code point */
-#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
-#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
-#define UTF16_MAX_CHAR_LENGTH 2
-
-/* average number of code units compared to UTF-16 */
-#define UTF16_ARRAY_SIZE(size) (size)
+#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
/**
- * Get a single code point from an offset that points to any
- * of the code units that belong to that code point.
- * Assume 0<=i>10)+0xd7c0)
+
+/**
+ * Get the trail surrogate (0xdc00..0xdfff) for a
+ * supplementary code point (0x10000..0x10ffff).
+ * @param supplementary 32-bit code point (U+10000..U+10ffff)
+ * @return trail surrogate (U+dc00..U+dfff) for supplementary
+ * @draft ICU 2.4
+ */
+#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
+
+/**
+ * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
+ * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
+ * @param c 32-bit code point
+ * @return 1 or 2
+ * @draft ICU 2.4
+ */
+#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+
+/**
+ * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
+ * @return 2
+ * @draft ICU 2.4
+ */
+#define U16_MAX_LENGTH 2
+
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * The offset may point to either the lead or trail surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the adjacent matching surrogate as well.
+ * The result is undefined if the offset points to a single, unpaired surrogate.
+ * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U16_GET
+ * @draft ICU 2.4
+ */
+#define U16_GET_UNSAFE(s, i, c) { \
(c)=(s)[i]; \
- if(UTF_IS_SURROGATE(c)) { \
- if(UTF_IS_SURROGATE_FIRST(c)) { \
- (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
+ if(U16_IS_SURROGATE(c)) { \
+ if(U16_IS_SURROGATE_LEAD(c)) { \
+ (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
} else { \
- (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
+ (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
} \
} \
}
-#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The offset may point to either the lead or trail surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the adjacent matching surrogate as well.
+ * If the offset points to a single, unpaired surrogate, then that itself
+ * will be returned as the code point.
+ * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<=i=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
- (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
- /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
- } else if(strict) {\
- /* unmatched second surrogate */ \
- (c)=UTF_ERROR_VALUE; \
+ if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
- } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
- (c)=UTF_ERROR_VALUE; \
} \
}
/* definitions with forward iteration --------------------------------------- */
-/*
- * all the macros that go forward assume that
- * the initial offset is 0<=i>10)+0xd7c0); \
+ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+ } else /* c>0x10ffff or not enough space */ { \
+ (isError)=TRUE; \
+ } \
+}
+
+/**
+ * Advance the string offset from one code point boundary to the next.
+ * (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_FWD_1
+ * @draft ICU 2.4
+ */
+#define U16_FWD_1_UNSAFE(s, i) { \
+ if(U16_IS_LEAD((s)[(i)++])) { \
++(i); \
} \
}
-#define UTF16_FWD_N_UNSAFE(s, i, n) { \
+/**
+ * Advance the string offset from one code point boundary to the next.
+ * (Post-incrementing iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param i string offset, i0) { \
- UTF16_FWD_1_UNSAFE(s, i); \
+ U16_FWD_1_UNSAFE(s, i); \
--__N; \
} \
}
/**
- * Set a random-access offset and adjust it so that
- * it points to the beginning of a Unicode character.
- * The offset that is passed in points to
- * any code unit of a code point
- * and will point to the first code unit after
- * the macro invocation.
- * Never increments the offset.
+ * Advance the string offset from one code point boundary to the n-th next one,
+ * i.e., move forward by n code points.
+ * (Post-incrementing iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param i string offset, i>10)+0xd7c0); \
- (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
- } else /* not enough space */ { \
- (s)[(i)++]=UTF_ERROR_VALUE; \
- } \
- } else /* c>0x10ffff, write error value */ { \
- (s)[(i)++]=UTF_ERROR_VALUE; \
- } \
-}
-
-#define UTF16_FWD_1_SAFE(s, i, length) { \
- if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
- ++(i); \
- } \
-}
-
-#define UTF16_FWD_N_SAFE(s, i, length, n) { \
+#define U16_FWD_N(s, i, length, n) { \
int32_t __N=(n); \
while(__N>0 && (i)<(length)) { \
- UTF16_FWD_1_SAFE(s, i, length); \
+ U16_FWD_1(s, i, length); \
--__N; \
} \
}
-#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
- if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to the trail surrogate of a surrogate pair,
+ * then the offset is decremented.
+ * Otherwise, it is not modified.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_SET_CP_START
+ * @draft ICU 2.4
+ */
+#define U16_SET_CP_START_UNSAFE(s, i) { \
+ if(U16_IS_TRAIL((s)[i])) { \
+ --(i); \
+ } \
+}
+
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to the trail surrogate of a surrogate pair,
+ * then the offset is decremented.
+ * Otherwise, it is not modified.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<=i
+ * @see U16_SET_CP_START_UNSAFE
+ * @draft ICU 2.4
+ */
+#define U16_SET_CP_START(s, start, i) { \
+ if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
}
/* definitions with backward iteration -------------------------------------- */
-/*
- * all the macros that go backward assume that
- * the valid buffer range starts at offset 0
- * and that the initial offset is 0(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
+ --(i); \
+ (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
+ } \
+ } \
+}
+
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_BACK_1
+ * @draft ICU 2.4
+ */
+#define U16_BACK_1_UNSAFE(s, i) { \
+ if(U16_IS_TRAIL((s)[--(i)])) { \
--(i); \
} \
}
-#define UTF16_BACK_N_UNSAFE(s, i, n) { \
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<i
+ * @see U16_BACK_1_UNSAFE
+ * @draft ICU 2.4
+ */
+#define U16_BACK_1(s, start, i) { \
+ if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
+ --(i); \
+ } \
+}
+
+/**
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @param n number of code points to skip
+ * @see U16_BACK_N
+ * @draft ICU 2.4
+ */
+#define U16_BACK_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
while(__N>0) { \
- UTF16_BACK_1_UNSAFE(s, i); \
+ U16_BACK_1_UNSAFE(s, i); \
--__N; \
} \
}
/**
- * Set a random-access offset and adjust it so that
- * it points after the end of a Unicode character.
- * The offset that is passed in points behind
- * any code unit of a code point
- * and will point behind the last code unit after
- * the macro invocation.
- * Never decrements the offset.
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param i string offset, i0 && (i)>(start)) { \
+ U16_BACK_1(s, start, i); \
+ --__N; \
+ } \
+}
+
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind the lead surrogate of a surrogate pair,
+ * then the offset is incremented.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-16.
+ *
+ * @param s const UChar * string
+ * @param i string offset
+ * @see U16_SET_CP_LIMIT
+ * @draft ICU 2.4
+ */
+#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
+ if(U16_IS_LEAD((s)[(i)-1])) { \
++(i); \
} \
}
-/* safe versions with error-checking and optional regularity-checking */
-
-#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
- (c)=(s)[--(i)]; \
- if(UTF_IS_SECOND_SURROGATE(c)) { \
- uint16_t __c2; \
- if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
- --(i); \
- (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
- /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
- } else if(strict) {\
- /* unmatched second surrogate */ \
- (c)=UTF_ERROR_VALUE; \
- } \
- } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
- /* unmatched first surrogate or other non-character */ \
- (c)=UTF_ERROR_VALUE; \
- } \
-}
-
-#define UTF16_BACK_1_SAFE(s, start, i) { \
- if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
- --(i); \
- } \
-}
-
-#define UTF16_BACK_N_SAFE(s, start, i, n) { \
- int32_t __N=(n); \
- while(__N>0 && (i)>(start)) { \
- UTF16_BACK_1_SAFE(s, start, i); \
- --__N; \
- } \
-}
-
-#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
- if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind the lead surrogate of a surrogate pair,
+ * then the offset is incremented.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * @param s const UChar * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<=i<=length
+ * @param length string length
+ * @see U16_SET_CP_LIMIT_UNSAFE
+ * @draft ICU 2.4
+ */
+#define U16_SET_CP_LIMIT(s, start, i, length) { \
+ if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
}
diff --git a/icu4c/source/common/unicode/utf32.h b/icu4c/source/common/unicode/utf32.h
index 7b3a1da2ce7..eaafb2b3d6e 100644
--- a/icu4c/source/common/unicode/utf32.h
+++ b/icu4c/source/common/unicode/utf32.h
@@ -14,146 +14,10 @@
* created by: Markus W. Scherer
*/
/**
-* \file
-* \brief C API: UTF-32 macros
-*
-* This file defines macros to deal with UTF-32 code units and code points.
-* Signatures and semantics are the same as for the similarly named macros
-* in utf16.h.
-* utf32.h is included by utf.h after unicode/umachine.h
-* and some common definitions.
-* Usage: ICU coding guidelines for if() statements should be followed when using these macros.
-* Compound statements (curly braces {}) must be used for if-else-while...
-* bodies and all macro statements should be terminated with semicolon.
-*/
-
-#ifndef __UTF32_H__
-#define __UTF32_H__
-
-/* internal definitions ----------------------------------------------------- */
-
-#define UTF32_IS_SAFE(c, strict) \
- (!(strict) ? \
- (uint32_t)(c)<=0x10ffff : \
- UTF_IS_UNICODE_CHAR(c))
-
-/*
- * For the semantics of all of these macros, see utf16.h.
- * The UTF-32 versions are trivial because any code point is
- * encoded using exactly one code unit.
+ * \file
+ * \brief C API: UTF-32 macros
+ *
+ * This file is deprecated and its contents moved to utf_old.h.
+ * See utf_old.h and Jitterbug 2150 and its discussion on the ICU mailing list
+ * in September 2002.
*/
-
-/* single-code point definitions -------------------------------------------- */
-
-/* classes of code unit values */
-#define UTF32_IS_SINGLE(uchar) 1
-#define UTF32_IS_LEAD(uchar) 0
-#define UTF32_IS_TRAIL(uchar) 0
-
-/* number of code units per code point */
-#define UTF32_NEED_MULTIPLE_UCHAR(c) 0
-#define UTF32_CHAR_LENGTH(c) 1
-#define UTF32_MAX_CHAR_LENGTH 1
-
-/* average number of code units compared to UTF-16 */
-#define UTF32_ARRAY_SIZE(size) (size)
-
-#define UTF32_GET_CHAR_UNSAFE(s, i, c) { \
- (c)=(s)[i]; \
-}
-
-#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
- (c)=(s)[i]; \
- if(!UTF32_IS_SAFE(c, strict)) { \
- (c)=UTF_ERROR_VALUE; \
- } \
-}
-
-/* definitions with forward iteration --------------------------------------- */
-
-#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) { \
- (c)=(s)[(i)++]; \
-}
-
-#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) { \
- (s)[(i)++]=(c); \
-}
-
-#define UTF32_FWD_1_UNSAFE(s, i) { \
- ++(i); \
-}
-
-#define UTF32_FWD_N_UNSAFE(s, i, n) { \
- (i)+=(n); \
-}
-
-#define UTF32_SET_CHAR_START_UNSAFE(s, i) { \
-}
-
-#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
- (c)=(s)[(i)++]; \
- if(!UTF32_IS_SAFE(c, strict)) { \
- (c)=UTF_ERROR_VALUE; \
- } \
-}
-
-#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) { \
- if((uint32_t)(c)<=0x10ffff) { \
- (s)[(i)++]=(c); \
- } else /* c>0x10ffff, write 0xfffd */ { \
- (s)[(i)++]=0xfffd; \
- } \
-}
-
-#define UTF32_FWD_1_SAFE(s, i, length) { \
- ++(i); \
-}
-
-#define UTF32_FWD_N_SAFE(s, i, length, n) { \
- if(((i)+=(n))>(length)) { \
- (i)=(length); \
- } \
-}
-
-#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \
-}
-
-/* definitions with backward iteration -------------------------------------- */
-
-#define UTF32_PREV_CHAR_UNSAFE(s, i, c) { \
- (c)=(s)[--(i)]; \
-}
-
-#define UTF32_BACK_1_UNSAFE(s, i) { \
- --(i); \
-}
-
-#define UTF32_BACK_N_UNSAFE(s, i, n) { \
- (i)-=(n); \
-}
-
-#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \
-}
-
-#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \
- (c)=(s)[--(i)]; \
- if(!UTF32_IS_SAFE(c, strict)) { \
- (c)=UTF_ERROR_VALUE; \
- } \
-}
-
-#define UTF32_BACK_1_SAFE(s, start, i) { \
- --(i); \
-}
-
-#define UTF32_BACK_N_SAFE(s, start, i, n) { \
- (i)-=(n); \
- if((i)<(start)) { \
- (i)=(start); \
- } \
-}
-
-#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) { \
-}
-
-#endif
diff --git a/icu4c/source/common/unicode/utf8.h b/icu4c/source/common/unicode/utf8.h
index be6644fa6d3..13f5d618ed1 100644
--- a/icu4c/source/common/unicode/utf8.h
+++ b/icu4c/source/common/unicode/utf8.h
@@ -15,23 +15,25 @@
*/
/**
-* \file
-* \brief C API: UTF-8 macros
-*
-* This file defines macros to deal with UTF-8 code units and code points.
-* Signatures and semantics are the same as for the similarly named macros
-* in utf16.h.
-* utf8.h is included by utf.h after unicode/umachine.h
-* and some common definitions.
-* Usage: ICU coding guidelines for if() statements should be followed when using these macros.
-* Compound statements (curly braces {}) must be used for if-else-while...
-* bodies and all macro statements should be terminated with semicolon.
-*/
-
+ * \file
+ * \brief C API: 8-bit Unicode handling macros
+ *
+ * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
+ * utf8.h is included by utf.h after unicode/umachine.h
+ * and some common definitions.
+ *
+ * For more information see utf.h and the ICU User Guide Strings chapter
+ * (http://oss.software.ibm.com/icu/userguide/).
+ *
+ * Usage:
+ * ICU coding guidelines for if() statements should be followed when using these macros.
+ * Compound statements (curly braces {}) must be used for if-else-while...
+ * bodies and all macro statements should be terminated with semicolon.
+ */
/* utf.h must be included first. */
#ifndef __UTF_H__
-# include "unicode/utf.h"
+# include "unicode/utf.h"
#endif
#ifndef __UTF8_H__
@@ -39,6 +41,12 @@
/* internal definitions ----------------------------------------------------- */
+/**
+ * \var utf8_countTrailBytes
+ * Internal array with numbers of trail bytes for any given byte used in
+ * lead byte position.
+ * @internal
+ */
#ifdef U_UTF8_IMPL
U_CAPI const uint8_t
utf8_countTrailBytes[256];
@@ -48,114 +56,166 @@ utf8_countTrailBytes[256];
#endif
/**
- * Count the trail bytes for a lead byte -
- * this macro should be used so that the assembler code
- * that is mentioned in utf_impl.c could be used here.
+ * Count the trail bytes for a UTF-8 lead byte.
+ * @internal
*/
-#define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte])
+#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte])
-/* use a macro here, too - there may be a simpler way with some machines */
-#define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
+/**
+ * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
+ * @internal
+ */
+#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
+/**
+ * Function for handling "next code point" with error-checking.
+ * @internal
+ */
U_CAPI UChar32 U_EXPORT2
-utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError);
+utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
+/**
+ * Function for handling "append code point" with error-checking.
+ * @internal
+ */
U_CAPI int32_t U_EXPORT2
-utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c);
+utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
+/**
+ * Function for handling "previous code point" with error-checking.
+ * @internal
+ */
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
+/**
+ * Function for handling "skip backward one code point" with error-checking.
+ * @internal
+ */
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
-/*
- * For the semantics of all of these macros, see utf16.h.
- * The UTF-8 macros favor sequences more the shorter they are.
- * Sometimes, only the single-byte case is covered by a macro,
- * while longer sequences are handled by a function call.
- */
-
/* single-code point definitions -------------------------------------------- */
-/** Is this this code point a single code unit (byte)? */
-#define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0)
-/** Is this this code unit the lead code unit (byte) of a code point? */
-#define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e)
-/** Is this this code unit a trailing code unit (byte) of a code point? */
-#define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80)
-
-/** Does this scalar Unicode value need multiple code units for storage? */
-#define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f)
+/**
+ * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @draft ICU 2.4
+ */
+#define U8_IS_SINGLE(c) (((c)&0x80)==0)
/**
- * Given the lead character, how many bytes are taken by this code point.
- * ICU does not deal with code points >0x10ffff
- * unless necessary for advancing in the byte stream.
- *
- * These length macros take into account that for values >0x10ffff
- * the "safe" append macros would write the error code point 0xffff
- * with 3 bytes.
- * Code point comparisons need to be in uint32_t because UChar32
- * may be a signed type, and negative values must be recognized.
+ * Is this code unit (byte) a UTF-8 lead byte?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @draft ICU 2.4
*/
-#if 1
-# define UTF8_CHAR_LENGTH(c) \
- ((uint32_t)(c)<=0x7f ? 1 : \
- ((uint32_t)(c)<=0x7ff ? 2 : \
- ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \
- ) \
- )
-#else
-# define UTF8_CHAR_LENGTH(c) \
- ((uint32_t)(c)<=0x7f ? 1 : \
- ((uint32_t)(c)<=0x7ff ? 2 : \
- ((uint32_t)(c)<=0xffff ? 3 : \
- ((uint32_t)(c)<=0x10ffff ? 4 : \
- ((uint32_t)(c)<=0x3ffffff ? 5 : \
- ((uint32_t)(c)<=0x7fffffff ? 6 : 3) \
- ) \
- ) \
+#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
+
+/**
+ * Is this code unit (byte) a UTF-8 trail byte?
+ * @param c 8-bit code unit (byte)
+ * @return TRUE or FALSE
+ * @draft ICU 2.4
+ */
+#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
+
+/**
+ * How many code units (bytes) are used for the UTF-8 encoding
+ * of this Unicode code point?
+ * @param c 32-bit code point
+ * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
+ * @draft ICU 2.4
+ */
+#define U8_LENGTH(c) \
+ ((uint32_t)(c)<=0x7f ? 1 : \
+ ((uint32_t)(c)<=0x7ff ? 2 : \
+ ((uint32_t)(c)<=0xd7ff ? 3 : \
+ ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
+ ((uint32_t)(c)<=0xffff ? 3 : 4)\
) \
) \
- )
-#endif
+ ) \
+ )
-/** The maximum number of bytes per code point */
-#define UTF8_MAX_CHAR_LENGTH 4
+/**
+ * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
+ * @return 4
+ * @draft ICU 2.4
+ */
+#define U8_MAX_LENGTH 4
-/** Average number of code units compared to UTF-16 */
-#define UTF8_ARRAY_SIZE(size) ((5*(size))/2)
-
-#define UTF8_GET_CHAR_UNSAFE(s, i, c) { \
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * The offset may point to either the lead byte or one of the trail bytes
+ * for a code point, in which case the macro will read all of the bytes
+ * for the code point.
+ * The result is undefined if the offset points to an illegal UTF-8
+ * byte sequence.
+ * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U8_GET
+ * @draft ICU 2.4
+ */
+#define U8_GET_UNSAFE(s, i, c) { \
int32_t __I=(int32_t)(i); \
- UTF8_SET_CHAR_START_UNSAFE(s, __I); \
- UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \
+ U8_SET_CP_START_UNSAFE(s, __I); \
+ U8_NEXT_UNSAFE(s, __I, c); \
}
-#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
+/**
+ * Get a code point from a string at a random-access offset,
+ * without changing the offset.
+ * The offset may point to either the lead byte or one of the trail bytes
+ * for a code point, in which case the macro will read all of the bytes
+ * for the code point.
+ * If the offset points to an illegal UTF-8 byte sequence, then
+ * c is set to a negative value.
+ * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
+ *
+ * @param s const uint8_t * string
+ * @param start starting string offset
+ * @param i string offset, start<=i instead of <0>.
- * The strict checks also check for non-characters.
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * The result is undefined if the offset points to a trail byte
+ * or an illegal UTF-8 sequence.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U8_NEXT
+ * @draft ICU 2.4
*/
-#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \
+#define U8_NEXT_UNSAFE(s, i, c) { \
(c)=(s)[(i)++]; \
if((uint8_t)((c)-0xc0)<0x35) { \
- uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
- UTF8_MASK_LEAD_BYTE(c, __count); \
+ uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \
+ U8_MASK_LEAD_BYTE(c, __count); \
switch(__count) { \
/* each following branch falls through to the next one */ \
case 3: \
@@ -170,7 +230,49 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
} \
}
-#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \
+/**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The offset may point to the lead byte of a multi-byte sequence,
+ * in which case the macro will read the whole sequence.
+ * If the offset points to a trail byte or an illegal UTF-8 sequence, then
+ * c is set to a negative value.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset, i=0x80) { \
+ if(U8_IS_LEAD(c)) { \
+ (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, -1); \
+ } else { \
+ (c)=U_SENTINEL; \
+ } \
+ } \
+}
+
+/**
+ * Append a code point to a string, overwriting 1 to 4 bytes.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
+ * Otherwise, the result is undefined.
+ *
+ * @param s uint8_t * string buffer
+ * @param i string offset
+ * @param c code point to append
+ * @see U8_APPEND
+ * @draft ICU 2.4
+ */
+#define U8_APPEND_UNSAFE(s, i, c) { \
if((uint32_t)(c)<=0x7f) { \
(s)[(i)++]=(uint8_t)(c); \
} else { \
@@ -189,74 +291,172 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
} \
}
-#define UTF8_FWD_1_UNSAFE(s, i) { \
- (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
-}
-
-#define UTF8_FWD_N_UNSAFE(s, i, n) { \
- int32_t __N=(n); \
- while(__N>0) { \
- UTF8_FWD_1_UNSAFE(s, i); \
- --__N; \
- } \
-}
-
-#define UTF8_SET_CHAR_START_UNSAFE(s, i) { \
- while(UTF8_IS_TRAIL((s)[i])) { --(i); } \
-}
-
-#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
- (c)=(s)[(i)++]; \
- if((c)>=0x80) { \
- if(UTF8_IS_LEAD(c)) { \
- (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict, NULL); \
- } else { \
- (c)=UTF8_ERROR_VALUE_1; \
- } \
- } \
-}
-
-#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \
+/**
+ * Append a code point to a string, overwriting 1 to 4 bytes.
+ * The offset points to the current end of the string contents
+ * and is advanced (post-increment).
+ * "Safe" macro, checks for a valid code point.
+ * If a non-ASCII code point is written, checks for sufficient space in the string.
+ * If the code point is not valid or trail bytes do not fit,
+ * then isError is set to TRUE.
+ *
+ * @param s uint8_t * string buffer
+ * @param i string offset, i(length)) { \
__count=(uint8_t)((length)-(i)); \
} \
- while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \
+ while(__count>0 && U8_IS_TRAIL((s)[i])) { \
++(i); \
--__count; \
} \
} \
}
-#define UTF8_FWD_N_SAFE(s, i, length, n) { \
+/**
+ * Advance the string offset from one code point boundary to the n-th next one,
+ * i.e., move forward by n code points.
+ * (Post-incrementing iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param n number of code points to skip
+ * @see U8_FWD_N
+ * @draft ICU 2.4
+ */
+#define U8_FWD_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
- while(__N>0 && (i)<(length)) { \
- UTF8_FWD_1_SAFE(s, i, length); \
+ while(__N>0) { \
+ U8_FWD_1_UNSAFE(s, i); \
--__N; \
} \
}
-#define UTF8_SET_CHAR_START_SAFE(s, start, i) { \
- if(UTF8_IS_TRAIL((s)[(i)])) { \
+/**
+ * Advance the string offset from one code point boundary to the n-th next one,
+ * i.e., move forward by n code points.
+ * (Post-incrementing iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset, i0 && (i)<(length)) { \
+ U8_FWD_1(s, i, length); \
+ --__N; \
+ } \
+}
+
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to a UTF-8 trail byte,
+ * then the offset is moved backward to the corresponding lead byte.
+ * Otherwise, it is not modified.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @see U8_SET_CP_START
+ * @draft ICU 2.4
+ */
+#define U8_SET_CP_START_UNSAFE(s, i) { \
+ while(U8_IS_TRAIL((s)[i])) { --(i); } \
+}
+
+/**
+ * Adjust a random-access offset to a code point boundary
+ * at the start of a code point.
+ * If the offset points to a UTF-8 trail byte,
+ * then the offset is moved backward to the corresponding lead byte.
+ * Otherwise, it is not modified.
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * @param s const uint8_t * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<=i
+ * @see U8_SET_CP_START_UNSAFE
+ * @draft ICU 2.4
+ */
+#define U8_SET_CP_START(s, start, i) { \
+ if(U8_IS_TRAIL((s)[(i)])) { \
(i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
} \
}
/* definitions with backward iteration -------------------------------------- */
-#define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a multi-byte sequence, then the macro will read
+ * the whole sequence.
+ * If the offset is behind a lead byte, then that itself
+ * will be returned as the code point.
+ * The result is undefined if the offset is behind an illegal UTF-8 sequence.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param c output UChar32 variable
+ * @see U8_PREV
+ * @draft ICU 2.4
+ */
+#define U8_PREV_UNSAFE(s, i, c) { \
(c)=(s)[--(i)]; \
- if(UTF8_IS_TRAIL(c)) { \
+ if(U8_IS_TRAIL(c)) { \
uint8_t __b, __count=1, __shift=6; \
\
/* c is a trail byte */ \
@@ -264,7 +464,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
for(;;) { \
__b=(s)[--(i)]; \
if(__b>=0xc0) { \
- UTF8_MASK_LEAD_BYTE(__b, __count); \
+ U8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
break; \
} else { \
@@ -276,57 +476,151 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
} \
}
-#define UTF8_BACK_1_UNSAFE(s, i) { \
- while(UTF8_IS_TRAIL((s)[--(i)])) {} \
-}
-
-#define UTF8_BACK_N_UNSAFE(s, i, n) { \
- int32_t __N=(n); \
- while(__N>0) { \
- UTF8_BACK_1_UNSAFE(s, i); \
- --__N; \
- } \
-}
-
-#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \
- UTF8_BACK_1_UNSAFE(s, i); \
- UTF8_FWD_1_UNSAFE(s, i); \
-}
-
-#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \
+/**
+ * Move the string offset from one code point boundary to the previous one
+ * and get the code point between them.
+ * (Pre-decrementing backward iteration.)
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * The input offset may be the same as the string length.
+ * If the offset is behind a multi-byte sequence, then the macro will read
+ * the whole sequence.
+ * If the offset is behind a lead byte that has no trail bytes after it,
+ * then c is set to a negative value; the lead byte is not returned by itself.
+ * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
+ *
+ * @param s const uint8_t * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<i
+ * @param length string length
+ * @param c output UChar32 variable, set to <0 in case of an error
+ * @see U8_PREV_UNSAFE
+ * @draft ICU 2.4
+ */
+#define U8_PREV(s, start, i, c) { \
(c)=(s)[--(i)]; \
if((c)>=0x80) { \
if((c)<=0xbf) { \
- (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
+ (c)=utf8_prevCharSafeBody(s, start, &(i), c, -1); \
} else { \
- (c)=UTF8_ERROR_VALUE_1; \
+ (c)=U_SENTINEL; \
} \
} \
}
-#define UTF8_BACK_1_SAFE(s, start, i) { \
- if(UTF8_IS_TRAIL((s)[--(i)])) { \
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @see U8_BACK_1
+ * @draft ICU 2.4
+ */
+#define U8_BACK_1_UNSAFE(s, i) { \
+ while(U8_IS_TRAIL((s)[--(i)])) {} \
+}
+
+/**
+ * Move the string offset from one code point boundary to the previous one.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * @param s const uint8_t * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<i
+ * @see U8_BACK_1_UNSAFE
+ * @draft ICU 2.4
+ */
+#define U8_BACK_1(s, start, i) { \
+ if(U8_IS_TRAIL((s)[--(i)])) { \
(i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
} \
}
-#define UTF8_BACK_N_SAFE(s, start, i, n) { \
+/**
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @param n number of code points to skip
+ * @see U8_BACK_N
+ * @draft ICU 2.4
+ */
+#define U8_BACK_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
- while(__N>0 && (i)>(start)) { \
- UTF8_BACK_1_SAFE(s, start, i); \
+ while(__N>0) { \
+ U8_BACK_1_UNSAFE(s, i); \
--__N; \
} \
}
-/*
- * Need to use UTF8_FWD_1_SAFE() because UTF8_BACK_1_SAFE()
- * may have started from the middle of the sequence and not checked
- * all trail bytes.
+/**
+ * Move the string offset from one code point boundary to the n-th one before it,
+ * i.e., move backward by n code points.
+ * (Pre-decrementing backward iteration.)
+ * The input offset may be the same as the string length.
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset, i0 && (i)>(start)) { \
+ U8_BACK_1(s, start, i); \
+ --__N; \
+ } \
+}
+
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind a partial multi-byte sequence,
+ * then the offset is incremented to behind the whole sequence.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Unsafe" macro, assumes well-formed UTF-8.
+ *
+ * @param s const uint8_t * string
+ * @param i string offset
+ * @see U8_SET_CP_LIMIT
+ * @draft ICU 2.4
+ */
+#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
+ U8_BACK_1_UNSAFE(s, i); \
+ U8_FWD_1_UNSAFE(s, i); \
+}
+
+/**
+ * Adjust a random-access offset to a code point boundary after a code point.
+ * If the offset is behind a partial multi-byte sequence,
+ * then the offset is incremented to behind the whole sequence.
+ * Otherwise, it is not modified.
+ * The input offset may be the same as the string length.
+ * "Safe" macro, checks for illegal sequences and for string boundaries.
+ *
+ * @param s const uint8_t * string
+ * @param start starting string offset (usually 0)
+ * @param i string offset, start<=i<=length
+ * @param length string length
+ * @see U8_SET_CP_LIMIT_UNSAFE
+ * @draft ICU 2.4
+ */
+#define U8_SET_CP_LIMIT(s, start, i, length) { \
if((start)<(i) && (i)<(length)) { \
- UTF8_BACK_1_SAFE(s, start, i); \
- UTF8_FWD_1_SAFE(s, i, length); \
+ U8_BACK_1(s, start, i); \
+ U8_FWD_1(s, i, length); \
} \
}
diff --git a/icu4c/source/common/unicode/utf_old.h b/icu4c/source/common/unicode/utf_old.h
new file mode 100644
index 00000000000..d859a54a0db
--- /dev/null
+++ b/icu4c/source/common/unicode/utf_old.h
@@ -0,0 +1,1153 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2002, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: utf_old.h
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2002sep21
+* created by: Markus W. Scherer
+*/
+
+/**
+ * \file
+ * The macros in utf_old.h are all deprecated and their use discouraged.
+ * Some of the design principles behind the set of UTF macros
+ * have changed or proved impractical.
+ * Almost all of the old "UTF macros" are at least renamed.
+ * If you are looking for a new equivalent to an old macro, please see the
+ * comment at the old one.
+ *
+ * utf_old.h is included by utf.h after unicode/umachine.h
+ * and some common definitions, to not break old code.
+ *
+ * Brief summary of reasons for deprecation:
+ * - Switch on UTF_SIZE (selection of UTF-8/16/32 default string processing)
+ * was impractical.
+ * - Switch on UTF_SAFE etc. (selection of unsafe/safe/strict default string processing)
+ * was of little use and impractical.
+ * - Whole classes of macros became obsolete outside of the UTF_SIZE/UTF_SAFE
+ * selection framework: UTF32_ macros (all trivial)
+ * and UTF_ default and intermediate macros (all aliases).
+ * - The selection framework also caused many macro aliases.
+ * - Change in Unicode standard: "irregular" sequences (3.0) became illegal (3.2).
+ * - Change of language in Unicode standard:
+ * Growing distinction between internal x-bit Unicode strings and external UTF-x
+ * forms, with the former more lenient.
+ * Suggests renaming of UTF16_ macros to U16_.
+ * - The prefix "UTF_" without a width number confused some users.
+ * - "Safe" append macros needed the addition of an error indicator output.
+ * - "Safe" UTF-8 macros used legitimate (if rarely used) code point values
+ * to indicate error conditions.
+ * - The use of the "_CHAR" infix for code point operations confused some users.
+ *
+ * More details:
+ *
+ * Until ICU 2.2, utf.h theoretically allowed to choose among UTF-8/16/32
+ * for string processing, and among unsafe/safe/strict default macros for that.
+ *
+ * It proved nearly impossible to write non-trivial, high-performance code
+ * that is UTF-generic.
+ * Unsafe default macros would be dangerous for default string processing,
+ * and the main reason for the "strict" versions disappeared:
+ * Between Unicode 3.0 and 3.2 all "irregular" UTF-8 sequences became illegal.
+ * The only other conditions that "strict" checked for were non-characters,
+ * which are valid during processing. Only during text input/output should they
+ * be checked, and at that time other well-formedness checks may be
+ * necessary or useful as well.
+ * This can still be done by using U16_NEXT and U_IS_UNICODE_NONCHAR
+ * or U_IS_UNICODE_CHAR.
+ *
+ * The old UTF8_..._SAFE macros also used some normal Unicode code points
+ * to indicate malformed sequences.
+ * The new U8_ macros without suffix use negative values instead.
+ *
+ * The entire contents of utf32.h was moved here without replacement
+ * because all those macros were trivial and
+ * were meaningful only in the framework of choosing the UTF size.
+ *
+ * See Jitterbug 2150 and its discussion on the ICU mailing list
+ * in September 2002.
+ *
+ *
+ *
+ * Obsolete part of pre-ICU 2.4 utf.h file documentation:
+ *
+ * The original concept for these files was for ICU to allow
+ * in principle to set which UTF (UTF-8/16/32) is used internally
+ * by defining UTF_SIZE to either 8, 16, or 32. utf.h would then define the UChar type
+ * accordingly. UTF-16 was the default.
+ *
+ * This concept has been abandoned.
+ * A lot of the ICU source code -- especially low-level code like
+ * conversion, normalization, and collation -- assumes UTF-16;
+ * utf.h enforces the default of UTF-16.
+ * The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.
+ *
+ * Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
+ * UChar is defined to be exactly wchar_t, otherwise uint16_t.
+ *
+ * UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
+ * Unicode code point (Unicode scalar value, 0..0x10ffff).
+ * Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
+ * the definition of UChar. For details see the documentation for UChar32 itself.
+ *
+ * utf.h also defines a number of C macros for handling single Unicode code points and
+ * for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual
+ * implementations of those macros and then aliases one set of them (for UTF-16) for general use.
+ * The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while
+ * the general alias macros always begin with UTF_...
+ *
+ * Many string operations can be done with or without error checking.
+ * Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe"
+ * ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause
+ * program failures if the strings are not well-formed. The safe macros have an additional, boolean
+ * parameter "strict". If strict is FALSE, then only illegal sequences are detected.
+ * Otherwise, irregular sequences and non-characters are detected as well (like single surrogates).
+ * Safe macros return special error code points for illegal/irregular sequences:
+ * Typically, U+ffff, or values that would result in a code unit sequence of the same length
+ * as the erroneous input sequence.
+ * Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and
+ * they do not have start/length parameters for boundary checking.
+ *
+ * Here, the macros are aliased in two steps:
+ * In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are
+ * aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures.
+ * Then, in a second step, the default, general alias macros are set to use either the unsafe or
+ * the safe/not strict (default) or the safe/strict macro;
+ * these general macros do not have a strictness parameter.
+ *
+ * It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict.
+ * The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for
+ * Unicode string handling in ICU! To select this, define UTF_SAFE, UTF_STRICT, or UTF_UNSAFE.
+ *
+ * For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix.
+ * Only in some cases it may be necessary to control the choice of macro directly and use a less generic alias.
+ * For example, if it can be assumed that a string is well-formed and the index will stay within the bounds,
+ * then the _UNSAFE version may be used.
+ * If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.
+ *
+ *
+ *
+ * @deprecated since ICU 2.4. Use the macros in utf.h, utf16.h, utf8.h instead.
+ */
+
+#ifndef __UTF_OLD_H__
+#define __UTF_OLD_H__
+
+/* Formerly utf.h, part 1 --------------------------------------------------- */
+
+/**
+ * Unicode string and array offset and index type.
+ * ICU always counts Unicode code units (UChars) for
+ * string offsets, indexes, and lengths, not Unicode code points.
+ *
+ * @deprecated Use int32_t directly. UTextOffset to be removed after 2003-mar.
+ */
+typedef int32_t UTextOffset;
+
+/** Number of bits in a Unicode string code unit - ICU uses 16-bit Unicode. @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF_SIZE 16
+
+/**
+ * The default choice for general Unicode string macros is to use the ..._SAFE macro implementations
+ * with strict=FALSE.
+ *
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF_SAFE
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#undef UTF_UNSAFE
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#undef UTF_STRICT
+
+/**
+ * UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8,
+ * which need 1 or 2 bytes in UTF-8:
+ * U+0015 = NAK = Negative Acknowledge, C0 control character
+ * U+009f = highest C1 control character
+ *
+ * These are used by UTF8_..._SAFE macros so that they can return an error value
+ * that needs the same number of code units (bytes) as were seen by
+ * a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().
+ *
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF8_ERROR_VALUE_1 0x15
+
+/**
+ * See documentation on UTF8_ERROR_VALUE_1 for details.
+ *
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF8_ERROR_VALUE_2 0x9f
+
+/**
+ * Error value for all UTFs. This code point value will be set by macros with error
+ * checking if an error is detected.
+ *
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF_ERROR_VALUE 0xffff
+
+/**
+ * Is a given 32-bit code an error value
+ * as returned by one of the macros for any UTF?
+ *
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF_IS_ERROR(c) \
+ (((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
+
+/**
+ * This is a combined macro: Is c a valid Unicode value _and_ not an error code?
+ *
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF_IS_VALID(c) \
+ (UTF_IS_UNICODE_CHAR(c) && \
+ (c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2)
+
+/**
+ * Is this code unit or code point a surrogate (U+d800..U+dfff)?
+ * @deprecated since ICU 2.4. Renamed to U_IS_SURROGATE and U16_IS_SURROGATE, see utf_old.h.
+ */
+#define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800)
+
+/**
+ * Is a given 32-bit code point a Unicode noncharacter?
+ *
+ * @deprecated since ICU 2.4. Renamed to U_IS_UNICODE_NONCHAR, see utf_old.h.
+ */
+#define UTF_IS_UNICODE_NONCHAR(c) \
+ ((c)>=0xfdd0 && \
+ ((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
+ (uint32_t)(c)<=0x10ffff)
+
+/**
+ * Is a given 32-bit value a Unicode code point value (0..U+10ffff)
+ * that can be assigned a character?
+ *
+ * Code points that are not characters include:
+ * - single surrogate code points (U+d800..U+dfff, 2048 code points)
+ * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
+ * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
+ * - the highest Unicode code point value is U+10ffff
+ *
+ * This means that all code points below U+d800 are character code points,
+ * and that boundary is tested first for performance.
+ *
+ * @deprecated since ICU 2.4. Renamed to U_IS_UNICODE_CHAR, see utf_old.h.
+ */
+#define UTF_IS_UNICODE_CHAR(c) \
+ ((uint32_t)(c)<0xd800 || \
+ ((uint32_t)(c)>0xdfff && \
+ (uint32_t)(c)<=0x10ffff && \
+ !UTF_IS_UNICODE_NONCHAR(c)))
+
+/* Formerly utf8.h ---------------------------------------------------------- */
+
+/**
+ * Count the trail bytes for a UTF-8 lead byte.
+ * Looks up utf8_countTrailBytes[] with the argument converted to uint8_t.
+ * The argument is parenthesized before the cast so that an expression
+ * argument (for example a+b) is converted as a whole and cannot index
+ * outside 0..255.
+ *
+ * @param leadByte lead byte value; evaluated once
+ * @deprecated since ICU 2.4. Renamed to U8_COUNT_TRAIL_BYTES, see utf_old.h.
+ */
+#define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)(leadByte)])
+
+/**
+ * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
+ * @deprecated since ICU 2.4. Renamed to U8_MASK_LEAD_BYTE, see utf_old.h.
+ */
+#define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
+
+/** Is this code point a single code unit (byte)? @deprecated since ICU 2.4. Renamed to U8_IS_SINGLE, see utf_old.h. */
+#define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0)
+/** Is this code unit the lead code unit (byte) of a code point? @deprecated since ICU 2.4. Renamed to U8_IS_LEAD, see utf_old.h. */
+#define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e)
+/** Is this code unit a trailing code unit (byte) of a code point? @deprecated since ICU 2.4. Renamed to U8_IS_TRAIL, see utf_old.h. */
+#define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80)
+
+/** Does this scalar Unicode value need multiple code units for storage? @deprecated since ICU 2.4. Use U8_LENGTH or test ((uint32_t)(c)>0x7f) instead, see utf_old.h. */
+#define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f)
+
+/**
+ * Given the lead character, how many bytes are taken by this code point.
+ * ICU does not deal with code points >0x10ffff
+ * unless necessary for advancing in the byte stream.
+ *
+ * These length macros take into account that for values >0x10ffff
+ * the UTF8_APPEND_CHAR_SAFE macros would write the error code point 0xffff
+ * with 3 bytes.
+ * Code point comparisons need to be in uint32_t because UChar32
+ * may be a signed type, and negative values must be recognized.
+ *
+ * @deprecated since ICU 2.4. Use U8_LENGTH instead, see utf_old.h.
+ */
+#if 1
+# define UTF8_CHAR_LENGTH(c) \
+ ((uint32_t)(c)<=0x7f ? 1 : \
+ ((uint32_t)(c)<=0x7ff ? 2 : \
+ ((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \
+ ) \
+ )
+#else
+# define UTF8_CHAR_LENGTH(c) \
+ ((uint32_t)(c)<=0x7f ? 1 : \
+ ((uint32_t)(c)<=0x7ff ? 2 : \
+ ((uint32_t)(c)<=0xffff ? 3 : \
+ ((uint32_t)(c)<=0x10ffff ? 4 : \
+ ((uint32_t)(c)<=0x3ffffff ? 5 : \
+ ((uint32_t)(c)<=0x7fffffff ? 6 : 3) \
+ ) \
+ ) \
+ ) \
+ ) \
+ )
+#endif
+
+/** The maximum number of bytes per code point. @deprecated since ICU 2.4. Renamed to U8_MAX_LENGTH, see utf_old.h. */
+#define UTF8_MAX_CHAR_LENGTH 4
+
+/** Average number of code units compared to UTF-16. @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF8_ARRAY_SIZE(size) ((5*(size))/2)
+
+/** @deprecated since ICU 2.4. Renamed to U8_GET_UNSAFE, see utf_old.h. */
+#define UTF8_GET_CHAR_UNSAFE(s, i, c) { \
+ int32_t __I=(int32_t)(i); \
+ UTF8_SET_CHAR_START_UNSAFE(s, __I); \
+ UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \
+}
+
+/** @deprecated since ICU 2.4. Use U8_GET instead, see utf_old.h. */
+#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
+ int32_t __I=(int32_t)(i); \
+ UTF8_SET_CHAR_START_SAFE(s, start, __I); \
+ UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \
+}
+
+/**
+ * Read the code point that starts at s[i] into c and advance i past it.
+ * Bytes 0xc0..0xf4 are treated as lead bytes: the lead byte is masked and
+ * up to three following bytes, as counted by UTF8_COUNT_TRAIL_BYTES, are
+ * folded in six bits at a time. Any other byte is returned in c unchanged.
+ * No bounds checks and no checks of the following bytes are performed.
+ *
+ * @param s byte array that is indexed with i
+ * @param i offset, modified (post-incremented once per byte consumed)
+ * @param c output code point; must be a modifiable lvalue
+ * @deprecated since ICU 2.4. Renamed to U8_NEXT_UNSAFE, see utf_old.h.
+ */
+#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \
+ (c)=(s)[(i)++]; \
+ /* (c)-0xc0 in 0..0x34, i.e. c in 0xc0..0xf4 */ \
+ if((uint8_t)((c)-0xc0)<0x35) { \
+ uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
+ UTF8_MASK_LEAD_BYTE(c, __count); \
+ switch(__count) { \
+ /* each following branch falls through to the next one */ \
+ case 3: \
+ (c)=((c)<<6)|((s)[(i)++]&0x3f); \
+ case 2: \
+ (c)=((c)<<6)|((s)[(i)++]&0x3f); \
+ case 1: \
+ (c)=((c)<<6)|((s)[(i)++]&0x3f); \
+ /* no other branches to optimize switch() */ \
+ break; \
+ } \
+ } \
+}
+
+/**
+ * Append code point c to the byte array s at offset i as UTF-8 and
+ * advance i by the 1..4 bytes written.
+ * No bounds check on s and no range check on c are performed.
+ *
+ * @param s byte array to write to
+ * @param i offset, modified (post-incremented once per byte written)
+ * @param c code point to encode
+ * @deprecated since ICU 2.4. Renamed to U8_APPEND_UNSAFE, see utf_old.h.
+ */
+#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \
+    if((uint32_t)(c)<=0x7f) { \
+        /* one byte */ \
+        (s)[(i)++]=(uint8_t)(c); \
+    } else if((uint32_t)(c)<=0x7ff) { \
+        /* two bytes */ \
+        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
+        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
+    } else if((uint32_t)(c)<=0xffff) { \
+        /* three bytes */ \
+        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
+        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
+        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
+    } else { \
+        /* four bytes */ \
+        (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
+        (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
+        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
+        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
+    } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_FWD_1_UNSAFE, see utf_old.h. */
+#define UTF8_FWD_1_UNSAFE(s, i) { \
+ (i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_FWD_N_UNSAFE, see utf_old.h. */
+#define UTF8_FWD_N_UNSAFE(s, i, n) { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ UTF8_FWD_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_START_UNSAFE, see utf_old.h. */
+#define UTF8_SET_CHAR_START_UNSAFE(s, i) { \
+ while(UTF8_IS_TRAIL((s)[i])) { --(i); } \
+}
+
+/**
+ * Read the code point that starts at s[i] into c and advance i.
+ * A byte below 0x80 is returned as is. A lead byte (UTF8_IS_LEAD) is
+ * handed to utf8_nextCharSafeBody(), which is defined elsewhere and
+ * receives &(i), so i must be an addressable lvalue. Any other byte
+ * yields UTF8_ERROR_VALUE_1.
+ *
+ * @param s byte array that is indexed with i
+ * @param i offset, modified
+ * @param length passed on to utf8_nextCharSafeBody() as int32_t
+ * @param c output code point; must be a modifiable lvalue
+ * @param strict passed on to utf8_nextCharSafeBody()
+ * @deprecated since ICU 2.4. Use U8_NEXT instead, see utf_old.h.
+ */
+#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
+ (c)=(s)[(i)++]; \
+ if((c)>=0x80) { \
+ if(UTF8_IS_LEAD(c)) { \
+ (c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict); \
+ } else { \
+ (c)=UTF8_ERROR_VALUE_1; \
+ } \
+ } \
+}
+
+/**
+ * Append code point c to the byte array s at offset i and update i.
+ * A value up to 0x7f is stored directly; i is not compared with length
+ * on that path. Everything else is delegated to utf8_appendCharSafeBody()
+ * (defined elsewhere), whose return value becomes the new i. NULL is
+ * passed as its last argument, so no error indication is requested.
+ *
+ * @param s byte array to write to
+ * @param i offset, modified
+ * @param length passed on to utf8_appendCharSafeBody() as int32_t
+ * @param c code point to append
+ * @deprecated since ICU 2.4. Use U8_APPEND instead, see utf_old.h.
+ */
+#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \
+ if((uint32_t)(c)<=0x7f) { \
+ (s)[(i)++]=(uint8_t)(c); \
+ } else { \
+ (i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, NULL); \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_FWD_1, see utf_old.h. */
+#define UTF8_FWD_1_SAFE(s, i, length) U8_FWD_1(s, i, length)
+
+/** @deprecated since ICU 2.4. Renamed to U8_FWD_N, see utf_old.h. */
+#define UTF8_FWD_N_SAFE(s, i, length, n) U8_FWD_N(s, i, length, n)
+
+/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_START, see utf_old.h. */
+#define UTF8_SET_CHAR_START_SAFE(s, start, i) U8_SET_CP_START(s, start, i)
+
+/**
+ * Move i back over the code point that ends just before s[i] and read
+ * that code point into c (pre-decrementing backward iteration).
+ * If the byte before i is a trail byte, the macro keeps stepping back,
+ * collecting six bits per trail byte, until it meets a byte >=0xc0, which
+ * is masked as the lead byte. There is no check against the start of the
+ * string and no check that the sequence is well-formed.
+ *
+ * @param s byte array that is indexed with i
+ * @param i offset, modified (pre-decremented once per byte consumed)
+ * @param c output code point; must be a modifiable lvalue
+ * @deprecated since ICU 2.4. Renamed to U8_PREV_UNSAFE, see utf_old.h.
+ */
+#define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \
+ (c)=(s)[--(i)]; \
+ if(UTF8_IS_TRAIL(c)) { \
+ uint8_t __b, __count=1, __shift=6; \
+\
+ /* c is a trail byte */ \
+ (c)&=0x3f; \
+ for(;;) { \
+ __b=(s)[--(i)]; \
+ if(__b>=0xc0) { \
+ /* lead byte: keep only its payload bits and stop */ \
+ UTF8_MASK_LEAD_BYTE(__b, __count); \
+ (c)|=(UChar32)__b<<__shift; \
+ break; \
+ } else { \
+ /* another byte below 0xc0: add its low six bits and continue */ \
+ (c)|=(UChar32)(__b&0x3f)<<__shift; \
+ ++__count; \
+ __shift+=6; \
+ } \
+ } \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_BACK_1_UNSAFE, see utf_old.h. */
+#define UTF8_BACK_1_UNSAFE(s, i) { \
+ while(UTF8_IS_TRAIL((s)[--(i)])) {} \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_BACK_N_UNSAFE, see utf_old.h. */
+#define UTF8_BACK_N_UNSAFE(s, i, n) { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ UTF8_BACK_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_LIMIT_UNSAFE, see utf_old.h. */
+#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \
+ UTF8_BACK_1_UNSAFE(s, i); \
+ UTF8_FWD_1_UNSAFE(s, i); \
+}
+
+/**
+ * Move i back by one byte and read the code point ending there into c.
+ * A byte below 0x80 is returned as is. A byte in 0x80..0xbf is handed to
+ * utf8_prevCharSafeBody(), which is defined elsewhere and receives &(i),
+ * so i must be an addressable lvalue. A byte of 0xc0 or above yields
+ * UTF8_ERROR_VALUE_1.
+ *
+ * @param s byte array that is indexed with i
+ * @param start passed on to utf8_prevCharSafeBody()
+ * @param i offset, modified
+ * @param c output code point; must be a modifiable lvalue
+ * @param strict passed on to utf8_prevCharSafeBody()
+ * @deprecated since ICU 2.4. Use U8_PREV instead, see utf_old.h.
+ */
+#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \
+ (c)=(s)[--(i)]; \
+ if((c)>=0x80) { \
+ if((c)<=0xbf) { \
+ (c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
+ } else { \
+ (c)=UTF8_ERROR_VALUE_1; \
+ } \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U8_BACK_1, see utf_old.h. */
+#define UTF8_BACK_1_SAFE(s, start, i) U8_BACK_1(s, start, i)
+
+/** @deprecated since ICU 2.4. Renamed to U8_BACK_N, see utf_old.h. */
+#define UTF8_BACK_N_SAFE(s, start, i, n) U8_BACK_N(s, start, i, n)
+
+/** @deprecated since ICU 2.4. Renamed to U8_SET_CP_LIMIT, see utf_old.h. */
+#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) U8_SET_CP_LIMIT(s, start, i, length)
+
+/* Formerly utf16.h --------------------------------------------------------- */
+
+/** Is uchar a first/lead surrogate? @deprecated since ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h. */
+#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)
+
+/** Is uchar a second/trail surrogate? @deprecated since ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h. */
+#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)
+
+/** Assuming c is a surrogate, is it a first/lead surrogate? @deprecated since ICU 2.4. Renamed to U_IS_SURROGATE_LEAD and U16_IS_SURROGATE_LEAD, see utf_old.h. */
+#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)
+
+/** Helper constant for UTF16_GET_PAIR_VALUE. @deprecated since ICU 2.4. Renamed to U16_SURROGATE_OFFSET, see utf_old.h. */
+#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
+
+/** Get the UTF-32 value from the surrogate code units. @deprecated since ICU 2.4. Renamed to U16_GET_SUPPLEMENTARY, see utf_old.h. */
+#define UTF16_GET_PAIR_VALUE(first, second) \
+ (((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)
+
+/** @deprecated since ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */
+#define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
+
+/** @deprecated since ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */
+#define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
+
+/** @deprecated since ICU 2.4. Renamed to U16_LEAD, see utf_old.h. */
+#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary)
+
+/** @deprecated since ICU 2.4. Renamed to U16_TRAIL, see utf_old.h. */
+#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary)
+
+/** @deprecated since ICU 2.4. Renamed to U16_IS_SINGLE, see utf_old.h. */
+#define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar)
+
+/** @deprecated since ICU 2.4. Renamed to U16_IS_LEAD, see utf_old.h. */
+#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
+
+/** @deprecated since ICU 2.4. Renamed to U16_IS_TRAIL, see utf_old.h. */
+#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)
+
+/** Does this scalar Unicode value need multiple code units for storage? @deprecated since ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead, see utf_old.h. */
+#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
+
+/** @deprecated since ICU 2.4. Renamed to U16_LENGTH, see utf_old.h. */
+#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
+
+/** @deprecated since ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h. */
+#define UTF16_MAX_CHAR_LENGTH 2
+
+/** Average number of code units compared to UTF-16. @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF16_ARRAY_SIZE(size) (size)
+
+/**
+ * Get a single code point from an offset that points to any
+ * of the code units that belong to that code point.
+ * Assume 0<=i<length.
+ *
+ * If s[i] is a first (lead) surrogate it is combined with s[i+1];
+ * if it is a second (trail) surrogate it is combined with s[i-1].
+ * Neither neighbor is checked, and i itself is not modified.
+ *
+ * @param s UTF-16 array that is indexed with i
+ * @param i offset of any code unit of the code point
+ * @param c output code point; must be a modifiable lvalue
+ * @deprecated since ICU 2.4. Renamed to U16_GET_UNSAFE, see utf_old.h.
+ */
+#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
+    (c)=(s)[i]; \
+    if(UTF_IS_SURROGATE(c)) { \
+        if(UTF_IS_SURROGATE_FIRST(c)) { \
+            (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
+        } else { \
+            (c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
+        } \
+    } \
+}
+
+/**
+ * Get a single code point from an offset that points to any
+ * of the code units that belong to that code point, with checks.
+ * A surrogate is combined with its neighbor only if that neighbor lies
+ * inside start..length-1 and is the matching kind of surrogate.
+ * With strict set, an unmatched surrogate or any other value rejected by
+ * UTF_IS_UNICODE_CHAR yields UTF_ERROR_VALUE; without it such code units
+ * are returned unchanged. i itself is not modified.
+ *
+ * @param s UTF-16 array that is indexed with i
+ * @param start lowest offset that may be read
+ * @param i offset of any code unit of the code point
+ * @param length offset just past the last code unit that may be read
+ * @param c output code point; must be a modifiable lvalue
+ * @param strict if nonzero, report unmatched surrogates and noncharacters
+ * @deprecated since ICU 2.4. Use U16_GET instead, see utf_old.h.
+ */
+#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
+    (c)=(s)[i]; \
+    if(UTF_IS_SURROGATE(c)) { \
+        uint16_t __c2; \
+        if(UTF_IS_SURROGATE_FIRST(c)) { \
+            if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
+                (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
+                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
+            } else if(strict) {\
+                /* unmatched first surrogate */ \
+                (c)=UTF_ERROR_VALUE; \
+            } \
+        } else { \
+            if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
+                (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
+                /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
+            } else if(strict) {\
+                /* unmatched second surrogate */ \
+                (c)=UTF_ERROR_VALUE; \
+            } \
+        } \
+    } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
+        (c)=UTF_ERROR_VALUE; \
+    } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */
+#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
+ (c)=(s)[(i)++]; \
+ if(UTF_IS_FIRST_SURROGATE(c)) { \
+ (c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
+ } \
+}
+
+/**
+ * Append code point c to the UTF-16 array s at offset i and advance i:
+ * one code unit for c<=0xffff, otherwise a surrogate pair.
+ * No bounds check on s and no range check on c are performed.
+ *
+ * @param s UTF-16 array to write to
+ * @param i offset, modified (post-incremented once per unit written)
+ * @param c code point to append
+ * @deprecated since ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h.
+ */
+#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
+    if((uint32_t)(c)>0xffff) { \
+        /* supplementary: first surrogate, then second surrogate */ \
+        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
+        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+    } else { \
+        (s)[(i)++]=(uint16_t)(c); \
+    } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h. */
+#define UTF16_FWD_1_UNSAFE(s, i) { \
+ if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
+ ++(i); \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */
+#define UTF16_FWD_N_UNSAFE(s, i, n) { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ UTF16_FWD_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */
+#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
+ if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
+ --(i); \
+ } \
+}
+
+/**
+ * Read the code point that starts at s[i] into c and advance i.
+ * A first surrogate is combined with the following unit only if i is
+ * still below length and that unit is a second surrogate; i is then
+ * advanced over both. With strict set, an unmatched surrogate or any
+ * other value rejected by UTF_IS_UNICODE_CHAR yields UTF_ERROR_VALUE;
+ * without it such code units are returned unchanged.
+ * The first read of s[i] is not checked against length.
+ *
+ * @param s UTF-16 array that is indexed with i
+ * @param i offset, modified
+ * @param length offset just past the last unit that may be paired
+ * @param c output code point; must be a modifiable lvalue
+ * @param strict if nonzero, report unmatched surrogates and noncharacters
+ * @deprecated since ICU 2.4. Use U16_NEXT instead, see utf_old.h.
+ */
+#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
+ (c)=(s)[(i)++]; \
+ if(UTF_IS_FIRST_SURROGATE(c)) { \
+ uint16_t __c2; \
+ if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
+ ++(i); \
+ (c)=UTF16_GET_PAIR_VALUE((c), __c2); \
+ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
+ } else if(strict) {\
+ /* unmatched first surrogate */ \
+ (c)=UTF_ERROR_VALUE; \
+ } \
+ } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
+ /* unmatched second surrogate or other non-character */ \
+ (c)=UTF_ERROR_VALUE; \
+ } \
+}
+
+/**
+ * Append code point c to the UTF-16 array s at offset i and advance i.
+ * - c<=0xffff: one code unit is stored; i is not compared with length
+ *   on this path.
+ * - c<=0x10ffff: a surrogate pair is stored if (i)+1<(length); otherwise
+ *   the single unit UTF_ERROR_VALUE is stored, again without comparing
+ *   i with length.
+ * - larger c: the single unit UTF_ERROR_VALUE is stored.
+ * The macro has no error output parameter.
+ *
+ * @param s UTF-16 array to write to
+ * @param i offset, modified (post-incremented once per unit written)
+ * @param length capacity used only for the surrogate-pair check
+ * @param c code point to append
+ * @deprecated since ICU 2.4. Use U16_APPEND instead, see utf_old.h.
+ */
+#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
+ if((uint32_t)(c)<=0xffff) { \
+ (s)[(i)++]=(uint16_t)(c); \
+ } else if((uint32_t)(c)<=0x10ffff) { \
+ if((i)+1<(length)) { \
+ (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
+ (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
+ } else /* not enough space */ { \
+ (s)[(i)++]=UTF_ERROR_VALUE; \
+ } \
+ } else /* c>0x10ffff, write error value */ { \
+ (s)[(i)++]=UTF_ERROR_VALUE; \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */
+#define UTF16_FWD_1_SAFE(s, i, length) U16_FWD_1(s, i, length)
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */
+#define UTF16_FWD_N_SAFE(s, i, length, n) U16_FWD_N(s, i, length, n)
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */
+#define UTF16_SET_CHAR_START_SAFE(s, start, i) U16_SET_CP_START(s, start, i)
+
+/** @deprecated since ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */
+#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
+ (c)=(s)[--(i)]; \
+ if(UTF_IS_SECOND_SURROGATE(c)) { \
+ (c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h. */
+#define UTF16_BACK_1_UNSAFE(s, i) { \
+ if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
+ --(i); \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */
+#define UTF16_BACK_N_UNSAFE(s, i, n) { \
+ int32_t __N=(n); \
+ while(__N>0) { \
+ UTF16_BACK_1_UNSAFE(s, i); \
+ --__N; \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h. */
+#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
+ if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
+ ++(i); \
+ } \
+}
+
+/**
+ * Move i back by one code unit and read the code point ending there
+ * into c (pre-decrementing backward iteration).
+ * A second surrogate is combined with the preceding unit only if i is
+ * still above start and that unit is a first surrogate; i is then moved
+ * back over both. With strict set, an unmatched surrogate or any other
+ * value rejected by UTF_IS_UNICODE_CHAR yields UTF_ERROR_VALUE; without
+ * it such code units are returned unchanged.
+ * The first read of s[--i] is not checked against start.
+ *
+ * @param s UTF-16 array that is indexed with i
+ * @param start lowest offset that may be read for pairing
+ * @param i offset, modified
+ * @param c output code point; must be a modifiable lvalue
+ * @param strict if nonzero, report unmatched surrogates and noncharacters
+ * @deprecated since ICU 2.4. Use U16_PREV instead, see utf_old.h.
+ */
+#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
+ (c)=(s)[--(i)]; \
+ if(UTF_IS_SECOND_SURROGATE(c)) { \
+ uint16_t __c2; \
+ if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
+ --(i); \
+ (c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
+ /* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
+ } else if(strict) {\
+ /* unmatched second surrogate */ \
+ (c)=UTF_ERROR_VALUE; \
+ } \
+ } else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
+ /* unmatched first surrogate or other non-character */ \
+ (c)=UTF_ERROR_VALUE; \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */
+#define UTF16_BACK_1_SAFE(s, start, i) U16_BACK_1(s, start, i)
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */
+#define UTF16_BACK_N_SAFE(s, start, i, n) U16_BACK_N(s, start, i, n)
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. */
+#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) U16_SET_CP_LIMIT(s, start, i, length)
+
+/* Formerly utf32.h --------------------------------------------------------- */
+
+/*
+* Old documentation:
+*
+* This file defines macros to deal with UTF-32 code units and code points.
+* Signatures and semantics are the same as for the similarly named macros
+* in utf16.h.
+* utf32.h is included by utf.h after unicode/umachine.h
+* and some common definitions.
+* Usage: ICU coding guidelines for if() statements should be followed when using these macros.
+* Compound statements (curly braces {}) must be used for if-else-while...
+* bodies and all macro statements should be terminated with semicolon.
+*/
+
+/* internal definitions ----------------------------------------------------- */
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_IS_SAFE(c, strict) \
+ (!(strict) ? \
+ (uint32_t)(c)<=0x10ffff : \
+ UTF_IS_UNICODE_CHAR(c))
+
+/*
+ * For the semantics of all of these macros, see utf16.h.
+ * The UTF-32 versions are trivial because any code point is
+ * encoded using exactly one code unit.
+ */
+
+/* single-code point definitions -------------------------------------------- */
+
+/* classes of code unit values */
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_IS_SINGLE(uchar) 1
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_IS_LEAD(uchar) 0
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_IS_TRAIL(uchar) 0
+
+/* number of code units per code point */
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_NEED_MULTIPLE_UCHAR(c) 0
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_CHAR_LENGTH(c) 1
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_MAX_CHAR_LENGTH 1
+
+/* average number of code units compared to UTF-16 */
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_ARRAY_SIZE(size) (size)
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_GET_CHAR_UNSAFE(s, i, c) { \
+ (c)=(s)[i]; \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
+ (c)=(s)[i]; \
+ if(!UTF32_IS_SAFE(c, strict)) { \
+ (c)=UTF_ERROR_VALUE; \
+ } \
+}
+
+/* definitions with forward iteration --------------------------------------- */
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) { \
+ (c)=(s)[(i)++]; \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) { \
+ (s)[(i)++]=(c); \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_FWD_1_UNSAFE(s, i) { \
+ ++(i); \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_FWD_N_UNSAFE(s, i, n) { \
+ (i)+=(n); \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_SET_CHAR_START_UNSAFE(s, i) { \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
+ (c)=(s)[(i)++]; \
+ if(!UTF32_IS_SAFE(c, strict)) { \
+ (c)=UTF_ERROR_VALUE; \
+ } \
+}
+
+/**
+ * Store c at s[i] and advance i. A value whose uint32_t conversion is
+ * above 0x10ffff is replaced by 0xfffd. The length argument is not used.
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) { \
+    /* c>0x10ffff, write 0xfffd */ \
+    (s)[(i)++]=((uint32_t)(c)>0x10ffff) ? 0xfffd : (c); \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_FWD_1_SAFE(s, i, length) { \
+ ++(i); \
+}
+
+/**
+ * Advance i by n code units, but not beyond length.
+ * The string argument s is not used.
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF32_FWD_N_SAFE(s, i, length, n) { \
+    (i)+=(n); \
+    if((i)>(length)) { \
+        (i)=(length); \
+    } \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \
+}
+
+/* definitions with backward iteration -------------------------------------- */
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_PREV_CHAR_UNSAFE(s, i, c) { \
+ (c)=(s)[--(i)]; \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_BACK_1_UNSAFE(s, i) { \
+ --(i); \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_BACK_N_UNSAFE(s, i, n) { \
+ (i)-=(n); \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \
+ (c)=(s)[--(i)]; \
+ if(!UTF32_IS_SAFE(c, strict)) { \
+ (c)=UTF_ERROR_VALUE; \
+ } \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_BACK_1_SAFE(s, start, i) { \
+ --(i); \
+}
+
+/**
+ * Move i back by n code units, but not below start.
+ * The string argument s is not used.
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF32_BACK_N_SAFE(s, start, i, n) { \
+    if(((i)-=(n))<(start)) { \
+        (i)=(start); \
+    } \
+}
+
+/** @deprecated since ICU 2.4. Obsolete, see utf_old.h. */
+#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) { \
+}
+
+/* Formerly utf.h, part 2 --------------------------------------------------- */
+
+/**
+ * Estimate the number of code units for a string based on the number of UTF-16 code units.
+ *
+ * @deprecated since ICU 2.4. Obsolete, see utf_old.h.
+ */
+#define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size)
+
+/** @deprecated since ICU 2.4. Renamed to U16_GET_UNSAFE, see utf_old.h. */
+#define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c)
+
+/** @deprecated since ICU 2.4. Use U16_GET instead, see utf_old.h. */
+#define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_NEXT_UNSAFE, see utf_old.h. */
+#define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c)
+
+/** @deprecated since ICU 2.4. Use U16_NEXT instead, see utf_old.h. */
+#define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_APPEND_UNSAFE, see utf_old.h. */
+#define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c)
+
+/** @deprecated since ICU 2.4. Use U16_APPEND instead, see utf_old.h. */
+#define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_1_UNSAFE, see utf_old.h. */
+#define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i)
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_1, see utf_old.h. */
+#define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_N_UNSAFE, see utf_old.h. */
+#define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n)
+
+/** @deprecated since ICU 2.4. Renamed to U16_FWD_N, see utf_old.h. */
+#define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START_UNSAFE, see utf_old.h. */
+#define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i)
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_START, see utf_old.h. */
+#define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_PREV_UNSAFE, see utf_old.h. */
+#define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c)
+
+/** @deprecated since ICU 2.4. Use U16_PREV instead, see utf_old.h. */
+#define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_1_UNSAFE, see utf_old.h. */
+#define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i)
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_1, see utf_old.h. */
+#define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_N_UNSAFE, see utf_old.h. */
+#define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n)
+
+/** @deprecated since ICU 2.4. Renamed to U16_BACK_N, see utf_old.h. */
+#define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n)
+
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT_UNSAFE, see utf_old.h. */
+#define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
+
+/** @deprecated since ICU 2.4. Renamed to U16_SET_CP_LIMIT, see utf_old.h. */
+#define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)
+
+/* Define default macros (UTF-16 "safe") ------------------------------------ */
+
+/**
+ * Does this code unit alone encode a code point (BMP, not a surrogate)?
+ * Same as UTF16_IS_SINGLE.
+ * @deprecated since ICU 2.4. Renamed to U_IS_SINGLE and U16_IS_SINGLE, see utf_old.h.
+ */
+#define UTF_IS_SINGLE(uchar) U16_IS_SINGLE(uchar)
+
+/**
+ * Is this code unit the first one of several (a lead surrogate)?
+ * Same as UTF16_IS_LEAD.
+ * @deprecated since ICU 2.4. Renamed to U_IS_LEAD and U16_IS_LEAD, see utf_old.h.
+ */
+#define UTF_IS_LEAD(uchar) U16_IS_LEAD(uchar)
+
+/**
+ * Is this code unit one of several but not the first one (a trail surrogate)?
+ * Same as UTF16_IS_TRAIL.
+ * @deprecated since ICU 2.4. Renamed to U_IS_TRAIL and U16_IS_TRAIL, see utf_old.h.
+ */
+#define UTF_IS_TRAIL(uchar) U16_IS_TRAIL(uchar)
+
+/**
+ * Does this code point require multiple code units (is it a supplementary code point)?
+ * Same as UTF16_NEED_MULTIPLE_UCHAR.
+ * @deprecated since ICU 2.4. Use U16_LENGTH or test ((uint32_t)(c)>0xffff) instead.
+ */
+#define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c)
+
+/**
+ * How many code units are used to encode this code point (1 or 2)?
+ * Same as UTF16_CHAR_LENGTH.
+ * @deprecated since ICU 2.4. Renamed to U16_LENGTH, see utf_old.h.
+ */
+#define UTF_CHAR_LENGTH(c) U16_LENGTH(c)
+
+/**
+ * How many code units are used at most for any Unicode code point (2)?
+ * Same as UTF16_MAX_CHAR_LENGTH.
+ * @deprecated since ICU 2.4. Renamed to U16_MAX_LENGTH, see utf_old.h.
+ */
+#define UTF_MAX_CHAR_LENGTH U16_MAX_LENGTH
+
+/**
+ * Set c to the code point that contains the code unit i.
+ * i could point to the lead or the trail surrogate for the code point.
+ * i is not modified.
+ * Same as UTF16_GET_CHAR.
+ * \pre 0<=i= 0) {
+ return (UChar *)string + index;
+ } else {
+ return NULL;
}
-
- if (single)
- {
- const UChar *matchItr;
- const UChar *strItr;
-
- for (strItr = string; *strItr; strItr++)
- {
- for (matchItr = matchSet; *matchItr; matchItr++)
- {
- if (*matchItr == *strItr)
- {
- return (UChar *)strItr;
- }
- }
- }
- }
- else
- {
- int32_t matchItr;
- int32_t strItr;
- UChar32 stringCh, matchSetCh;
- int32_t stringLen = u_strlen(string);
-
- for (strItr = 0; strItr < stringLen; strItr++)
- {
- UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
- for (matchItr = 0; matchItr < matchLen; matchItr++)
- {
- UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
- if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
- || string[strItr] == UTF_ERROR_VALUE
- || (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
- {
- return (UChar *)string + strItr;
- }
- }
- }
- }
-
- /* Didn't find it. */
- return NULL;
}
/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
U_CAPI int32_t U_EXPORT2
u_strcspn(const UChar *string, const UChar *matchSet)
{
- const UChar *foundStr = u_strpbrk(string, matchSet);
- if (foundStr == NULL)
- {
- return u_strlen(string);
+ int32_t index = _matchFromSet(string, matchSet, TRUE);
+ if(index >= 0) {
+ return index;
+ } else {
+ return -index - 1; /* == u_strlen(string) */
}
- return foundStr - string;
}
/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
U_CAPI int32_t U_EXPORT2
u_strspn(const UChar *string, const UChar *matchSet)
{
- UBool single = TRUE;
- UBool match = TRUE;
- int32_t matchLen;
- int32_t retValue;
-
- for (matchLen = 0; matchSet[matchLen]; matchLen++)
- {
- if (!UTF_IS_SINGLE(matchSet[matchLen]))
- {
- single = FALSE;
- }
+ int32_t index = _matchFromSet(string, matchSet, FALSE);
+ if(index >= 0) {
+ return index;
+ } else {
+ return -index - 1; /* == u_strlen(string) */
}
-
- if (single)
- {
- const UChar *matchItr;
- const UChar *strItr;
-
- for (strItr = string; *strItr && match; strItr++)
- {
- match = FALSE;
- for (matchItr = matchSet; *matchItr; matchItr++)
- {
- if (*matchItr == *strItr)
- {
- match = TRUE;
- break;
- }
- }
- }
- retValue = strItr - string - (match == FALSE);
- }
- else
- {
- int32_t matchItr;
- int32_t strItr;
- UChar32 stringCh, matchSetCh;
- int32_t stringLen = u_strlen(string);
-
- for (strItr = 0; strItr < stringLen && match; strItr++)
- {
- match = FALSE;
- UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
- for (matchItr = 0; matchItr < matchLen; matchItr++)
- {
- UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
- if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
- || string[strItr] == UTF_ERROR_VALUE
- || (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
- {
- match = TRUE;
- break;
- }
- }
- }
- retValue = strItr - (match == FALSE);
- }
-
- /* Found a mismatch or didn't find it. */
- return retValue;
}
/* ----- Text manipulation functions --- */
diff --git a/icu4c/source/common/ustrtrns.c b/icu4c/source/common/ustrtrns.c
index 15b9576ecc5..df6da4263cc 100644
--- a/icu4c/source/common/ustrtrns.c
+++ b/icu4c/source/common/ustrtrns.c
@@ -228,7 +228,6 @@ u_strFromUTF8(UChar *dest,
int32_t index = 0;
int32_t reqLength = 0;
uint8_t* pSrc = (uint8_t*) src;
- UBool isError;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
@@ -249,8 +248,8 @@ u_strFromUTF8(UChar *dest,
if(ch <=0x7f){
*pDest++=(UChar)ch;
}else{
- ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError);
- if(isError){
+ ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
+ if(ch<0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}else if(ch<=0xFFFF){
@@ -272,8 +271,8 @@ u_strFromUTF8(UChar *dest,
if(ch <= 0x7f){
reqLength++;
}else{
- ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError);
- if(isError){
+ ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
+ if(ch<0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}
diff --git a/icu4c/source/common/utf_impl.c b/icu4c/source/common/utf_impl.c
index ce3eb9bc0ef..5ace9404c38 100644
--- a/icu4c/source/common/utf_impl.c
+++ b/icu4c/source/common/utf_impl.c
@@ -83,7 +83,7 @@ utf8_errorValue[6]={
};
U_CAPI UChar32 U_EXPORT2
-utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError) {
+utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
int32_t i=*pi;
uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
if((i)+count<=(length)) {
@@ -118,10 +118,11 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
illegal|=(trail&0xc0)^0x80;
break;
case 0:
- if(pIsError!=NULL) {
- *pIsError=TRUE;
+ if(strict>=0) {
+ return UTF8_ERROR_VALUE_1;
+ } else {
+ return U_SENTINEL;
}
- return UTF8_ERROR_VALUE_1;
/* no default branch to optimize switch() - all values are covered */
}
@@ -132,6 +133,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
* Starting with Unicode 3.0.1, non-shortest forms are illegal.
* Starting with Unicode 3.2, surrogate code points must not be
* encoded in UTF-8, and there are no irregular sequences any more.
+ *
+ * U8_ macros (new in ICU 2.4) return negative values for error conditions.
*/
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
@@ -145,21 +148,14 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
++(i);
--count;
}
- c=utf8_errorValue[errorCount-count];
- if(pIsError!=NULL) {
- *pIsError=TRUE;
+ if(strict>=0) {
+ c=utf8_errorValue[errorCount-count];
+ } else {
+ c=U_SENTINEL;
}
- } else if((strict) && UTF_IS_UNICODE_NONCHAR(c)) {
+ } else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {
/* strict: forbid non-characters like U+fffe */
c=utf8_errorValue[count];
- if(pIsError!=NULL) {
- *pIsError=TRUE;
- }
- } else {
- /* good result */
- if(pIsError!=NULL) {
- *pIsError=FALSE;
- }
}
} else /* too few bytes left */ {
/* error handling */
@@ -168,9 +164,10 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
++(i);
}
- c=utf8_errorValue[i-i0];
- if(pIsError!=NULL) {
- *pIsError=TRUE;
+ if(strict>=0) {
+ c=utf8_errorValue[i-i0];
+ } else {
+ c=U_SENTINEL;
}
}
*pi=i;
@@ -178,8 +175,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
}
U_CAPI int32_t U_EXPORT2
-utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
- if((c)<=0x7ff) {
+utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
+ if((uint32_t)(c)<=0x7ff) {
if((i)+1<(length)) {
(s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
@@ -187,7 +184,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
}
} else if((uint32_t)(c)<=0xffff) {
/* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
- if((i)+2<(length) && !UTF_IS_SURROGATE(c)) {
+ if((i)+2<(length) && !U_IS_SURROGATE(c)) {
(s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
@@ -203,18 +200,22 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
}
}
/* c>0x10ffff or not enough space, write an error value */
- length-=i;
- if(length>0) {
- int32_t offset;
- if(length>3) {
- length=3;
+ if(pIsError!=NULL) {
+ *pIsError=TRUE;
+ } else {
+ length-=i;
+ if(length>0) {
+ int32_t offset;
+ if(length>3) {
+ length=3;
+ }
+ s+=i;
+ offset=0;
+ c=utf8_errorValue[length-1];
+ UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
+ i=i+offset;
}
- s+=i;
- offset=0;
- c=utf8_errorValue[length-1];
- UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
- i=i+offset;
- }
+ }
return i;
}
@@ -229,7 +230,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
for(;;) {
if(i<=start) {
/* no lead byte at all */
- c=UTF8_ERROR_VALUE_1;
+ if(strict>=0) {
+ return UTF8_ERROR_VALUE_1;
+ } else {
+ return U_SENTINEL;
+ }
break;
}
@@ -250,7 +255,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
if(count>=4) {
count=3;
}
- c=utf8_errorValue[count];
+ if(strict>=0) {
+ c=utf8_errorValue[count];
+ } else {
+ c=U_SENTINEL;
+ }
} else {
/* exit with correct c */
}
@@ -260,9 +269,17 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
include the trail byte that we started with */
if(count=0) {
+ c=utf8_errorValue[count];
+ } else {
+ c=U_SENTINEL;
+ }
} else {
- c=UTF8_ERROR_VALUE_1;
+ if(strict>=0) {
+ c=UTF8_ERROR_VALUE_1;
+ } else {
+ c=U_SENTINEL;
+ }
}
}
break;
@@ -273,12 +290,20 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
shift+=6;
} else {
/* more than 5 trail bytes is illegal */
- c=UTF8_ERROR_VALUE_1;
+ if(strict>=0) {
+ c=UTF8_ERROR_VALUE_1;
+ } else {
+ c=U_SENTINEL;
+ }
break;
}
} else {
/* single-byte character precedes trailing bytes */
- c=UTF8_ERROR_VALUE_1;
+ if(strict>=0) {
+ c=UTF8_ERROR_VALUE_1;
+ } else {
+ c=U_SENTINEL;
+ }
break;
}
}
diff --git a/icu4c/source/test/cintltst/cucdtst.c b/icu4c/source/test/cintltst/cucdtst.c
index ee5bfed94ac..33bb27ea664 100644
--- a/icu4c/source/test/cintltst/cucdtst.c
+++ b/icu4c/source/test/cintltst/cucdtst.c
@@ -942,39 +942,39 @@ static void TestCodePoint(){
UChar32 c=codePoint[i];
log_verbose("Testing code unit value of \\u%4X\n", c);
if(i<6){
- if(!UTF_IS_SURROGATE(c)){
+ if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
- if(UTF_IS_UNICODE_CHAR(c)){
+ if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(UTF_IS_ERROR(c)){
log_err("ERROR: isError() failed for \\u%4X\n", c);
}
}else if(i >=6 && i<18){
- if(UTF_IS_SURROGATE(c)){
+ if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(!UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
- if(!UTF_IS_UNICODE_CHAR(c)){
+ if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(UTF_IS_ERROR(c)){
log_err("ERROR: isError() failed for \\u%4X\n", c);
}
}else if(i >=18 && i<20){
- if(UTF_IS_SURROGATE(c)){
+ if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
- if(!UTF_IS_UNICODE_CHAR(c)){
+ if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(!UTF_IS_ERROR(c)){
@@ -982,13 +982,13 @@ static void TestCodePoint(){
}
}
else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
- if(UTF_IS_SURROGATE(c)){
+ if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
- if(UTF_IS_UNICODE_CHAR(c)){
+ if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(!UTF_IS_ERROR(c)){
@@ -1018,7 +1018,7 @@ static void TestCharLength()
UBool multiple;
for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
UChar32 c=codepoint[i+1];
- if(UTF_CHAR_LENGTH(c) != codepoint[i]){
+ if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
log_err("The no: of code units for \\u%4X:- Expected: %d Got: %d", c, codepoint[i], UTF_CHAR_LENGTH(c));
}else{
log_verbose("The no: of code units for \\u%4X is %d", c, UTF_CHAR_LENGTH(c));
@@ -1457,7 +1457,6 @@ static void TestStringFunctions()
static void TestStringSearching()
{
- UChar ucharBuf[255];
const UChar testString[] = {0x0061, 0x0062, 0x0063, 0x0064, 0x0064, 0x0061, 0};
const UChar testSurrogateString[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0x0063, 0x0064, 0x0064, 0xdbff, 0xdfff, 0xdb00, 0xdf00, 0x0061, 0};
const UChar surrMatchSet1[] = {0xdbff, 0xdfff, 0};
@@ -1467,55 +1466,67 @@ static void TestStringSearching()
const UChar surrMatchSetBad[] = {0xdbff, 0x0061, 0};
const UChar surrMatchSetBad2[] = {0x0061, 0xdbff, 0};
const UChar surrMatchSetBad3[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0}; /* has partial surrogate */
+ const UChar
+ empty[] = { 0 },
+ a[] = { 0x61, 0 },
+ ab[] = { 0x61, 0x62, 0 },
+ ba[] = { 0x62, 0x61, 0 },
+ abcd[] = { 0x61, 0x62, 0x63, 0x64, 0 },
+ cd[] = { 0x63, 0x64, 0 },
+ dc[] = { 0x64, 0x63, 0 },
+ cdh[] = { 0x63, 0x64, 0x68, 0 },
+ f[] = { 0x66, 0 },
+ fg[] = { 0x66, 0x67, 0 },
+ gf[] = { 0x67, 0x66, 0 };
log_verbose("Testing u_strpbrk()");
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "a")) != &testString[0]) {
+ if (u_strpbrk(testString, a) != &testString[0]) {
log_err("u_strpbrk couldn't find first letter a.\n");
}
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "dc")) != &testString[2]) {
+ if (u_strpbrk(testString, dc) != &testString[2]) {
log_err("u_strpbrk couldn't find d or c.\n");
}
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cd")) != &testString[2]) {
+ if (u_strpbrk(testString, cd) != &testString[2]) {
log_err("u_strpbrk couldn't find c or d.\n");
}
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cdh")) != &testString[2]) {
+ if (u_strpbrk(testString, cdh) != &testString[2]) {
log_err("u_strpbrk couldn't find c, d or h.\n");
}
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "f")) != NULL) {
+ if (u_strpbrk(testString, f) != NULL) {
log_err("u_strpbrk didn't return NULL for \"f\".\n");
}
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "fg")) != NULL) {
+ if (u_strpbrk(testString, fg) != NULL) {
log_err("u_strpbrk didn't return NULL for \"fg\".\n");
}
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "gf")) != NULL) {
+ if (u_strpbrk(testString, gf) != NULL) {
log_err("u_strpbrk didn't return NULL for \"gf\".\n");
}
- if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "")) != NULL) {
+ if (u_strpbrk(testString, empty) != NULL) {
log_err("u_strpbrk didn't return NULL for \"\".\n");
}
log_verbose("Testing u_strpbrk() with surrogates");
- if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != &testSurrogateString[1]) {
+ if (u_strpbrk(testSurrogateString, a) != &testSurrogateString[1]) {
log_err("u_strpbrk couldn't find first letter a.\n");
}
- if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != &testSurrogateString[5]) {
+ if (u_strpbrk(testSurrogateString, dc) != &testSurrogateString[5]) {
log_err("u_strpbrk couldn't find d or c.\n");
}
- if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != &testSurrogateString[5]) {
+ if (u_strpbrk(testSurrogateString, cd) != &testSurrogateString[5]) {
log_err("u_strpbrk couldn't find c or d.\n");
}
- if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != &testSurrogateString[5]) {
+ if (u_strpbrk(testSurrogateString, cdh) != &testSurrogateString[5]) {
log_err("u_strpbrk couldn't find c, d or h.\n");
}
- if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != NULL) {
+ if (u_strpbrk(testSurrogateString, f) != NULL) {
log_err("u_strpbrk didn't return NULL for \"f\".\n");
}
- if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != NULL) {
+ if (u_strpbrk(testSurrogateString, fg) != NULL) {
log_err("u_strpbrk didn't return NULL for \"fg\".\n");
}
- if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != NULL) {
+ if (u_strpbrk(testSurrogateString, gf) != NULL) {
log_err("u_strpbrk didn't return NULL for \"gf\".\n");
}
if (u_strpbrk(testSurrogateString, surrMatchSet1) != &testSurrogateString[3]) {
@@ -1536,49 +1547,49 @@ static void TestStringSearching()
log_verbose("Testing u_strcspn()");
- if (u_strcspn(testString, u_uastrcpy(ucharBuf, "a")) != 0) {
+ if (u_strcspn(testString, a) != 0) {
log_err("u_strcspn couldn't find first letter a.\n");
}
- if (u_strcspn(testString, u_uastrcpy(ucharBuf, "dc")) != 2) {
+ if (u_strcspn(testString, dc) != 2) {
log_err("u_strcspn couldn't find d or c.\n");
}
- if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cd")) != 2) {
+ if (u_strcspn(testString, cd) != 2) {
log_err("u_strcspn couldn't find c or d.\n");
}
- if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cdh")) != 2) {
+ if (u_strcspn(testString, cdh) != 2) {
log_err("u_strcspn couldn't find c, d or h.\n");
}
- if (u_strcspn(testString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testString)) {
+ if (u_strcspn(testString, f) != u_strlen(testString)) {
log_err("u_strcspn didn't return NULL for \"f\".\n");
}
- if (u_strcspn(testString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testString)) {
+ if (u_strcspn(testString, fg) != u_strlen(testString)) {
log_err("u_strcspn didn't return NULL for \"fg\".\n");
}
- if (u_strcspn(testString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testString)) {
+ if (u_strcspn(testString, gf) != u_strlen(testString)) {
log_err("u_strcspn didn't return NULL for \"gf\".\n");
}
log_verbose("Testing u_strcspn() with surrogates");
- if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != 1) {
+ if (u_strcspn(testSurrogateString, a) != 1) {
log_err("u_strcspn couldn't find first letter a.\n");
}
- if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 5) {
+ if (u_strcspn(testSurrogateString, dc) != 5) {
log_err("u_strcspn couldn't find d or c.\n");
}
- if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 5) {
+ if (u_strcspn(testSurrogateString, cd) != 5) {
log_err("u_strcspn couldn't find c or d.\n");
}
- if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != 5) {
+ if (u_strcspn(testSurrogateString, cdh) != 5) {
log_err("u_strcspn couldn't find c, d or h.\n");
}
- if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testSurrogateString)) {
+ if (u_strcspn(testSurrogateString, f) != u_strlen(testSurrogateString)) {
log_err("u_strcspn didn't return NULL for \"f\".\n");
}
- if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testSurrogateString)) {
+ if (u_strcspn(testSurrogateString, fg) != u_strlen(testSurrogateString)) {
log_err("u_strcspn didn't return NULL for \"fg\".\n");
}
- if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testSurrogateString)) {
+ if (u_strcspn(testSurrogateString, gf) != u_strlen(testSurrogateString)) {
log_err("u_strcspn didn't return NULL for \"gf\".\n");
}
if (u_strcspn(testSurrogateString, surrMatchSet1) != 3) {
@@ -1597,25 +1608,25 @@ static void TestStringSearching()
log_verbose("Testing u_strspn()");
- if (u_strspn(testString, u_uastrcpy(ucharBuf, "a")) != 1) {
+ if (u_strspn(testString, a) != 1) {
log_err("u_strspn couldn't skip first letter a.\n");
}
- if (u_strspn(testString, u_uastrcpy(ucharBuf, "ab")) != 2) {
+ if (u_strspn(testString, ab) != 2) {
log_err("u_strspn couldn't skip a or b.\n");
}
- if (u_strspn(testString, u_uastrcpy(ucharBuf, "ba")) != 2) {
+ if (u_strspn(testString, ba) != 2) {
log_err("u_strspn couldn't skip a or b.\n");
}
- if (u_strspn(testString, u_uastrcpy(ucharBuf, "f")) != 0) {
+ if (u_strspn(testString, f) != 0) {
log_err("u_strspn didn't return 0 for \"f\".\n");
}
- if (u_strspn(testString, u_uastrcpy(ucharBuf, "dc")) != 0) {
+ if (u_strspn(testString, dc) != 0) {
log_err("u_strspn couldn't find first letter a (skip d or c).\n");
}
- if (u_strspn(testString, u_uastrcpy(ucharBuf, "abcd")) != u_strlen(testString)) {
+ if (u_strspn(testString, abcd) != u_strlen(testString)) {
log_err("u_strspn couldn't skip over the whole string.\n");
}
- if (u_strspn(testString, u_uastrcpy(ucharBuf, "")) != 0) {
+ if (u_strspn(testString, empty) != 0) {
log_err("u_strspn should have returned 0 for empty string.\n");
}
@@ -1626,13 +1637,13 @@ static void TestStringSearching()
if (u_strspn(testSurrogateString, surrMatchSetBad2) != 2) {
log_err("u_strspn couldn't skip 0xdbff or a.\n");
}
- if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != 0) {
+ if (u_strspn(testSurrogateString, f) != 0) {
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
}
- if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 0) {
+ if (u_strspn(testSurrogateString, dc) != 0) {
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
}
- if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 0) {
+ if (u_strspn(testSurrogateString, cd) != 0) {
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
}
if (u_strspn(testSurrogateString, testSurrogateString) != u_strlen(testSurrogateString)) {
diff --git a/icu4c/source/test/cintltst/utf16tst.c b/icu4c/source/test/cintltst/utf16tst.c
index e5367367b22..9dd395a9145 100644
--- a/icu4c/source/test/cintltst/utf16tst.c
+++ b/icu4c/source/test/cintltst/utf16tst.c
@@ -21,6 +21,7 @@
#include "cintltst.h"
#include
+#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
static void printUChars(const UChar *uchars);
@@ -31,6 +32,7 @@ static void TestNextPrevChar(void);
static void TestFwdBack(void);
static void TestSetChar(void);
static void TestAppendChar(void);
+static void TestAppend(void);
static void TestSurrogate(void);
void addUTF16Test(TestNode** root);
@@ -45,6 +47,7 @@ addUTF16Test(TestNode** root)
addTest(root, &TestFwdBack, "utf16tst/TestFwdBack" );
addTest(root, &TestSetChar, "utf16tst/TestSetChar" );
addTest(root, &TestAppendChar, "utf16tst/TestAppendChar" );
+ addTest(root, &TestAppend, "utf16tst/TestAppend" );
addTest(root, &TestSurrogate, "utf16tst/TestSurrogate" );
}
@@ -57,17 +60,17 @@ static void TestCodeUnitValues()
UChar c=codeunit[i];
log_verbose("Testing code unit value of %x\n", c);
if(i<4){
- if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c)){
+ if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c) || !U16_IS_SINGLE(c) || U16_IS_LEAD(c) || U16_IS_TRAIL(c)){
log_err("ERROR: %x is a single character\n", c);
}
}
if(i >= 4 && i< 8){
- if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c)){
+ if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c) || !U16_IS_LEAD(c) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c)){
log_err("ERROR: %x is a first surrogate\n", c);
}
}
if(i >= 8 && i< 12){
- if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c)){
+ if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || !U16_IS_TRAIL(c) || U16_IS_SINGLE(c) || U16_IS_LEAD(c)){
log_err("ERROR: %x is a second surrogate\n", c);
}
}
@@ -93,7 +96,7 @@ static void TestCharLength()
UBool multiple;
for(i=0; i 0){
+ U16_BACK_1_UNSAFE(input, offunsafe);
+ if(offunsafe != back_unsafe[i]){
+ log_err("ERROR: U16_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
+ }
+ i++;
+ }
+
i=0;
while(offsafe > 0){
UTF16_BACK_1_SAFE(input,0, offsafe);
@@ -321,6 +408,16 @@ static void TestFwdBack(){
}
i++;
}
+
+ i=0;
+ while(offsafe > 0){
+ U16_BACK_1(input,0, offsafe);
+ if(offsafe != back_safe[i]){
+ log_err("ERROR: U16_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
+ }
+ i++;
+ }
+
offunsafe=0;
offsafe=0;
for(i=0; i 0) {
setOffset=offset;
UTF16_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
if(setOffset != limit_unsafe[i]){
log_err("ERROR: UTF16_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset);
}
+
+ setOffset=offset;
+ U16_SET_CP_LIMIT_UNSAFE(input, setOffset);
+ if(setOffset != limit_unsafe[i]){
+ log_err("ERROR: U16_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset);
+ }
}
+
setOffset=offset;
- UTF16_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR);
+ U16_SET_CP_LIMIT(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR);
if(setOffset != limit_safe[i]){
- log_err("ERROR: UTF16_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset);
+ log_err("ERROR: U16_SET_CP_LIMIT failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset);
}
+
i++;
}
}
@@ -487,6 +643,67 @@ static void TestAppendChar(){
}
+static void TestAppend() {
+ static const UChar32 codePoints[]={
+ 0x61, 0xdf, 0x901, 0x3040,
+ 0xac00, 0xd800, 0xdbff, 0xdcde,
+ 0xdffd, 0xe000, 0xffff, 0x10000,
+ 0x12345, 0xe0021, 0x10ffff, 0x110000,
+ 0x234567, 0x7fffffff, -1, -1000,
+ 0, 0x400
+ };
+ static const UChar expectUnsafe[]={
+ 0x61, 0xdf, 0x901, 0x3040,
+ 0xac00, 0xd800, 0xdbff, 0xdcde,
+ 0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00,
+ 0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */
+ /* none from this line */
+ 0, 0x400
+ }, expectSafe[]={
+ 0x61, 0xdf, 0x901, 0x3040,
+ 0xac00, 0xd800, 0xdbff, 0xdcde,
+ 0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00,
+ 0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */
+ /* none from this line */
+ 0, 0x400
+ };
+
+ UChar buffer[100];
+ UChar32 c;
+ int32_t i, length;
+ UBool isError, expectIsError, wrongIsError;
+
+ length=0;
+ for(i=0; i= 0 : c != result[i+1]){
+ log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
+ }
+
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
if(c != result[i+1]){
log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
+
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
if(c != result[i+2]){
log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
@@ -225,7 +241,18 @@ static void TestNextPrevChar(){
if(c != result[i]){
log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
+
+ setOffset=offset;
+ U8_NEXT_UNSAFE(input, setOffset, c);
+ if(setOffset != movedOffset[i]){
+ log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[i], setOffset);
+ }
+ if(c != result[i]){
+ log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
+ }
}
+
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
if(setOffset != movedOffset[i+1]){
@@ -235,6 +262,17 @@ static void TestNextPrevChar(){
if(c != result[i+1]){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
+
+ setOffset=offset;
+ U8_NEXT(input, setOffset, sizeof(input), c);
+ if(setOffset != movedOffset[i+1]){
+ log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[i+1], setOffset);
+ }
+ if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
+ log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
+ }
+
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
if(setOffset != movedOffset[i+1]){
@@ -244,8 +282,10 @@ static void TestNextPrevChar(){
if(c != result[i+2]){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
}
+
i=i+6;
}
+
i=0;
for(offset=sizeof(input); offset > 0; --offset){
setOffset=offset;
@@ -257,6 +297,7 @@ static void TestNextPrevChar(){
if(c != result[i+3]){
log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
}
+
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
if(setOffset != movedOffset[i+4]){
@@ -266,6 +307,17 @@ static void TestNextPrevChar(){
if(c != result[i+4]){
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
}
+
+ setOffset=offset;
+ U8_PREV(input, 0, setOffset, c);
+ if(setOffset != movedOffset[i+4]){
+ log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
+ offset, movedOffset[i+4], setOffset);
+ }
+ if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
+ log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
+ }
+
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
if(setOffset != movedOffset[i+5]){
@@ -275,6 +327,7 @@ static void TestNextPrevChar(){
if(c != result[i+5]){
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
}
+
i=i+6;
}
@@ -295,6 +348,7 @@ static void TestFwdBack(){
uint32_t offunsafe=0, offsafe=0;
+
uint32_t i=0;
while(offunsafe < sizeof(input)){
UTF8_FWD_1_UNSAFE(input, offunsafe);
@@ -303,6 +357,16 @@ static void TestFwdBack(){
}
i++;
}
+
+ i=0; offunsafe=0; /* rewind: the UTF8_FWD_1_UNSAFE loop above leaves offunsafe at the end of input, so without this reset the loop below would never execute */
+ while(offunsafe < sizeof(input)){ /* step through input with the new macro, checking each resulting offset against fwd_unsafe[] */
+ U8_FWD_1_UNSAFE(input, offunsafe);
+ if(offunsafe != fwd_unsafe[i]){
+ log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
+ }
+ i++;
+ }
+
i=0;
while(offsafe < sizeof(input)){
UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
@@ -311,6 +375,16 @@ static void TestFwdBack(){
}
i++;
}
+
+ i=0; offsafe=0; /* rewind: the UTF8_FWD_1_SAFE loop above leaves offsafe at sizeof(input), so without this reset the loop below would never execute */
+ while(offsafe < sizeof(input)){ /* step through input with the new macro, checking each resulting offset against fwd_safe[] */
+ U8_FWD_1(input, offsafe, sizeof(input));
+ if(offsafe != fwd_safe[i]){
+ log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
+ }
+ i++;
+ }
+
offunsafe=sizeof(input);
i=0;
while(offunsafe > 0){
@@ -320,6 +394,17 @@ static void TestFwdBack(){
}
i++;
}
+
+ offunsafe=sizeof(input);
+ i=0;
+ while(offunsafe > 0){
+ U8_BACK_1_UNSAFE(input, offunsafe);
+ if(offunsafe != back_unsafe[i]){
+ log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
+ }
+ i++;
+ }
+
i=0;
offsafe=sizeof(input);
while(offsafe > 0){
@@ -329,14 +414,34 @@ static void TestFwdBack(){
}
i++;
}
+
+ i=0;
+ offsafe=sizeof(input); /* start at the end of input and walk backwards */
+ while(offsafe > 0){
+ U8_BACK_1(input, 0, offsafe);
+ if(offsafe != back_safe[i]){
+ log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe); /* report the value actually compared against (back_safe, not back_unsafe) */
+ }
+ i++;
+ }
+
offunsafe=0;
- offsafe=0;
for(i=0; i