ICU-2150 simplify/improve UTF macros

X-SVN-Rev: 9930
This commit is contained in:
Markus Scherer 2002-09-30 04:00:17 +00:00
parent 22e1a4fe61
commit 6b1fa6036a
16 changed files with 2991 additions and 1199 deletions

View file

@ -582,7 +582,7 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
int32_t inputLen = 0;
UChar decomp[decompSize];
UTF_APPEND_CHAR(temp, inputLen, bufSize, comp);
U16_APPEND_UNSAFE(temp, inputLen, comp);
int32_t decompLen = unorm_getDecomposition(comp, FALSE, decomp, decompSize);
if(decompLen < 0) {
decompLen = -decompLen;
@ -597,7 +597,9 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
UChar32 decompCp;
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
int32_t i = 0;
int32_t i;
UBool overflow = FALSE;
i = segmentPos;
while(i < segLen) {
UTF_NEXT_CHAR(segment, i, segLen, cp);
@ -620,7 +622,19 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
// brute force approach
UTF_APPEND_CHAR(buff, bufLen, bufSize, cp);
U16_APPEND(buff, bufLen, bufSize, cp, overflow);
if(overflow) {
/*
* ### TODO handle buffer overflow
* The buffer is large, but an overflow may still happen with
* unusual input (many combining marks?).
* Reallocate buffer and continue.
* markus 20020929
*/
overflow = FALSE;
}
/* TODO: optimize
// since we know that the classes are monotonically increasing, after zero

View file

@ -3100,6 +3100,10 @@ InputPath=.\unicode\utf8.h
# End Source File
# Begin Source File
SOURCE=.\unicode\utf_old.h
# End Source File
# Begin Source File
SOURCE=.\util.h
# End Source File
# Begin Source File

View file

@ -141,7 +141,7 @@ ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length
/* get the first code point in the buffer */
i=0;
UTF_NEXT_CHAR_SAFE(buffer, i, length, c, FALSE);
UTF_NEXT_CHAR(buffer, i, length, c);
if(i<length) {
/* there are UChars left in the buffer that need to go into the overflow buffer */
UChar *overflow=cnv->UCharErrorBuffer;

View file

@ -156,7 +156,7 @@ u_strstr(const UChar *s, const UChar *substring);
* but u_strchr32() will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that UTF_GET_CHAR(u_strchr32(c))==c.
* This behavior ensures that U16_GET(u_strchr32(c))==c.
*
* @param s The string to search.
* @param c The code point (0..0x10ffff) to find.
@ -628,7 +628,7 @@ u_memchr(const UChar *src, UChar ch, int32_t count);
* but u_memchr32() will find neither because they
* combine to the code point U+10000.
* Either function will find U+d800 in "a\ud800b".
* This behavior ensures that UTF_GET_CHAR(u_memchr32(c))==c.
* This behavior ensures that U16_GET(u_memchr32(c))==c.
*
* @param src string to search in
* @param ch character to find

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -15,77 +15,89 @@
*/
/**
* \file
* \brief C API: UChar and UChar32 data types and UTF macros for C Unicode string handling
*
* <p>This file defines the UChar and UChar32 data types for Unicode code units
* and code points, as well as macros for efficiently getting code points
* in and out of a string.</p>
*
* <p>utf.h is included by utypes.h and itself includes the utfXX.h after some
* common definitions. Those files define the macros for each UTF-size.</p>
*
* <p>The original concept for these files was for ICU to allow
* in principle to set which UTF (UTF-8/16/32) is used internally
* by defining UTF_SIZE to either 8, 16, or 32. utf.h would then define the UChar type
* accordingly. UTF-16 was the default.</p>
*
* <p>This concept has been abandoned.
* A lot of the ICU source code &mdash; especially low-level code like
* conversion, normalization, and collation &mdash; assumes UTF-16,
* utf.h enforces the default of UTF-16.
* The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.</p>
*
* <p>Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
* UChar is defined to be exactly wchar_t, otherwise uint16_t.</p>
*
* <p>UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
* Unicode code point (Unicode scalar value, 0..0x10ffff).
* Before ICU 2.4, the definition of UChar32 was similarly platform-dependent as
* the definition of UChar. For details see the documentation for UChar32 itself.</p>
*
* <p>utf.h also defines a number of C macros for handling single Unicode code points and
* for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual
* implementations of those macros and then aliases one set of them (for UTF-16) for general use.
* The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while
* the general alias macros always begin with UTF_...</p>
*
* <p>Many string operations can be done with or without error checking.
* Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe"
* ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause
* program failures if the strings are not well-formed. The safe macros have an additional, boolean
* parameter "strict". If strict is FALSE, then only illegal sequences are detected.
* Otherwise, irregular sequences and non-characters are detected as well (like single surrogates).
* Safe macros return special error code points for illegal/irregular sequences:
* Typically, U+ffff, or values that would result in a code unit sequence of the same length
* as the erroneous input sequence.<br>
* Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and
* they do not have start/length parameters for boundary checking.</p>
*
* <p>Here, the macros are aliased in two steps:
* In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are
* aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures.
* Then, in a second step, the default, general alias macros are set to use either the unsafe or
* the safe/not strict (default) or the safe/strict macro;
* these general macros do not have a strictness parameter.</p>
*
* <p>It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict.
* The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for
* Unicode string handling in ICU! To select this, define UTF_SAFE, UTF_STRICT, or UTF_UNSAFE.</p>
*
* <p>For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix.
* Only in some cases it may be necessary to control the choice of macro directly and use a less generic alias.
* For example, if it can be assumed that a string is well-formed and the index will stay within the bounds,
* then the _UNSAFE version may be used.
* If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.</p>
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.</p>
*/
* \file
* \brief C API: UChar and UChar32 data types and code point macros
*
* This file defines the UChar and UChar32 data types for Unicode code units
* and code points, as well as macros for checking whether a code point is
* a surrogate or a non-character.
*
* utf.h is included by utypes.h and itself includes utf8.h and utf16.h after some
* common definitions. Those files define macros for efficiently getting code points
* in and out of UTF-8/16 strings.
* utf16.h macros have "U16_" prefixes.
* utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling.
*
* ICU processes 16-bit Unicode strings.
* Most of the time, such strings are well-formed UTF-16.
* Single, unpaired surrogates must be handled as well, and are treated in ICU
* like regular code points where possible.
* (Pairs of surrogate code points are indistinguishable from supplementary
* code points encoded as pairs of surrogate code units.)
*
* In fact, almost all Unicode code points in normal text (>99%)
* are on the BMP (<=U+ffff) and even <=U+d7ff.
* ICU functions handle supplementary code points (U+10000..U+10ffff)
* but are optimized for the much more frequently occurring BMP code points.
*
* utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
* UChar is defined to be exactly wchar_t, otherwise uint16_t.
*
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
* Unicode code point (Unicode scalar value, 0..0x10ffff).
* Before ICU 2.4, the definition of UChar32 was platform-dependent, much like
* the definition of UChar. For details see the documentation for UChar32 itself.
*
* utf.h also defines a small number of C macros for single Unicode code points.
* These are simple checks for surrogates and non-characters.
* For actual Unicode character properties see uchar.h.
*
* By default, string operations must be done with error checking in case
* a string is not well-formed UTF-16.
* The macros will detect if a surrogate code unit is unpaired
* (lead unit without trail unit or vice versa) and just return the unit itself
* as the code point.
* (It is an accidental property of Unicode and UTF-16 that all
* malformed sequences can be expressed unambiguously with a distinct subrange
* of Unicode code points.)
*
* When it is safe to assume that text is well-formed UTF-16
* (does not contain single, unpaired surrogates), then one can use
* U16_..._UNSAFE macros.
* These do not check for proper code unit sequences or truncated text and may
* yield wrong results or even cause a crash if they are used with "malformed"
* text.
* In practice, U16_..._UNSAFE macros will produce slightly less code but
* should not be faster because the processing is only different when a
* surrogate code unit is detected, which will be rare.
*
* Similarly for UTF-8, there are "safe" macros without a suffix,
* and U8_..._UNSAFE versions.
* The performance differences are much larger here because UTF-8 provides so
* many opportunities for malformed sequences.
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
* and are fast, while the safe UTF-8 macros call functions for all but the
* trivial (ASCII) cases.
*
* Unlike with UTF-16, malformed UTF-8 sequences cannot be expressed with distinct
* code point values (0..U+10ffff). They are indicated with negative values instead.
*
* For more information see the ICU User Guide Strings chapter
* (http://oss.software.ibm.com/icu/userguide/).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*
* @draft ICU 2.4
*/
#ifndef __UTF_H__
#define __UTF_H__
/* wchar_t-related definitions ---------------------------------------------- */
/*
* ANSI C headers:
* stddef.h defines wchar_t
@ -94,18 +106,11 @@
#include <stddef.h>
/* include the utfXX.h after the following definitions */
/* If there is no compiler option for the preferred UTF size, then default to UTF-16. */
#ifndef UTF_SIZE
/** Number of bits in a Unicode string code unit, same as x in UTF-x (8, 16, or 32). */
# define UTF_SIZE 16
#endif
/** Number of bytes in a UChar (sizeof(UChar)). */
#define U_SIZEOF_UCHAR (UTF_SIZE>>3)
/*!
* \def U_SIZEOF_WCHAR_T
* U_SIZEOF_WCHAR_T==sizeof(wchar_t).
*
* @stable
*/
#ifndef U_HAVE_WCHAR_H
# define U_HAVE_WCHAR_H 1
@ -120,10 +125,14 @@
/*!
* \def U_WCHAR_IS_UTF16
* Defined if wchar_t uses UTF-16.
*
* @stable
*/
/*!
* \def U_WCHAR_IS_UTF32
* Defined if wchar_t uses UTF-32.
*
* @stable
*/
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
# ifdef __STDC_ISO_10646__
@ -145,139 +154,10 @@
# endif
#endif
/**
* Define UChar32 as a type for single Unicode code points.
* UChar32 is a signed 32-bit integer.
*
* The Unicode code point range is 0..0x10ffff.
* All other values (negative or >=0x110000) are illegal as Unicode code points.
* They may be used as sentinel values to indicate "done", "error"
* or similar non-code point conditions.
*
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
* or else to be uint32_t.
* That is, the definition of UChar32 was platform-dependent.
*
* @see UTF_SENTINEL
* @draft ICU 2.4
*/
typedef int32_t UChar32;
/* UChar and UChar32 definitions -------------------------------------------- */
/**
* Unicode string and array offset and index type.
* ICU always counts Unicode code units (UChars) for
* string offsets, indexes, and lengths, not Unicode code points.
*
* @deprecated Use int32_t directly. UTextOffset to be removed after 2003-mar.
*/
typedef int32_t UTextOffset;
/* Specify which macro versions are the default ones - safe or fast. */
#if !defined(UTF_SAFE) && !defined(UTF_STRICT) && !defined(UTF_UNSAFE)
/**
* The default choice for general Unicode string macros is to use the ..._SAFE macro implementations
* with strict=FALSE. See the utf.h file description.
*/
# define UTF_SAFE
#endif
/* internal definitions ----------------------------------------------------- */
/**
* <p>UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8,
* which need 1 or 2 bytes in UTF-8:<br>
* U+0015 = NAK = Negative Acknowledge, C0 control character<br>
* U+009f = highest C1 control character</p>
*
* <p>These are used by ("safe") UTF-8 macros so that they can return an error value
* that needs the same number of code units (bytes) as were seen by
* a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().</p>
*
* @internal
*/
#define UTF8_ERROR_VALUE_1 0x15
/**
* See documentation on UTF8_ERROR_VALUE_1 for details.
*/
#define UTF8_ERROR_VALUE_2 0x9f
/**
* Error value for all UTFs. This code point value will be set by macros with error
* checking if an error is detected.
*/
#define UTF_ERROR_VALUE 0xffff
/* single-code point definitions -------------------------------------------- */
/**
* This value is intended for sentinel values for APIs that
* (take or) return single code points (UChar32).
* It is outside of the Unicode code point range 0..0x10ffff.
*
* For example, a "done" or "error" value in a new API
* could be indicated with UTF_SENTINEL.
*
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
* values, mostly 0xffff.
* Those may need to be distinguished from
* actual U+ffff text contents by calling functions like
* CharacterIterator::hasNext() or UnicodeString::length().
*
* @see UChar32
* @draft ICU 2.4
*/
#define UTF_SENTINEL (-1)
/** Is this code unit or code point a surrogate (U+d800..U+dfff)? */
#define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800)
/**
* Is a given 32-bit code point a Unicode noncharacter?
*/
#define UTF_IS_UNICODE_NONCHAR(c) \
((c)>=0xfdd0 && \
((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
(uint32_t)(c)<=0x10ffff)
/**
* Is a given 32-bit code point/Unicode scalar value
* actually a valid Unicode (abstract) character?
*
* Code points that are not characters include:
* - single surrogate code points (U+d800..U+dfff, 2048 code points)
* - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
* - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
* - the highest Unicode code point value is U+10ffff
*
* This means that all code points below U+d800 are character code points,
* and that boundary is tested first for performance.
*/
#define UTF_IS_UNICODE_CHAR(c) \
((uint32_t)(c)<0xd800 || \
((uint32_t)(c)>0xdfff && \
(uint32_t)(c)<=0x10ffff && \
!UTF_IS_UNICODE_NONCHAR(c)))
/**
* Is a given 32-bit code an error value
* as returned by one of the macros for any UTF?
*/
#define UTF_IS_ERROR(c) \
(((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
/** This is a combined macro: Is c a valid Unicode value _and_ not an error code? */
#define UTF_IS_VALID(c) \
(UTF_IS_UNICODE_CHAR(c) && \
(c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2)
/* include the utfXX.h ------------------------------------------------------ */
#include "unicode/utf8.h"
#include "unicode/utf16.h"
#include "unicode/utf32.h"
/* Define types and macros according to the selected UTF size. -------------- */
/** Number of bytes in a UChar. @stable */
#define U_SIZEOF_UCHAR 2
/*!
* \var UChar
@ -290,262 +170,127 @@ typedef int32_t UTextOffset;
* @stable
*/
#if UTF_SIZE==8
# error UTF-8 is not implemented, undefine UTF_SIZE or define it to 16
/*
* ANSI C header:
* limits.h defines CHAR_MAX
*/
# include <limits.h>
/* Define UChar to be compatible with char if possible. */
# if CHAR_MAX>=255
typedef char UChar;
# else
typedef uint8_t UChar;
# endif
#elif UTF_SIZE==16
/* Define UChar to be compatible with wchar_t if possible. */
# if U_SIZEOF_WCHAR_T==2
typedef wchar_t UChar;
# else
typedef uint16_t UChar;
# endif
/** Does this code unit alone encode a code point? */
# define UTF_IS_SINGLE(uchar) UTF16_IS_SINGLE(uchar)
/** Is this code unit the first one of several? */
# define UTF_IS_LEAD(uchar) UTF16_IS_LEAD(uchar)
/** Is this code unit one of several but not the first one? */
# define UTF_IS_TRAIL(uchar) UTF16_IS_TRAIL(uchar)
/** Does this code point require multiple code units? */
# define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c)
/** How many code units are used to encode this code point? */
# define UTF_CHAR_LENGTH(c) UTF16_CHAR_LENGTH(c)
/** How many code units are used at most for any Unicode code point? */
# define UTF_MAX_CHAR_LENGTH UTF16_MAX_CHAR_LENGTH
/** Estimate the number of code units for a string based on the number of UTF-16 code units. */
# define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size)
/** See file documentation and UTF_GET_CHAR. */
# define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c)
/** See file documentation and UTF_GET_CHAR. */
# define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)
/** See file documentation and UTF_NEXT_CHAR. */
# define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c)
/** See file documentation and UTF_NEXT_CHAR. */
# define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
/** See file documentation and UTF_APPEND_CHAR. */
# define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c)
/** See file documentation and UTF_APPEND_CHAR. */
# define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c)
/** See file documentation and UTF_FWD_1. */
# define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i)
/** See file documentation and UTF_FWD_1. */
# define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length)
/** See file documentation and UTF_FWD_N. */
# define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n)
/** See file documentation and UTF_FWD_N. */
# define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n)
/** See file documentation and UTF_SET_CHAR_START. */
# define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i)
/** See file documentation and UTF_SET_CHAR_START. */
# define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i)
/** See file documentation and UTF_PREV_CHAR. */
# define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c)
/** See file documentation and UTF_PREV_CHAR. */
# define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)
/** See file documentation and UTF_BACK_1. */
# define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i)
/** See file documentation and UTF_BACK_1. */
# define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i)
/** See file documentation and UTF_BACK_N. */
# define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n)
/** See file documentation and UTF_BACK_N. */
# define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n)
/** See file documentation and UTF_SET_CHAR_LIMIT. */
# define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
/** See file documentation and UTF_SET_CHAR_LIMIT. */
# define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)
#elif UTF_SIZE==32
# error UTF-32 is not implemented, undefine UTF_SIZE or define it to 16
typedef UChar32 UChar;
/* Define UChar to be compatible with wchar_t if possible. */
#if U_SIZEOF_WCHAR_T==2
typedef wchar_t UChar;
#else
# error UTF_SIZE must be undefined or one of { 8, 16, 32 } - only 16 is implemented
typedef uint16_t UChar;
#endif
/* Define the default macros for handling UTF characters. ------------------- */
/**
* Define UChar32 as a type for single Unicode code points.
* UChar32 is a signed 32-bit integer (same as int32_t).
*
* The Unicode code point range is 0..0x10ffff.
* All other values (negative or >=0x110000) are illegal as Unicode code points.
* They may be used as sentinel values to indicate "done", "error"
* or similar non-code point conditions.
*
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
* or else to be uint32_t.
* That is, the definition of UChar32 was platform-dependent.
*
* @see U_SENTINEL
* @draft ICU 2.4
*/
typedef int32_t UChar32;
/* single-code point definitions -------------------------------------------- */
/**
* \def UTF_GET_CHAR(s, start, i, length, c)
* This value is intended for sentinel values for APIs that
* (take or) return single code points (UChar32).
* It is outside of the Unicode code point range 0..0x10ffff.
*
* For example, a "done" or "error" value in a new API
* could be indicated with U_SENTINEL.
*
* Set c to the code point that contains the code unit i.
* i could point to the first, the last, or an intermediate code unit.
* i is not modified.
* \pre 0<=i<length
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
* values, mostly 0xffff.
* Those may need to be distinguished from
* actual U+ffff text contents by calling functions like
* CharacterIterator::hasNext() or UnicodeString::length().
*
* @return -1
* @see UChar32
* @draft ICU 2.4
*/
#define U_SENTINEL (-1) /* negative, hence outside the code point range 0..0x10ffff */
/**
* \def UTF_NEXT_CHAR(s, i, length, c)
*
* Set c to the code point that starts at code unit i
* and advance i to beyond the code units of this code point (post-increment).
* i must point to the first code unit of a code point.
* \pre 0<=i<length
* \post 0<i<=length
* Is this code point a Unicode noncharacter?
* @param c 32-bit code point
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U_IS_UNICODE_NONCHAR(c) \
    ( (c) >= 0xfdd0 /* lowest noncharacter; a negative signed c fails this test */ && \
      ( (uint32_t)(c) <= 0xfdef /* U+fdd0..U+fdef */ || \
        ((c) & 0xfffe) == 0xfffe /* U+__fffe or U+__ffff on any plane */ ) && \
      (uint32_t)(c) <= 0x10ffff /* must not exceed the highest code point */ )
/**
* \def UTF_APPEND_CHAR(s, i, length, c)
* Is c a Unicode code point value (0..U+10ffff)
* that can be assigned a character?
*
* Append the code units of code point c to the string at index i
* and advance i to beyond the new code units (post-increment).
* The code units beginning at index i will be overwritten.
* \pre 0<=c<=0x10ffff
* \pre 0<=i<length
* \post 0<i<=length
* Code points that are not characters include:
* - single surrogate code points (U+d800..U+dfff, 2048 code points)
* - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
* - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
* - the highest Unicode code point value is U+10ffff
*
* This means that all code points below U+d800 are character code points,
* and that boundary is tested first for performance.
*
* @param c 32-bit code point
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U_IS_UNICODE_CHAR(c) \
    ( (uint32_t)(c) < 0xd800 /* fast path: below the surrogate range */ || \
      ( (uint32_t)(c) > 0xdfff /* above the surrogate range */ && \
        (uint32_t)(c) <= 0x10ffff /* within the code point range */ && \
        !U_IS_UNICODE_NONCHAR(c) ) )
/**
* \def UTF_FWD_1(s, i, length)
*
* Advance i to beyond the code units of the code point that begins at i.
* I.e., advance i by one code point.
* i must point to the first code unit of a code point.
* \pre 0<=i<length
* \post 0<i<=length
* Is this code point a lead surrogate (U+d800..U+dbff)?
* @param c 32-bit code point
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U_IS_LEAD(c) ( ((c) & 0xfffffc00) == 0xd800 ) /* ignore the low 10 bits; the rest must equal 0xd800 */
/**
* \def UTF_FWD_N(s, i, length, n)
*
* Advance i to beyond the code units of the n code points where the first one begins at i.
* I.e., advance i by n code points.
* i must point to the first code unit of a code point.
* \pre 0<=i<length
* \post 0<i<=length
* Is this code point a trail surrogate (U+dc00..U+dfff)?
* @param c 32-bit code point
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U_IS_TRAIL(c) ( ((c) & 0xfffffc00) == 0xdc00 ) /* ignore the low 10 bits; the rest must equal 0xdc00 */
/**
* \def UTF_SET_CHAR_START(s, start, i)
*
* Take the random-access index i and adjust it so that it points to the beginning
* of a code point.
* The input index points to any code unit of a code point and is moved to point to
* the first code unit of the same code point. i is never incremented.
* This can be used to start an iteration with UTF_NEXT_CHAR() from a random index.
* \pre start<=i<length
* \post start<=i<length
* Is this code point a surrogate (U+d800..U+dfff)?
* @param c 32-bit code point
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U_IS_SURROGATE(c) ( ((c) & 0xfffff800) == 0xd800 ) /* ignore the low 11 bits; the rest must equal 0xd800 */
/**
* \def UTF_PREV_CHAR(s, start, i, c)
*
* Set c to the code point that has code units before i
* and move i backward (towards the beginning of the string)
* to the first code unit of this code point (pre-decrement).
* i must point to the first code unit after the last unit of a code point (i==length is allowed).
* \pre start<i<=length
* \post start<=i<length
* Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
* is it a lead surrogate?
* @param c 32-bit code point
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U_IS_SURROGATE_LEAD(c) ( ((c) & 0x400) == 0 ) /* bit 0x400 is clear in U+d800..U+dbff and set in U+dc00..U+dfff */
/**
* \def UTF_BACK_1(s, start, i)
*
* Move i backward (towards the beginning of the string)
* to the first code unit of the code point that has code units before i.
* I.e., move i backward by one code point.
* i must point to the first code unit after the last unit of a code point (i==length is allowed).
* \pre start<i<=length
* \post start<=i<length
*/
/* include the utfXX.h ------------------------------------------------------ */
/**
* \def UTF_BACK_N(s, start, i, n)
*
* Move i backward (towards the beginning of the string)
* to the first code unit of the n code points that have code units before i.
* I.e., move i backward by n code points.
* i must point to the first code unit after the last unit of a code point (i==length is allowed).
* \pre start<i<=length
* \post start<=i<length
*/
#include "unicode/utf8.h"
#include "unicode/utf16.h"
/**
* \def UTF_SET_CHAR_LIMIT(s, start, i, length)
*
* Take the random-access index i and adjust it so that it points beyond
* a code point. The input index points beyond any code unit
* of a code point and is moved to point beyond the last code unit of the same
* code point. i is never decremented.
* This can be used to start an iteration with UTF_PREV_CHAR() from a random index.
* \pre start<i<=length
* \post start<i<=length
*/
#ifdef UTF_SAFE
# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE)
# define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE)
# define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c)
# define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length)
# define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n)
# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i)
# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE)
# define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i)
# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n)
# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)
#elif defined(UTF_STRICT)
# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE)
# define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE)
# define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c)
# define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length)
# define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n)
# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i)
# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE)
# define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i)
# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n)
# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)
#else /* UTF_UNSAFE */
# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c)
# define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_UNSAFE(s, i, c)
# define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_UNSAFE(s, i, c)
# define UTF_FWD_1(s, i, length) UTF_FWD_1_UNSAFE(s, i)
# define UTF_FWD_N(s, i, length, n) UTF_FWD_N_UNSAFE(s, i, n)
# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_UNSAFE(s, i)
# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_UNSAFE(s, i, c)
# define UTF_BACK_1(s, start, i) UTF_BACK_1_UNSAFE(s, i)
# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_UNSAFE(s, i, n)
# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i)
#endif
/* utf_old.h contains deprecated, pre-ICU 2.4 definitions */
#include "unicode/utf_old.h"
#endif

View file

@ -15,141 +15,265 @@
*/
/**
* \file
* \brief C API: UTF-16 macros
*
* This file defines macros to deal with UTF-16 code units and code points.
* "Safe" macros check for length overruns and illegal sequences, and
* also for irregular sequences when the strict option is set.
* "Unsafe" macros are designed for maximum speed.
* utf16.h is included by utf.h after unicode/umachine.h
* and some common definitions.</p>
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.</p>
*/
* \file
* \brief C API: 16-bit Unicode handling macros
*
* This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
* utf16.h is included by utf.h after unicode/umachine.h
* and some common definitions.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://oss.software.ibm.com/icu/userguide/).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
/* utf.h must be included first. */
#ifndef __UTF_H__
# include "unicode/utf.h"
#endif
#ifndef __UTF16_H__
#define __UTF16_H__
/* single-code point definitions -------------------------------------------- */
/* handle surrogate pairs */
#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)
#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)
#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)
/** Get the UTF-32 value directly from the surrogate pseudo-characters */
#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
#define UTF16_GET_PAIR_VALUE(first, second) \
(((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)
/* get the first and second surrogates for a supplementary code point */
/**
* Takes a supplementary code point (0x10000..0x10ffff)
* and computes the first surrogate (0xd800..0xdbff)
* for UTF-16 encoding.
* Does this code unit alone encode a code point (BMP, not a surrogate)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
/**
* Takes a supplementary code point (0x10000..0x10ffff)
* and computes the second surrogate (0xdc00..0xdfff)
* for UTF-16 encoding.
* Is this code unit a lead surrogate (U+d800..U+dbff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
/** alias for UTF_FIRST_SURROGATE */
#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary)
/** alias for UTF_SECOND_SURROGATE */
#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary)
/* classes of code unit values */
#define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar)
#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)
/* number of code units per code point */
#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
#define UTF16_MAX_CHAR_LENGTH 2
/* average number of code units compared to UTF-16 */
#define UTF16_ARRAY_SIZE(size) (size)
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
/**
* Get a single code point from an offset that points to any
* of the code units that belong to that code point.
* Assume 0<=i<length.
* Is this code unit a trail surrogate (U+dc00..U+dfff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
/**
* Is this code unit a surrogate (U+d800..U+dfff)?
* @param c 16-bit code unit
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
/**
* Tell lead from trail for a code unit that is already known to be a surrogate.
* Only bit 10 (0x400) is examined, so the answer is meaningful only
* when U16_IS_SURROGATE(c) holds.
* @param c 16-bit code unit
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U16_IS_SURROGATE_LEAD(c) (!((c)&0x400))
/**
* Helper constant for U16_GET_SUPPLEMENTARY.
* @internal
*/
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
/**
* Get a supplementary code point value (U+10000..U+10ffff)
* from its lead and trail surrogates.
* The result is undefined if the input values are not
* lead and trail surrogates.
*
* This could be used for iteration together with
* UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
* but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
* UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
* @param lead lead surrogate (U+d800..U+dbff)
* @param trail trail surrogate (U+dc00..U+dfff)
* @return supplementary code point (U+10000..U+10ffff)
* @draft ICU 2.4
*/
#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
#define U16_GET_SUPPLEMENTARY(lead, trail) \
(((lead)<<10UL)+(trail)-U16_SURROGATE_OFFSET)
/**
* Get the lead surrogate (0xd800..0xdbff) for a
* supplementary code point (0x10000..0x10ffff).
* The argument is not validated; for input outside the supplementary
* range the result is not a lead surrogate.
* The whole expansion is parenthesized so that it behaves as a single
* operand in any expression context (for example after sizeof).
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return lead surrogate (U+d800..U+dbff) for supplementary
* @draft ICU 2.4
*/
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
/**
* Get the trail surrogate (0xdc00..0xdfff) for a
* supplementary code point (0x10000..0x10ffff).
* The argument is not validated; only its low 10 bits are used.
* The whole expansion is parenthesized so that it behaves as a single
* operand in any expression context (for example after sizeof).
* @param supplementary 32-bit code point (U+10000..U+10ffff)
* @return trail surrogate (U+dc00..U+dfff) for supplementary
* @draft ICU 2.4
*/
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
/**
* Number of 16-bit code units needed to encode a Unicode code point in UTF-16.
* Code points above U+ffff take a surrogate pair (2 units), all others 1 unit.
* The comparison is done in uint32_t, so the result is not meaningful
* if c is not a Unicode code point (U+0000..U+10ffff).
* @param c 32-bit code point
* @return 1 or 2
* @draft ICU 2.4
*/
#define U16_LENGTH(c) ((uint32_t)(c)>0xffff ? 2 : 1)
/**
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
* @return 2
* @draft ICU 2.4
*/
#define U16_MAX_LENGTH 2
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
* The result is undefined if the offset points to a single, unpaired surrogate.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_GET
* @draft ICU 2.4
*/
#define U16_GET_UNSAFE(s, i, c) { \
(c)=(s)[i]; \
if(UTF_IS_SURROGATE(c)) { \
if(UTF_IS_SURROGATE_FIRST(c)) { \
(c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
if(U16_IS_SURROGATE(c)) { \
if(U16_IS_SURROGATE_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
} else { \
(c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
(c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
} \
} \
}
#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to either the lead or trail surrogate unit
* for a supplementary code point, in which case the macro will read
* the adjacent matching surrogate as well.
* If the offset points to a single, unpaired surrogate, then that itself
* will be returned as the code point.
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, start<=i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_GET_UNSAFE
* @draft ICU 2.4
*/
#define U16_GET(s, start, i, length, c) { \
(c)=(s)[i]; \
if(UTF_IS_SURROGATE(c)) { \
if(U16_IS_SURROGATE(c)) { \
uint16_t __c2; \
if(UTF_IS_SURROGATE_FIRST(c)) { \
if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
(c)=UTF16_GET_PAIR_VALUE((c), __c2); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched first surrogate */ \
(c)=UTF_ERROR_VALUE; \
if(U16_IS_SURROGATE_LEAD(c)) { \
if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} else { \
if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
(c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched second surrogate */ \
(c)=UTF_ERROR_VALUE; \
if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
/* definitions with forward iteration --------------------------------------- */
/*
* all the macros that go forward assume that
* the initial offset is 0<=i<length;
* they update the offset
*/
/* fast versions, no error-checking */
/**
* Get a single code point from an offset that points to the first
* of the code units that belong to that code point.
* Assume 0<=i<length.
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset points to a single, unpaired lead surrogate.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_NEXT
* @draft ICU 2.4
*/
#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
#define U16_NEXT_UNSAFE(s, i, c) { \
(c)=(s)[(i)++]; \
if(UTF_IS_FIRST_SURROGATE(c)) { \
(c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
if(U16_IS_LEAD(c)) { \
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
} \
}
#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
/**
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* The offset may point to the lead surrogate unit
* for a supplementary code point, in which case the macro will read
* the following trail surrogate as well.
* If the offset points to a trail surrogate or
* to a single, unpaired lead surrogate, then that itself
* will be returned as the code point.
*
* @param s const UChar * string
* @param i string offset, i<length
* @param length string length
* @param c output UChar32 variable
* @see U16_NEXT_UNSAFE
* @draft ICU 2.4
*/
#define U16_NEXT(s, i, length, c) { \
(c)=(s)[(i)++]; \
if(U16_IS_LEAD(c)) { \
uint16_t __c2; \
if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
++(i); \
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
} \
} \
}
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s UChar * string buffer (written to, so it must not be const)
* @param i string offset
* @param c code point to append
* @see U16_APPEND
* @draft ICU 2.4
*/
#define U16_APPEND_UNSAFE(s, i, c) { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else { \
@ -158,178 +282,323 @@
} \
}
#define UTF16_FWD_1_UNSAFE(s, i) { \
if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
/**
* Append a code point to a string, overwriting 1 or 2 code units.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks that the code point is not above U+10ffff
* (the comparison is unsigned, so a negative c is rejected as well).
* If a surrogate pair is written, checks for sufficient space in the string.
* If the code point is not valid or a trail surrogate does not fit,
* then isError is set to TRUE and nothing is written.
*
* A code point up to U+ffff is written as a single code unit without
* a capacity check; the caller must guarantee i<capacity.
* The arguments s, i and c are evaluated more than once.
*
* @param s UChar * string buffer (written to)
* @param i string offset, i<capacity
* @param capacity size of the string buffer
* @param c code point to append
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
* @see U16_APPEND_UNSAFE
* @draft ICU 2.4
*/
#define U16_APPEND(s, i, capacity, c, isError) { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} else /* c>0x10ffff or not enough space */ { \
(isError)=TRUE; \
} \
}
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_FWD_1
* @draft ICU 2.4
*/
#define U16_FWD_1_UNSAFE(s, i) { \
if(U16_IS_LEAD((s)[(i)++])) { \
++(i); \
} \
}
#define UTF16_FWD_N_UNSAFE(s, i, n) { \
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param i string offset, i<length
* @param length string length
* @see U16_FWD_1_UNSAFE
* @draft ICU 2.4
*/
#define U16_FWD_1(s, i, length) { \
if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
}
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_FWD_N
* @draft ICU 2.4
*/
#define U16_FWD_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
while(__N>0) { \
UTF16_FWD_1_UNSAFE(s, i); \
U16_FWD_1_UNSAFE(s, i); \
--__N; \
} \
}
/**
* Set a random-access offset and adjust it so that
* it points to the beginning of a Unicode character.
* The offset that is passed in points to
* any code unit of a code point
* and will point to the first code unit after
* the macro invocation.
* Never increments the offset.
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param i string offset, i<length
* @param length string length
* @param n number of code points to skip
* @see U16_FWD_N_UNSAFE
* @draft ICU 2.4
*/
#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
--(i); \
} \
}
/* safe versions with error-checking and optional regularity-checking */
#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
(c)=(s)[(i)++]; \
if(UTF_IS_FIRST_SURROGATE(c)) { \
uint16_t __c2; \
if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
++(i); \
(c)=UTF16_GET_PAIR_VALUE((c), __c2); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched first surrogate */ \
(c)=UTF_ERROR_VALUE; \
} \
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
/* unmatched second surrogate or other non-character */ \
(c)=UTF_ERROR_VALUE; \
} \
}
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
if((uint32_t)(c)<=0xffff) { \
(s)[(i)++]=(uint16_t)(c); \
} else if((uint32_t)(c)<=0x10ffff) { \
if((i)+1<(length)) { \
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
} else /* not enough space */ { \
(s)[(i)++]=UTF_ERROR_VALUE; \
} \
} else /* c>0x10ffff, write error value */ { \
(s)[(i)++]=UTF_ERROR_VALUE; \
} \
}
#define UTF16_FWD_1_SAFE(s, i, length) { \
if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
++(i); \
} \
}
#define UTF16_FWD_N_SAFE(s, i, length, n) { \
#define U16_FWD_N(s, i, length, n) { \
int32_t __N=(n); \
while(__N>0 && (i)<(length)) { \
UTF16_FWD_1_SAFE(s, i, length); \
U16_FWD_1(s, i, length); \
--__N; \
} \
}
#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_START
* @draft ICU 2.4
*/
#define U16_SET_CP_START_UNSAFE(s, i) { \
if(U16_IS_TRAIL((s)[i])) { \
--(i); \
} \
}
/**
* Adjust a random-access offset to a code point boundary
* at the start of a code point.
* If the offset points to the trail surrogate of a surrogate pair,
* then the offset is decremented.
* Otherwise, it is not modified.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, start<=i
* @see U16_SET_CP_START_UNSAFE
* @draft ICU 2.4
*/
#define U16_SET_CP_START(s, start, i) { \
if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
}
/* definitions with backward iteration -------------------------------------- */
/*
* all the macros that go backward assume that
* the valid buffer range starts at offset 0
* and that the initial offset is 0<i<=length;
* they update the offset
*/
/* fast versions, no error-checking */
/**
* Get a single code point from an offset that points behind the last
* of the code units that belong to that code point.
* Assume 0<=i<length.
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-16.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind a single, unpaired trail surrogate.
*
* @param s const UChar * string
* @param i string offset
* @param c output UChar32 variable
* @see U16_PREV
* @draft ICU 2.4
*/
#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
#define U16_PREV_UNSAFE(s, i, c) { \
(c)=(s)[--(i)]; \
if(UTF_IS_SECOND_SURROGATE(c)) { \
(c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
if(U16_IS_TRAIL(c)) { \
(c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
} \
}
#define UTF16_BACK_1_UNSAFE(s, i) { \
if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, handles unpaired surrogates and checks for the start of the string.
*
* The input offset may be the same as the string length.
* If the offset is behind a trail surrogate unit
* for a supplementary code point, then the macro will read
* the preceding lead surrogate as well.
* If the offset is behind a lead surrogate or behind a single, unpaired
* trail surrogate, then that itself
* will be returned as the code point.
*
* The offset is decremented unconditionally before the first read,
* so the caller must ensure start<i.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, start<i
* @param c output UChar32 variable
* @see U16_PREV_UNSAFE
* @draft ICU 2.4
*/
#define U16_PREV(s, start, i, c) { \
(c)=(s)[--(i)]; \
if(U16_IS_TRAIL(c)) { \
uint16_t __c2; \
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
--(i); \
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
} \
} \
}
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_BACK_1
* @draft ICU 2.4
*/
#define U16_BACK_1_UNSAFE(s, i) { \
if(U16_IS_TRAIL((s)[--(i)])) { \
--(i); \
} \
}
#define UTF16_BACK_N_UNSAFE(s, i, n) { \
/**
* Move the string offset from one code point boundary to the previous one.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for the start of the string.
*
* The offset is decremented unconditionally before the first read,
* so the caller must ensure start<i. A second decrement happens only
* when a lead surrogate at or after start precedes the trail surrogate.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, start<i
* @see U16_BACK_1_UNSAFE
* @draft ICU 2.4
*/
#define U16_BACK_1(s, start, i) { \
if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
--(i); \
} \
}
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @param n number of code points to skip
* @see U16_BACK_N
* @draft ICU 2.4
*/
#define U16_BACK_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
while(__N>0) { \
UTF16_BACK_1_UNSAFE(s, i); \
U16_BACK_1_UNSAFE(s, i); \
--__N; \
} \
}
/**
* Set a random-access offset and adjust it so that
* it points after the end of a Unicode character.
* The offset that is passed in points behind
* any code unit of a code point
* and will point behind the last code unit after
* the macro invocation.
* Never decrements the offset.
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, start<=i
* @param n number of code points to skip
* @see U16_BACK_N_UNSAFE
* @draft ICU 2.4
*/
#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
#define U16_BACK_N(s, start, i, n) { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
U16_BACK_1(s, start, i); \
--__N; \
} \
}
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-16.
*
* @param s const UChar * string
* @param i string offset
* @see U16_SET_CP_LIMIT
* @draft ICU 2.4
*/
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
if(U16_IS_LEAD((s)[(i)-1])) { \
++(i); \
} \
}
/* safe versions with error-checking and optional regularity-checking */
#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
(c)=(s)[--(i)]; \
if(UTF_IS_SECOND_SURROGATE(c)) { \
uint16_t __c2; \
if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
--(i); \
(c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
} else if(strict) {\
/* unmatched second surrogate */ \
(c)=UTF_ERROR_VALUE; \
} \
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
/* unmatched first surrogate or other non-character */ \
(c)=UTF_ERROR_VALUE; \
} \
}
#define UTF16_BACK_1_SAFE(s, start, i) { \
if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
--(i); \
} \
}
#define UTF16_BACK_N_SAFE(s, start, i, n) { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
UTF16_BACK_1_SAFE(s, start, i); \
--__N; \
} \
}
#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind the lead surrogate of a surrogate pair,
* then the offset is incremented.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
*
* @param s const UChar * string
* @param start starting string offset (usually 0)
* @param i string offset, start<=i<=length
* @param length string length
* @see U16_SET_CP_LIMIT_UNSAFE
* @draft ICU 2.4
*/
#define U16_SET_CP_LIMIT(s, start, i, length) { \
if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
++(i); \
} \
}

View file

@ -14,146 +14,10 @@
* created by: Markus W. Scherer
*/
/**
* \file
* \brief C API: UTF-32 macros
*
* This file defines macros to deal with UTF-32 code units and code points.
* Signatures and semantics are the same as for the similarly named macros
* in utf16.h.
* utf32.h is included by utf.h after unicode/umachine.h</p>
* and some common definitions.
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.</p>
*/
#ifndef __UTF32_H__
#define __UTF32_H__
/* internal definitions ----------------------------------------------------- */
#define UTF32_IS_SAFE(c, strict) \
(!(strict) ? \
(uint32_t)(c)<=0x10ffff : \
UTF_IS_UNICODE_CHAR(c))
/*
* For the semantics of all of these macros, see utf16.h.
* The UTF-32 versions are trivial because any code point is
* encoded using exactly one code unit.
* \file
* \brief C API: UTF-32 macros
*
* This file is deprecated and its contents moved to utf_old.h.
* See utf_old.h and Jitterbug 2150 and its discussion on the ICU mailing list
* in September 2002.
*/
/* single-code point definitions -------------------------------------------- */
/* classes of code unit values */
#define UTF32_IS_SINGLE(uchar) 1
#define UTF32_IS_LEAD(uchar) 0
#define UTF32_IS_TRAIL(uchar) 0
/* number of code units per code point */
#define UTF32_NEED_MULTIPLE_UCHAR(c) 0
#define UTF32_CHAR_LENGTH(c) 1
#define UTF32_MAX_CHAR_LENGTH 1
/* average number of code units compared to UTF-16 */
#define UTF32_ARRAY_SIZE(size) (size)
#define UTF32_GET_CHAR_UNSAFE(s, i, c) { \
(c)=(s)[i]; \
}
#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
(c)=(s)[i]; \
if(!UTF32_IS_SAFE(c, strict)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
/* definitions with forward iteration --------------------------------------- */
#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) { \
(c)=(s)[(i)++]; \
}
#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) { \
(s)[(i)++]=(c); \
}
#define UTF32_FWD_1_UNSAFE(s, i) { \
++(i); \
}
#define UTF32_FWD_N_UNSAFE(s, i, n) { \
(i)+=(n); \
}
#define UTF32_SET_CHAR_START_UNSAFE(s, i) { \
}
#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
(c)=(s)[(i)++]; \
if(!UTF32_IS_SAFE(c, strict)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) { \
if((uint32_t)(c)<=0x10ffff) { \
(s)[(i)++]=(c); \
} else /* c>0x10ffff, write 0xfffd */ { \
(s)[(i)++]=0xfffd; \
} \
}
#define UTF32_FWD_1_SAFE(s, i, length) { \
++(i); \
}
#define UTF32_FWD_N_SAFE(s, i, length, n) { \
if(((i)+=(n))>(length)) { \
(i)=(length); \
} \
}
#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \
}
/* definitions with backward iteration -------------------------------------- */
#define UTF32_PREV_CHAR_UNSAFE(s, i, c) { \
(c)=(s)[--(i)]; \
}
#define UTF32_BACK_1_UNSAFE(s, i) { \
--(i); \
}
#define UTF32_BACK_N_UNSAFE(s, i, n) { \
(i)-=(n); \
}
#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \
}
#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \
(c)=(s)[--(i)]; \
if(!UTF32_IS_SAFE(c, strict)) { \
(c)=UTF_ERROR_VALUE; \
} \
}
#define UTF32_BACK_1_SAFE(s, start, i) { \
--(i); \
}
#define UTF32_BACK_N_SAFE(s, start, i, n) { \
(i)-=(n); \
if((i)<(start)) { \
(i)=(start); \
} \
}
#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) { \
}
#endif

View file

@ -15,23 +15,25 @@
*/
/**
* \file
* \brief C API: UTF-8 macros
*
* This file defines macros to deal with UTF-8 code units and code points.
* Signatures and semantics are the same as for the similarly named macros
* in utf16.h.
* utf8.h is included by utf.h after unicode/umachine.h
* and some common definitions.</p>
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.</p>
*/
* \file
* \brief C API: 8-bit Unicode handling macros
*
* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
* utf8.h is included by utf.h after unicode/umachine.h
* and some common definitions.
*
* For more information see utf.h and the ICU User Guide Strings chapter
* (http://oss.software.ibm.com/icu/userguide/).
*
* <em>Usage:</em>
* ICU coding guidelines for if() statements should be followed when using these macros.
* Compound statements (curly braces {}) must be used for if-else-while...
* bodies and all macro statements should be terminated with semicolon.
*/
/* utf.h must be included first. */
#ifndef __UTF_H__
# include "unicode/utf.h"
# include "unicode/utf.h"
#endif
#ifndef __UTF8_H__
@ -39,6 +41,12 @@
/* internal definitions ----------------------------------------------------- */
/**
* \var utf8_countTrailBytes
* Internal array with numbers of trail bytes for any given byte used in
* lead byte position.
* @internal
*/
#ifdef U_UTF8_IMPL
U_CAPI const uint8_t
utf8_countTrailBytes[256];
@ -48,114 +56,166 @@ utf8_countTrailBytes[256];
#endif
/**
* Count the trail bytes for a lead byte -
* this macro should be used so that the assembler code
* that is mentioned in utf_impl.c could be used here.
* Count the trail bytes for a UTF-8 lead byte.
* @internal
*/
#define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte])
/* leadByte is parenthesized so that an expression argument is converted to uint8_t as a whole before indexing. */
#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)(leadByte)])
/* use a macro here, too - there may be a simpler way with some machines */
#define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
/**
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
* @internal
*/
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
/**
* Function for handling "next code point" with error-checking.
* @internal
*/
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError);
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
/**
* Function for handling "append code point" with error-checking.
* @internal
*/
U_CAPI int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c);
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
/**
* Function for handling "previous code point" with error-checking.
* @internal
*/
U_CAPI UChar32 U_EXPORT2
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
/**
* Function for handling "skip backward one code point" with error-checking.
* @internal
*/
U_CAPI int32_t U_EXPORT2
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
/*
* For the semantics of all of these macros, see utf16.h.
* The UTF-8 macros favor sequences more the shorter they are.
* Sometimes, only the single-byte case is covered by a macro,
* while longer sequences are handled by a function call.
*/
/* single-code point definitions -------------------------------------------- */
/** Is this code point a single code unit (byte)? */
#define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0)
/** Is this code unit the lead code unit (byte) of a code point? */
#define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e)
/** Is this code unit a trailing code unit (byte) of a code point? */
#define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80)
/** Does this scalar Unicode value need multiple code units for storage? */
#define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f)
/**
* Is this code unit (byte) a complete code point on its own?
* That is the case exactly when the top bit (0x80) is clear,
* i.e. for US-ASCII 0..0x7f.
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#define U8_IS_SINGLE(c) (!((c)&0x80))
/**
* Given the lead character, how many bytes are taken by this code point.
* ICU does not deal with code points >0x10ffff
* unless necessary for advancing in the byte stream.
*
* These length macros take into account that for values >0x10ffff
* the "safe" append macros would write the error code point 0xffff
* with 3 bytes.
* Code point comparisons need to be in uint32_t because UChar32
* may be a signed type, and negative values must be recognized.
* Is this code unit (byte) a UTF-8 lead byte?
* @param c 8-bit code unit (byte)
* @return TRUE or FALSE
* @draft ICU 2.4
*/
#if 1
# define UTF8_CHAR_LENGTH(c) \
((uint32_t)(c)<=0x7f ? 1 : \
((uint32_t)(c)<=0x7ff ? 2 : \
((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \
) \
)
#else
# define UTF8_CHAR_LENGTH(c) \
((uint32_t)(c)<=0x7f ? 1 : \
((uint32_t)(c)<=0x7ff ? 2 : \
((uint32_t)(c)<=0xffff ? 3 : \
((uint32_t)(c)<=0x10ffff ? 4 : \
((uint32_t)(c)<=0x3ffffff ? 5 : \
((uint32_t)(c)<=0x7fffffff ? 6 : 3) \
) \
) \
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
/**
 * Tests whether a byte is a UTF-8 trail byte, i.e., whether its
 * two top bits are 10 (0x80..0xbf).
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @draft ICU 2.4
 */
#define U8_IS_TRAIL(c) ((((c)&0xc0)^0x80)==0)
/**
 * Number of code units (bytes) that the UTF-8 encoding of a
 * Unicode code point occupies.
 * The argument is compared as uint32_t, so a negative value is handled
 * like any other value above 0x10ffff.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @draft ICU 2.4
 */
#define U8_LENGTH(c) \
    (((uint32_t)(c)>0x10ffff || ((uint32_t)(c)&0xfffff800)==0xd800) ? 0 : \
        ((uint32_t)(c)<0x80 ? 1 : \
            ((uint32_t)(c)<0x800 ? 2 : \
                ((uint32_t)(c)<0x10000 ? 3 : 4) \
            ) \
        ) \
    )
#endif
) \
)
/** The maximum number of bytes per code point */
#define UTF8_MAX_CHAR_LENGTH 4
/**
* The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
* @return 4
* @draft ICU 2.4
*/
#define U8_MAX_LENGTH 4
/**
 * Estimated number of UTF-8 code units (bytes) for text that occupies
 * `size` UTF-16 code units: 2.5 bytes per UTF-16 unit, rounded down.
 * This is an average-case estimate, not a guaranteed upper bound.
 */
#define UTF8_ARRAY_SIZE(size) ((5*(size))/2)
#define UTF8_GET_CHAR_UNSAFE(s, i, c) { \
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
* The result is undefined if the offset points to an illegal UTF-8
* byte sequence.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_GET
* @draft ICU 2.4
*/
#define U8_GET_UNSAFE(s, i, c) { \
int32_t __I=(int32_t)(i); \
UTF8_SET_CHAR_START_UNSAFE(s, __I); \
UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \
U8_SET_CP_START_UNSAFE(s, __I); \
U8_NEXT_UNSAFE(s, __I, c); \
}
#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
/**
* Get a code point from a string at a random-access offset,
* without changing the offset.
* The offset may point to either the lead byte or one of the trail bytes
* for a code point, in which case the macro will read all of the bytes
* for the code point.
* If the offset points to an illegal UTF-8 byte sequence, then
* c is set to a negative value.
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
*
* @param s const uint8_t * string
* @param start starting string offset
* @param i string offset, start<=i<length
* @param length string length
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_GET_UNSAFE
* @draft ICU 2.4
*/
#define U8_GET(s, start, i, length, c) { \
int32_t __I=(int32_t)(i); \
UTF8_SET_CHAR_START_SAFE(s, start, __I); \
UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \
U8_SET_CP_START(s, start, __I); \
U8_NEXT(s, __I, length, c); \
}
/* definitions with forward iteration --------------------------------------- */
/**
* Read a Unicode scalar value from an array of UTF-8 bytes.
* Only values <=0x10ffff are accepted, and if an error occurs,
* then c will be set such that UTF_IS_ERROR(c).
* The _UNSAFE macro is fast and does not check for errors.
* The _SAFE macro checks for errors and optionally for
* irregular sequences, too, i.e., for sequences that
* are longer than necessary, such as <c0 80> instead of <0>.
* The strict checks also check for non-characters.
* Get a code point from a string at a code point boundary offset,
* and advance the offset to the next code point boundary.
* (Post-incrementing forward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The offset may point to the lead byte of a multi-byte sequence,
* in which case the macro will read the whole sequence.
* The result is undefined if the offset points to a trail byte
* or an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_NEXT
* @draft ICU 2.4
*/
#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \
#define U8_NEXT_UNSAFE(s, i, c) { \
(c)=(s)[(i)++]; \
if((uint8_t)((c)-0xc0)<0x35) { \
uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
UTF8_MASK_LEAD_BYTE(c, __count); \
uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \
U8_MASK_LEAD_BYTE(c, __count); \
switch(__count) { \
/* each following branch falls through to the next one */ \
case 3: \
@ -170,7 +230,49 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
} \
}
#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \
/**
 * Read the code point that starts at a code point boundary offset
 * and move the offset to the following code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * A byte below 0x80 is returned as is. A lead byte hands the rest of the
 * sequence to utf8_nextCharSafeBody(), which reads the trail bytes.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * @param s const uint8_t * string
 * @param i string offset, i<length
 * @param length string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @draft ICU 2.4
 */
#define U8_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if((c)>=0x80) { \
        /* non-ASCII: either a lead byte (decode the sequence) or an error */ \
        (c)=U8_IS_LEAD(c) ? \
            utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, -1) : \
            U_SENTINEL; \
    } \
}
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
* Otherwise, the result is undefined.
*
* @param s uint8_t * string buffer
* @param i string offset
* @param c code point to append
* @see U8_APPEND
* @draft ICU 2.4
*/
#define U8_APPEND_UNSAFE(s, i, c) { \
if((uint32_t)(c)<=0x7f) { \
(s)[(i)++]=(uint8_t)(c); \
} else { \
@ -189,74 +291,172 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
} \
}
#define UTF8_FWD_1_UNSAFE(s, i) { \
(i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
}
#define UTF8_FWD_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
while(__N>0) { \
UTF8_FWD_1_UNSAFE(s, i); \
--__N; \
} \
}
#define UTF8_SET_CHAR_START_UNSAFE(s, i) { \
while(UTF8_IS_TRAIL((s)[i])) { --(i); } \
}
#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
(c)=(s)[(i)++]; \
if((c)>=0x80) { \
if(UTF8_IS_LEAD(c)) { \
(c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict, NULL); \
} else { \
(c)=UTF8_ERROR_VALUE_1; \
} \
} \
}
#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \
/**
* Append a code point to a string, overwriting 1 to 4 bytes.
* The offset points to the current end of the string contents
* and is advanced (post-increment).
* "Safe" macro, checks for a valid code point.
* If a non-ASCII code point is written, checks for sufficient space in the string.
* If the code point is not valid or trail bytes do not fit,
* then isError is set to TRUE.
*
* @param s uint8_t * string buffer
* @param i string offset, i<length
* @param length size of the string buffer
* @param c code point to append
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
* @see U8_APPEND_UNSAFE
* @draft ICU 2.4
*/
#define U8_APPEND(s, i, length, c, isError) { \
if((uint32_t)(c)<=0x7f) { \
(s)[(i)++]=(uint8_t)(c); \
} else { \
(i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c); \
(i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, &(isError)); \
} \
}
#define UTF8_FWD_1_SAFE(s, i, length) { \
/**
 * Move the string offset forward from one code point boundary to the next:
 * skip the byte at the offset plus the number of trail bytes that
 * this byte announces.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_FWD_1
 * @draft ICU 2.4
 */
#define U8_FWD_1_UNSAFE(s, i) { \
    (i)+=U8_COUNT_TRAIL_BYTES((s)[i])+1; \
}
/**
* Advance the string offset from one code point boundary to the next.
* (Post-incrementing iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param i string offset, i<length
* @param length string length
* @see U8_FWD_1_UNSAFE
* @draft ICU 2.4
*/
#define U8_FWD_1(s, i, length) { \
uint8_t __b=(s)[(i)++]; \
if(UTF8_IS_LEAD(__b)) { \
uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \
if(U8_IS_LEAD(__b)) { \
uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
if((i)+__count>(length)) { \
__count=(uint8_t)((length)-(i)); \
} \
while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \
while(__count>0 && U8_IS_TRAIL((s)[i])) { \
++(i); \
--__count; \
} \
} \
}
#define UTF8_FWD_N_SAFE(s, i, length, n) { \
/**
* Advance the string offset from one code point boundary to the n-th next one,
* i.e., move forward by n code points.
* (Post-incrementing iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_FWD_N
* @draft ICU 2.4
*/
#define U8_FWD_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
while(__N>0 && (i)<(length)) { \
UTF8_FWD_1_SAFE(s, i, length); \
while(__N>0) { \
U8_FWD_1_UNSAFE(s, i); \
--__N; \
} \
}
#define UTF8_SET_CHAR_START_SAFE(s, start, i) { \
if(UTF8_IS_TRAIL((s)[(i)])) { \
/**
 * Move the string offset forward by n code points, i.e., from one
 * code point boundary to the n-th next one.
 * Stops early when the offset reaches the string length.
 * (Post-incrementing iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param i string offset, i<length
 * @param length string length
 * @param n number of code points to skip
 * @see U8_FWD_N_UNSAFE
 * @draft ICU 2.4
 */
#define U8_FWD_N(s, i, length, n) { \
    int32_t __todo; \
    for(__todo=(n); __todo>0 && (i)<(length); --__todo) { \
        U8_FWD_1(s, i, length); \
    } \
}
/**
 * Snap a random-access offset back to the start of the code point
 * that it points into.
 * While the byte at the offset is a UTF-8 trail byte, the offset is
 * decremented; an offset that is not on a trail byte is left alone.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_START
 * @draft ICU 2.4
 */
#define U8_SET_CP_START_UNSAFE(s, i) { \
    for(; U8_IS_TRAIL((s)[i]); --(i)) {} \
}
/**
 * Snap a random-access offset back to the start of the code point
 * that it points into.
 * If the byte at the offset is a UTF-8 trail byte, then the offset is
 * replaced by the result of utf8_back1SafeBody(), which moves it backward
 * to the corresponding lead byte. Otherwise, it is not modified.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @see U8_SET_CP_START_UNSAFE
 * @draft ICU 2.4
 */
#define U8_SET_CP_START(s, start, i) { \
    if(!U8_IS_TRAIL((s)[(i)])) { \
        /* not on a trail byte: nothing to adjust */ \
    } else { \
        (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}
/* definitions with backward iteration -------------------------------------- */
#define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Unsafe" macro, assumes well-formed UTF-8.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* The result is undefined if the offset is behind an illegal UTF-8 sequence.
*
* @param s const uint8_t * string
* @param i string offset
* @param c output UChar32 variable
* @see U8_PREV
* @draft ICU 2.4
*/
#define U8_PREV_UNSAFE(s, i, c) { \
(c)=(s)[--(i)]; \
if(UTF8_IS_TRAIL(c)) { \
if(U8_IS_TRAIL(c)) { \
uint8_t __b, __count=1, __shift=6; \
\
/* c is a trail byte */ \
@ -264,7 +464,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
for(;;) { \
__b=(s)[--(i)]; \
if(__b>=0xc0) { \
UTF8_MASK_LEAD_BYTE(__b, __count); \
U8_MASK_LEAD_BYTE(__b, __count); \
(c)|=(UChar32)__b<<__shift; \
break; \
} else { \
@ -276,57 +476,151 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
} \
}
#define UTF8_BACK_1_UNSAFE(s, i) { \
while(UTF8_IS_TRAIL((s)[--(i)])) {} \
}
#define UTF8_BACK_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
while(__N>0) { \
UTF8_BACK_1_UNSAFE(s, i); \
--__N; \
} \
}
#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \
UTF8_BACK_1_UNSAFE(s, i); \
UTF8_FWD_1_UNSAFE(s, i); \
}
#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \
/**
* Move the string offset from one code point boundary to the previous one
* and get the code point between them.
* (Pre-decrementing backward iteration.)
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* The input offset may be the same as the string length.
* If the offset is behind a multi-byte sequence, then the macro will read
* the whole sequence.
* If the offset is behind a lead byte, then that itself
* will be returned as the code point.
* If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
*
* @param s const uint8_t * string
* @param start starting string offset (usually 0)
* @param i string offset, start<=i
* @param c output UChar32 variable, set to <0 in case of an error
* @see U8_PREV_UNSAFE
* @draft ICU 2.4
*/
#define U8_PREV(s, start, i, c) { \
(c)=(s)[--(i)]; \
if((c)>=0x80) { \
if((c)<=0xbf) { \
(c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
(c)=utf8_prevCharSafeBody(s, start, &(i), c, -1); \
} else { \
(c)=UTF8_ERROR_VALUE_1; \
(c)=U_SENTINEL; \
} \
} \
}
#define UTF8_BACK_1_SAFE(s, start, i) { \
if(UTF8_IS_TRAIL((s)[--(i)])) { \
/**
 * Move the string offset backward from one code point boundary to the
 * previous one: step back at least one byte, and keep stepping back
 * while the byte at the offset is a trail byte.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_BACK_1
 * @draft ICU 2.4
 */
#define U8_BACK_1_UNSAFE(s, i) { \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[i])); \
}
/**
 * Move the string offset backward from one code point boundary to the
 * previous one.
 * The offset is first decremented by one byte; if it then points to a
 * trail byte, utf8_back1SafeBody() determines the final offset.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * @param s const uint8_t * string
 * @param start starting string offset (usually 0)
 * @param i string offset, start<=i
 * @see U8_BACK_1_UNSAFE
 * @draft ICU 2.4
 */
#define U8_BACK_1(s, start, i) { \
    --(i); \
    if(U8_IS_TRAIL((s)[i])) { \
        (i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
    } \
}
#define UTF8_BACK_N_SAFE(s, start, i, n) { \
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Unsafe" macro, assumes well-formed UTF-8.
*
* @param s const uint8_t * string
* @param i string offset
* @param n number of code points to skip
* @see U8_BACK_N
* @draft ICU 2.4
*/
#define U8_BACK_N_UNSAFE(s, i, n) { \
int32_t __N=(n); \
while(__N>0 && (i)>(start)) { \
UTF8_BACK_1_SAFE(s, start, i); \
while(__N>0) { \
U8_BACK_1_UNSAFE(s, i); \
--__N; \
} \
}
/*
* Need to use UTF8_FWD_1_SAFE() because UTF8_BACK_1_SAFE()
* may have started from the middle of the sequence and not checked
* all trail bytes.
/**
* Move the string offset from one code point boundary to the n-th one before it,
* i.e., move backward by n code points.
* (Pre-decrementing backward iteration.)
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start starting string offset (usually 0)
* @param i string offset, start<=i
* @param n number of code points to skip
* @see U8_BACK_N_UNSAFE
* @draft ICU 2.4
*/
#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
#define U8_BACK_N(s, start, i, n) { \
    int32_t __todo; \
    /* step back one code point at a time; stop early at the string start */ \
    for(__todo=(n); __todo>0 && (i)>(start); --__todo) { \
        U8_BACK_1(s, start, i); \
    } \
}
/**
 * Snap a random-access offset forward to the boundary behind a code point.
 * If the offset is behind a partial multi-byte sequence,
 * then the offset is incremented to behind the whole sequence.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-8.
 *
 * Works by stepping back to the start of the preceding code point
 * and then forward over that whole code point.
 *
 * @param s const uint8_t * string
 * @param i string offset
 * @see U8_SET_CP_LIMIT
 * @draft ICU 2.4
 */
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
    /* back to the lead (or single) byte of the preceding code point */ \
    do { \
        --(i); \
    } while(U8_IS_TRAIL((s)[i])); \
    /* forward over that byte and all of its trail bytes */ \
    (i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \
}
/**
* Adjust a random-access offset to a code point boundary after a code point.
* If the offset is behind a partial multi-byte sequence,
* then the offset is incremented to behind the whole sequence.
* Otherwise, it is not modified.
* The input offset may be the same as the string length.
* "Safe" macro, checks for illegal sequences and for string boundaries.
*
* @param s const uint8_t * string
* @param start starting string offset (usually 0)
* @param i string offset, start<=i<=length
* @param length string length
* @see U8_SET_CP_LIMIT_UNSAFE
* @draft ICU 2.4
*/
#define U8_SET_CP_LIMIT(s, start, i, length) { \
if((start)<(i) && (i)<(length)) { \
UTF8_BACK_1_SAFE(s, start, i); \
UTF8_FWD_1_SAFE(s, i, length); \
U8_BACK_1(s, start, i); \
U8_FWD_1(s, i, length); \
} \
}

File diff suppressed because it is too large Load diff

View file

@ -198,7 +198,8 @@ UnicodeString::UnicodeString(UChar32 ch)
fFlags(kShortString)
{
int32_t i = 0;
UTF_APPEND_CHAR(fStackBuffer, i, US_STACKBUF_SIZE, ch);
UBool isError = FALSE;
U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
fLength = i;
}

View file

@ -157,140 +157,118 @@ u_strchr32(const UChar *s, UChar32 c) {
}
}
/*
 * Walk a NUL-terminated string code point by code point and test each
 * code point for membership in the NUL-terminated matchSet.
 *
 * polarity!=0: stop at the first string code point that IS in matchSet.
 * polarity==0: stop at the first string code point that is NOT in matchSet.
 *
 * Returns the code unit index at which that code point starts,
 * or -(string length)-1 if the end of the string is reached first.
 */
static int32_t
_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
    int32_t bmpPrefixLen, setLen, pos, setPos, cpStart;
    UChar32 cp, setCp;
    UChar unit, unit2;
    UBool found;

    /* leading part of matchSet that consists only of single-unit code points */
    for(bmpPrefixLen = 0;
        (unit = matchSet[bmpPrefixLen]) != 0 && U16_IS_SINGLE(unit);
        ++bmpPrefixLen
    ) {}

    /* the remainder may mix BMP and supplementary code points; get the total length */
    for(setLen = bmpPrefixLen; matchSet[setLen] != 0; ++setLen) {}

    pos = 0;
    while((unit = string[pos]) != 0) {
        ++pos;
        found = FALSE;

        if(U16_IS_SINGLE(unit)) {
            /* a non-surrogate unit can be compared unit-by-unit with the whole set */
            cpStart = pos - 1;
            for(setPos = 0; setPos < setLen; ++setPos) {
                if(unit == matchSet[setPos]) {
                    found = TRUE;
                    break;
                }
            }
        } else {
            /*
             * No need to check for string length before U16_IS_TRAIL
             * because unit2 could at worst be the terminating NUL.
             */
            if(U16_IS_SURROGATE_LEAD(unit) && U16_IS_TRAIL(unit2 = string[pos])) {
                ++pos;
                cp = U16_GET_SUPPLEMENTARY(unit, unit2);
            } else {
                cp = unit; /* unpaired surrogate */
            }
            cpStart = pos - U16_LENGTH(cp);

            /* only the part of the set behind the single-unit prefix can match */
            for(setPos = bmpPrefixLen; setPos < setLen;) {
                U16_NEXT(matchSet, setPos, setLen, setCp);
                if(cp == setCp) {
                    found = TRUE;
                    break;
                }
            }
        }

        /* stop on a member if polarity is set, on a non-member otherwise */
        if(polarity ? found : !found) {
            return cpStart;
        }
    }

    /* Didn't find it. */
    return -pos - 1;
}
/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
U_CAPI UChar * U_EXPORT2
u_strpbrk(const UChar *string, const UChar *matchSet)
{
int32_t matchLen;
UBool single = TRUE;
for (matchLen = 0; matchSet[matchLen]; matchLen++)
{
if (!UTF_IS_SINGLE(matchSet[matchLen]))
{
single = FALSE;
}
int32_t index = _matchFromSet(string, matchSet, TRUE);
if(index >= 0) {
return (UChar *)string + index;
} else {
return NULL;
}
if (single)
{
const UChar *matchItr;
const UChar *strItr;
for (strItr = string; *strItr; strItr++)
{
for (matchItr = matchSet; *matchItr; matchItr++)
{
if (*matchItr == *strItr)
{
return (UChar *)strItr;
}
}
}
}
else
{
int32_t matchItr;
int32_t strItr;
UChar32 stringCh, matchSetCh;
int32_t stringLen = u_strlen(string);
for (strItr = 0; strItr < stringLen; strItr++)
{
UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
for (matchItr = 0; matchItr < matchLen; matchItr++)
{
UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
|| string[strItr] == UTF_ERROR_VALUE
|| (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
{
return (UChar *)string + strItr;
}
}
}
}
/* Didn't find it. */
return NULL;
}
/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
U_CAPI int32_t U_EXPORT2
u_strcspn(const UChar *string, const UChar *matchSet)
{
const UChar *foundStr = u_strpbrk(string, matchSet);
if (foundStr == NULL)
{
return u_strlen(string);
int32_t index = _matchFromSet(string, matchSet, TRUE);
if(index >= 0) {
return index;
} else {
return -index - 1; /* == u_strlen(string) */
}
return foundStr - string;
}
/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
U_CAPI int32_t U_EXPORT2
u_strspn(const UChar *string, const UChar *matchSet)
{
UBool single = TRUE;
UBool match = TRUE;
int32_t matchLen;
int32_t retValue;
for (matchLen = 0; matchSet[matchLen]; matchLen++)
{
if (!UTF_IS_SINGLE(matchSet[matchLen]))
{
single = FALSE;
}
int32_t index = _matchFromSet(string, matchSet, FALSE);
if(index >= 0) {
return index;
} else {
return -index - 1; /* == u_strlen(string) */
}
if (single)
{
const UChar *matchItr;
const UChar *strItr;
for (strItr = string; *strItr && match; strItr++)
{
match = FALSE;
for (matchItr = matchSet; *matchItr; matchItr++)
{
if (*matchItr == *strItr)
{
match = TRUE;
break;
}
}
}
retValue = strItr - string - (match == FALSE);
}
else
{
int32_t matchItr;
int32_t strItr;
UChar32 stringCh, matchSetCh;
int32_t stringLen = u_strlen(string);
for (strItr = 0; strItr < stringLen && match; strItr++)
{
match = FALSE;
UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
for (matchItr = 0; matchItr < matchLen; matchItr++)
{
UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
|| string[strItr] == UTF_ERROR_VALUE
|| (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
{
match = TRUE;
break;
}
}
}
retValue = strItr - (match == FALSE);
}
/* Found a mismatch or didn't find it. */
return retValue;
}
/* ----- Text manipulation functions --- */

View file

@ -228,7 +228,6 @@ u_strFromUTF8(UChar *dest,
int32_t index = 0;
int32_t reqLength = 0;
uint8_t* pSrc = (uint8_t*) src;
UBool isError;
/* args check */
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
@ -249,8 +248,8 @@ u_strFromUTF8(UChar *dest,
if(ch <=0x7f){
*pDest++=(UChar)ch;
}else{
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError);
if(isError){
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
if(ch<0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}else if(ch<=0xFFFF){
@ -272,8 +271,8 @@ u_strFromUTF8(UChar *dest,
if(ch <= 0x7f){
reqLength++;
}else{
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError);
if(isError){
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
if(ch<0){
*pErrorCode = U_INVALID_CHAR_FOUND;
return NULL;
}

View file

@ -83,7 +83,7 @@ utf8_errorValue[6]={
};
U_CAPI UChar32 U_EXPORT2
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError) {
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
int32_t i=*pi;
uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
if((i)+count<=(length)) {
@ -118,10 +118,11 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
illegal|=(trail&0xc0)^0x80;
break;
case 0:
if(pIsError!=NULL) {
*pIsError=TRUE;
if(strict>=0) {
return UTF8_ERROR_VALUE_1;
} else {
return U_SENTINEL;
}
return UTF8_ERROR_VALUE_1;
/* no default branch to optimize switch() - all values are covered */
}
@ -132,6 +133,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
* Starting with Unicode 3.0.1, non-shortest forms are illegal.
* Starting with Unicode 3.2, surrogate code points must not be
* encoded in UTF-8, and there are no irregular sequences any more.
*
* U8_ macros (new in ICU 2.4) return negative values for error conditions.
*/
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
@ -145,21 +148,14 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
++(i);
--count;
}
c=utf8_errorValue[errorCount-count];
if(pIsError!=NULL) {
*pIsError=TRUE;
if(strict>=0) {
c=utf8_errorValue[errorCount-count];
} else {
c=U_SENTINEL;
}
} else if((strict) && UTF_IS_UNICODE_NONCHAR(c)) {
} else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {
/* strict: forbid non-characters like U+fffe */
c=utf8_errorValue[count];
if(pIsError!=NULL) {
*pIsError=TRUE;
}
} else {
/* good result */
if(pIsError!=NULL) {
*pIsError=FALSE;
}
}
} else /* too few bytes left */ {
/* error handling */
@ -168,9 +164,10 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
++(i);
}
c=utf8_errorValue[i-i0];
if(pIsError!=NULL) {
*pIsError=TRUE;
if(strict>=0) {
c=utf8_errorValue[i-i0];
} else {
c=U_SENTINEL;
}
}
*pi=i;
@ -178,8 +175,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
}
U_CAPI int32_t U_EXPORT2
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
if((c)<=0x7ff) {
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
if((uint32_t)(c)<=0x7ff) {
if((i)+1<(length)) {
(s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
@ -187,7 +184,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
}
} else if((uint32_t)(c)<=0xffff) {
/* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
if((i)+2<(length) && !UTF_IS_SURROGATE(c)) {
if((i)+2<(length) && !U_IS_SURROGATE(c)) {
(s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
@ -203,18 +200,22 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
}
}
/* c>0x10ffff or not enough space, write an error value */
length-=i;
if(length>0) {
int32_t offset;
if(length>3) {
length=3;
if(pIsError!=NULL) {
*pIsError=TRUE;
} else {
length-=i;
if(length>0) {
int32_t offset;
if(length>3) {
length=3;
}
s+=i;
offset=0;
c=utf8_errorValue[length-1];
UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
i=i+offset;
}
s+=i;
offset=0;
c=utf8_errorValue[length-1];
UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
i=i+offset;
}
}
return i;
}
@ -229,7 +230,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
for(;;) {
if(i<=start) {
/* no lead byte at all */
c=UTF8_ERROR_VALUE_1;
if(strict>=0) {
return UTF8_ERROR_VALUE_1;
} else {
return U_SENTINEL;
}
break;
}
@ -250,7 +255,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
if(count>=4) {
count=3;
}
c=utf8_errorValue[count];
if(strict>=0) {
c=utf8_errorValue[count];
} else {
c=U_SENTINEL;
}
} else {
/* exit with correct c */
}
@ -260,9 +269,17 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
include the trail byte that we started with */
if(count<shouldCount) {
*pi=i;
c=utf8_errorValue[count];
if(strict>=0) {
c=utf8_errorValue[count];
} else {
c=U_SENTINEL;
}
} else {
c=UTF8_ERROR_VALUE_1;
if(strict>=0) {
c=UTF8_ERROR_VALUE_1;
} else {
c=U_SENTINEL;
}
}
}
break;
@ -273,12 +290,20 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
shift+=6;
} else {
/* more than 5 trail bytes is illegal */
c=UTF8_ERROR_VALUE_1;
if(strict>=0) {
c=UTF8_ERROR_VALUE_1;
} else {
c=U_SENTINEL;
}
break;
}
} else {
/* single-byte character precedes trailing bytes */
c=UTF8_ERROR_VALUE_1;
if(strict>=0) {
c=UTF8_ERROR_VALUE_1;
} else {
c=U_SENTINEL;
}
break;
}
}

View file

@ -942,39 +942,39 @@ static void TestCodePoint(){
UChar32 c=codePoint[i];
log_verbose("Testing code unit value of \\u%4X\n", c);
if(i<6){
if(!UTF_IS_SURROGATE(c)){
if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
if(UTF_IS_UNICODE_CHAR(c)){
if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(UTF_IS_ERROR(c)){
log_err("ERROR: isError() failed for \\u%4X\n", c);
}
}else if(i >=6 && i<18){
if(UTF_IS_SURROGATE(c)){
if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(!UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
if(!UTF_IS_UNICODE_CHAR(c)){
if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(UTF_IS_ERROR(c)){
log_err("ERROR: isError() failed for \\u%4X\n", c);
}
}else if(i >=18 && i<20){
if(UTF_IS_SURROGATE(c)){
if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
if(!UTF_IS_UNICODE_CHAR(c)){
if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(!UTF_IS_ERROR(c)){
@ -982,13 +982,13 @@ static void TestCodePoint(){
}
}
else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
if(UTF_IS_SURROGATE(c)){
if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
}
if(UTF_IS_VALID(c)){
log_err("ERROR: isValid() failed for \\u%4X\n", c);
}
if(UTF_IS_UNICODE_CHAR(c)){
if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
}
if(!UTF_IS_ERROR(c)){
@ -1018,7 +1018,7 @@ static void TestCharLength()
UBool multiple;
for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
UChar32 c=codepoint[i+1];
if(UTF_CHAR_LENGTH(c) != codepoint[i]){
if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
log_err("The no: of code units for \\u%4X:- Expected: %d Got: %d", c, codepoint[i], UTF_CHAR_LENGTH(c));
}else{
log_verbose("The no: of code units for \\u%4X is %d", c, UTF_CHAR_LENGTH(c));
@ -1457,7 +1457,6 @@ static void TestStringFunctions()
static void TestStringSearching()
{
UChar ucharBuf[255];
const UChar testString[] = {0x0061, 0x0062, 0x0063, 0x0064, 0x0064, 0x0061, 0};
const UChar testSurrogateString[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0x0063, 0x0064, 0x0064, 0xdbff, 0xdfff, 0xdb00, 0xdf00, 0x0061, 0};
const UChar surrMatchSet1[] = {0xdbff, 0xdfff, 0};
@ -1467,55 +1466,67 @@ static void TestStringSearching()
const UChar surrMatchSetBad[] = {0xdbff, 0x0061, 0};
const UChar surrMatchSetBad2[] = {0x0061, 0xdbff, 0};
const UChar surrMatchSetBad3[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0}; /* has partial surrogate */
const UChar
empty[] = { 0 },
a[] = { 0x61, 0 },
ab[] = { 0x61, 0x62, 0 },
ba[] = { 0x62, 0x61, 0 },
abcd[] = { 0x61, 0x62, 0x63, 0x64, 0 },
cd[] = { 0x63, 0x64, 0 },
dc[] = { 0x64, 0x63, 0 },
cdh[] = { 0x63, 0x64, 0x68, 0 },
f[] = { 0x66, 0 },
fg[] = { 0x66, 0x67, 0 },
gf[] = { 0x67, 0x66, 0 };
log_verbose("Testing u_strpbrk()");
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "a")) != &testString[0]) {
if (u_strpbrk(testString, a) != &testString[0]) {
log_err("u_strpbrk couldn't find first letter a.\n");
}
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "dc")) != &testString[2]) {
if (u_strpbrk(testString, dc) != &testString[2]) {
log_err("u_strpbrk couldn't find d or c.\n");
}
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cd")) != &testString[2]) {
if (u_strpbrk(testString, cd) != &testString[2]) {
log_err("u_strpbrk couldn't find c or d.\n");
}
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cdh")) != &testString[2]) {
if (u_strpbrk(testString, cdh) != &testString[2]) {
log_err("u_strpbrk couldn't find c, d or h.\n");
}
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "f")) != NULL) {
if (u_strpbrk(testString, f) != NULL) {
log_err("u_strpbrk didn't return NULL for \"f\".\n");
}
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "fg")) != NULL) {
if (u_strpbrk(testString, fg) != NULL) {
log_err("u_strpbrk didn't return NULL for \"fg\".\n");
}
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "gf")) != NULL) {
if (u_strpbrk(testString, gf) != NULL) {
log_err("u_strpbrk didn't return NULL for \"gf\".\n");
}
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "")) != NULL) {
if (u_strpbrk(testString, empty) != NULL) {
log_err("u_strpbrk didn't return NULL for \"\".\n");
}
log_verbose("Testing u_strpbrk() with surrogates");
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != &testSurrogateString[1]) {
if (u_strpbrk(testSurrogateString, a) != &testSurrogateString[1]) {
log_err("u_strpbrk couldn't find first letter a.\n");
}
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != &testSurrogateString[5]) {
if (u_strpbrk(testSurrogateString, dc) != &testSurrogateString[5]) {
log_err("u_strpbrk couldn't find d or c.\n");
}
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != &testSurrogateString[5]) {
if (u_strpbrk(testSurrogateString, cd) != &testSurrogateString[5]) {
log_err("u_strpbrk couldn't find c or d.\n");
}
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != &testSurrogateString[5]) {
if (u_strpbrk(testSurrogateString, cdh) != &testSurrogateString[5]) {
log_err("u_strpbrk couldn't find c, d or h.\n");
}
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != NULL) {
if (u_strpbrk(testSurrogateString, f) != NULL) {
log_err("u_strpbrk didn't return NULL for \"f\".\n");
}
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != NULL) {
if (u_strpbrk(testSurrogateString, fg) != NULL) {
log_err("u_strpbrk didn't return NULL for \"fg\".\n");
}
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != NULL) {
if (u_strpbrk(testSurrogateString, gf) != NULL) {
log_err("u_strpbrk didn't return NULL for \"gf\".\n");
}
if (u_strpbrk(testSurrogateString, surrMatchSet1) != &testSurrogateString[3]) {
@ -1536,49 +1547,49 @@ static void TestStringSearching()
log_verbose("Testing u_strcspn()");
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "a")) != 0) {
if (u_strcspn(testString, a) != 0) {
log_err("u_strcspn couldn't find first letter a.\n");
}
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "dc")) != 2) {
if (u_strcspn(testString, dc) != 2) {
log_err("u_strcspn couldn't find d or c.\n");
}
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cd")) != 2) {
if (u_strcspn(testString, cd) != 2) {
log_err("u_strcspn couldn't find c or d.\n");
}
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cdh")) != 2) {
if (u_strcspn(testString, cdh) != 2) {
log_err("u_strcspn couldn't find c, d or h.\n");
}
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testString)) {
if (u_strcspn(testString, f) != u_strlen(testString)) {
log_err("u_strcspn didn't return NULL for \"f\".\n");
}
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testString)) {
if (u_strcspn(testString, fg) != u_strlen(testString)) {
log_err("u_strcspn didn't return NULL for \"fg\".\n");
}
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testString)) {
if (u_strcspn(testString, gf) != u_strlen(testString)) {
log_err("u_strcspn didn't return NULL for \"gf\".\n");
}
log_verbose("Testing u_strcspn() with surrogates");
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != 1) {
if (u_strcspn(testSurrogateString, a) != 1) {
log_err("u_strcspn couldn't find first letter a.\n");
}
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 5) {
if (u_strcspn(testSurrogateString, dc) != 5) {
log_err("u_strcspn couldn't find d or c.\n");
}
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 5) {
if (u_strcspn(testSurrogateString, cd) != 5) {
log_err("u_strcspn couldn't find c or d.\n");
}
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != 5) {
if (u_strcspn(testSurrogateString, cdh) != 5) {
log_err("u_strcspn couldn't find c, d or h.\n");
}
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testSurrogateString)) {
if (u_strcspn(testSurrogateString, f) != u_strlen(testSurrogateString)) {
log_err("u_strcspn didn't return NULL for \"f\".\n");
}
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testSurrogateString)) {
if (u_strcspn(testSurrogateString, fg) != u_strlen(testSurrogateString)) {
log_err("u_strcspn didn't return NULL for \"fg\".\n");
}
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testSurrogateString)) {
if (u_strcspn(testSurrogateString, gf) != u_strlen(testSurrogateString)) {
log_err("u_strcspn didn't return NULL for \"gf\".\n");
}
if (u_strcspn(testSurrogateString, surrMatchSet1) != 3) {
@ -1597,25 +1608,25 @@ static void TestStringSearching()
log_verbose("Testing u_strspn()");
if (u_strspn(testString, u_uastrcpy(ucharBuf, "a")) != 1) {
if (u_strspn(testString, a) != 1) {
log_err("u_strspn couldn't skip first letter a.\n");
}
if (u_strspn(testString, u_uastrcpy(ucharBuf, "ab")) != 2) {
if (u_strspn(testString, ab) != 2) {
log_err("u_strspn couldn't skip a or b.\n");
}
if (u_strspn(testString, u_uastrcpy(ucharBuf, "ba")) != 2) {
if (u_strspn(testString, ba) != 2) {
log_err("u_strspn couldn't skip a or b.\n");
}
if (u_strspn(testString, u_uastrcpy(ucharBuf, "f")) != 0) {
if (u_strspn(testString, f) != 0) {
log_err("u_strspn didn't return 0 for \"f\".\n");
}
if (u_strspn(testString, u_uastrcpy(ucharBuf, "dc")) != 0) {
if (u_strspn(testString, dc) != 0) {
log_err("u_strspn couldn't find first letter a (skip d or c).\n");
}
if (u_strspn(testString, u_uastrcpy(ucharBuf, "abcd")) != u_strlen(testString)) {
if (u_strspn(testString, abcd) != u_strlen(testString)) {
log_err("u_strspn couldn't skip over the whole string.\n");
}
if (u_strspn(testString, u_uastrcpy(ucharBuf, "")) != 0) {
if (u_strspn(testString, empty) != 0) {
log_err("u_strspn should have returned 0 for empty string.\n");
}
@ -1626,13 +1637,13 @@ static void TestStringSearching()
if (u_strspn(testSurrogateString, surrMatchSetBad2) != 2) {
log_err("u_strspn couldn't skip 0xdbff or a.\n");
}
if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != 0) {
if (u_strspn(testSurrogateString, f) != 0) {
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
}
if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 0) {
if (u_strspn(testSurrogateString, dc) != 0) {
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
}
if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 0) {
if (u_strspn(testSurrogateString, cd) != 0) {
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
}
if (u_strspn(testSurrogateString, testSurrogateString) != u_strlen(testSurrogateString)) {

View file

@ -21,6 +21,7 @@
#include "cintltst.h"
#include <stdio.h>
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
static void printUChars(const UChar *uchars);
@ -31,6 +32,7 @@ static void TestNextPrevChar(void);
static void TestFwdBack(void);
static void TestSetChar(void);
static void TestAppendChar(void);
static void TestAppend(void);
static void TestSurrogate(void);
void addUTF16Test(TestNode** root);
@ -45,6 +47,7 @@ addUTF16Test(TestNode** root)
addTest(root, &TestFwdBack, "utf16tst/TestFwdBack" );
addTest(root, &TestSetChar, "utf16tst/TestSetChar" );
addTest(root, &TestAppendChar, "utf16tst/TestAppendChar" );
addTest(root, &TestAppend, "utf16tst/TestAppend" );
addTest(root, &TestSurrogate, "utf16tst/TestSurrogate" );
}
@ -57,17 +60,17 @@ static void TestCodeUnitValues()
UChar c=codeunit[i];
log_verbose("Testing code unit value of %x\n", c);
if(i<4){
if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c)){
if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c) || !U16_IS_SINGLE(c) || U16_IS_LEAD(c) || U16_IS_TRAIL(c)){
log_err("ERROR: %x is a single character\n", c);
}
}
if(i >= 4 && i< 8){
if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c)){
if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c) || !U16_IS_LEAD(c) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c)){
log_err("ERROR: %x is a first surrogate\n", c);
}
}
if(i >= 8 && i< 12){
if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c)){
if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || !U16_IS_TRAIL(c) || U16_IS_SINGLE(c) || U16_IS_LEAD(c)){
log_err("ERROR: %x is a second surrogate\n", c);
}
}
@ -93,7 +96,7 @@ static void TestCharLength()
UBool multiple;
for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
UChar32 c=codepoint[i+1];
if(UTF16_CHAR_LENGTH(c) != (uint16_t)codepoint[i]){
if(UTF16_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U16_LENGTH(c) != (uint16_t)codepoint[i]){
log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF16_CHAR_LENGTH(c));
}else{
log_verbose("The no: of code units for %lx is %d\n",c, UTF16_CHAR_LENGTH(c) );
@ -150,11 +153,23 @@ static void TestGetChar()
if(c != result[i]){
log_err("ERROR: UTF16_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
U16_GET_UNSAFE(input, offset, c);
if(c != result[i]){
/* report the macro that was actually exercised */
log_err("ERROR: U16_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
}
UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE);
if(c != result[i+1]){
log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
U16_GET(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c);
if(c != result[i+1]){
log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE);
if(c != result[i+2]){
log_err("ERROR: UTF16_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
@ -213,6 +228,16 @@ static void TestNextPrevChar(){
log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
setOffset=offset;
U16_NEXT_UNSAFE(input, setOffset, c);
/* messages name the macro that was actually exercised */
if(setOffset != movedOffset[i]){
log_err("ERROR: U16_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i], setOffset);
}
if(c != result[i]){
log_err("ERROR: U16_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
setOffset=offset;
UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE);
if(setOffset != movedOffset[i+1]){
@ -223,6 +248,16 @@ static void TestNextPrevChar(){
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
setOffset=offset;
U16_NEXT(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c);
if(setOffset != movedOffset[i+1]){
log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
}
if(c != result[i+1]){
log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
setOffset=offset;
UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE);
if(setOffset != movedOffset[i+1]){
@ -247,6 +282,16 @@ static void TestNextPrevChar(){
log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
}
setOffset=offset;
U16_PREV_UNSAFE(input, setOffset, c);
/* messages name the macro that was actually exercised */
if(setOffset != movedOffset[i+3]){
log_err("ERROR: U16_PREV_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+3], setOffset);
}
if(c != result[i+3]){
log_err("ERROR: U16_PREV_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
}
setOffset=offset;
UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
if(setOffset != movedOffset[i+4]){
@ -257,6 +302,16 @@ static void TestNextPrevChar(){
log_err("ERROR: UTF16_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
}
setOffset=offset;
U16_PREV(input, 0, setOffset, c);
if(setOffset != movedOffset[i+4]){
log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+4], setOffset);
}
if(c != result[i+4]){
log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
}
setOffset=offset;
UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
if(setOffset != movedOffset[i+5]){
@ -285,7 +340,6 @@ static void TestFwdBack(){
static uint16_t back_N_unsafe[]={12, 11, 8, 5, 3};
static uint16_t back_N_safe[] ={12, 11, 8, 5, 3, 0, 0};
uint16_t offunsafe=0, offsafe=0;
uint16_t i=0;
while(offunsafe < sizeof(input)/U_SIZEOF_UCHAR){
@ -295,6 +349,17 @@ static void TestFwdBack(){
}
i++;
}
offunsafe=0, offsafe=0;
i=0;
while(offunsafe < sizeof(input)/U_SIZEOF_UCHAR){
U16_FWD_1_UNSAFE(input, offunsafe);
if(offunsafe != fwd_unsafe[i]){
log_err("ERROR: U16_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
}
i++;
}
i=0;
while(offsafe < sizeof(input)/U_SIZEOF_UCHAR){
UTF16_FWD_1_SAFE(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR);
@ -303,6 +368,16 @@ static void TestFwdBack(){
}
i++;
}
/*
 * Restart at offset 0: the UTF16_FWD_1_SAFE loop above only exits once
 * offsafe has reached the end of input, so without this reset the
 * U16_FWD_1 loop below would never execute.
 */
offsafe=0;
i=0;
while(offsafe < sizeof(input)/U_SIZEOF_UCHAR){
U16_FWD_1(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR);
if(offsafe != fwd_safe[i]){
log_err("ERROR: U16_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
}
i++;
}
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
i=0;
@ -313,6 +388,18 @@ static void TestFwdBack(){
}
i++;
}
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
i=0;
while(offunsafe > 0){
U16_BACK_1_UNSAFE(input, offunsafe);
if(offunsafe != back_unsafe[i]){
log_err("ERROR: U16_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
}
i++;
}
i=0;
while(offsafe > 0){
UTF16_BACK_1_SAFE(input,0, offsafe);
@ -321,6 +408,16 @@ static void TestFwdBack(){
}
i++;
}
i=0;
while(offsafe > 0){
U16_BACK_1(input,0, offsafe);
if(offsafe != back_safe[i]){
log_err("ERROR: U16_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
}
i++;
}
offunsafe=0;
offsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ /*didn't want it to fail(we assume 0<i<length)*/
@ -329,6 +426,16 @@ static void TestFwdBack(){
log_err("ERROR: Forward_N_unsafe offset expected:%d, Got:%d\n", fwd_N_unsafe[i], offunsafe);
}
}
offunsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ /*didn't want it to fail(we assume 0<i<length)*/
U16_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
if(offunsafe != fwd_N_unsafe[i]){
log_err("ERROR: U16_FWD_N_UNSAFE offset expected:%d, Got:%d\n", fwd_N_unsafe[i], offunsafe);
}
}
offsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
UTF16_FWD_N_SAFE(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR, Nvalue[i]);
if(offsafe != fwd_N_safe[i]){
@ -336,20 +443,47 @@ static void TestFwdBack(){
}
}
offsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
U16_FWD_N(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR, Nvalue[i]);
if(offsafe != fwd_N_safe[i]){
log_err("ERROR: U16_FWD_N offset expected:%d, Got:%d\n", fwd_N_safe[i], offsafe);
}
}
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
UTF16_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
if(offunsafe != back_N_unsafe[i]){
log_err("ERROR: backward_N_unsafe offset expected:%d, Got:%d\n", back_N_unsafe[i], offunsafe);
}
}
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
U16_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
if(offunsafe != back_N_unsafe[i]){
log_err("ERROR: U16_BACK_N_UNSAFE offset expected:%d, Got:%d\n", back_N_unsafe[i], offunsafe);
}
}
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
UTF16_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
if(offsafe != back_N_safe[i]){
log_err("ERROR: backward_N_safe offset expected:%d, Got:%d\n", back_N_safe[i], offsafe);
}
}
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
U16_BACK_N(input, 0, offsafe, Nvalue[i]);
if(offsafe != back_N_safe[i]){
log_err("ERROR: U16_BACK_N offset expected:%d, Got:%d\n", back_N_safe[i], offsafe);
}
}
}
static void TestSetChar(){
@ -367,23 +501,45 @@ static void TestSetChar(){
if(setOffset != start_unsafe[i]){
log_err("ERROR: UTF16_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_unsafe[i], setOffset);
}
setOffset=offset;
U16_SET_CP_START_UNSAFE(input, setOffset);
if(setOffset != start_unsafe[i]){
/* report the macro that was actually exercised */
log_err("ERROR: U16_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_unsafe[i], setOffset);
}
setOffset=offset;
UTF16_SET_CHAR_START_SAFE(input, 0, setOffset);
if(setOffset != start_safe[i]){
log_err("ERROR: UTF16_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_safe[i], setOffset);
}
setOffset=offset;
U16_SET_CP_START(input, 0, setOffset);
if(setOffset != start_safe[i]){
/* report the macro that was actually exercised */
log_err("ERROR: U16_SET_CP_START failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_safe[i], setOffset);
}
if (offset > 0) {
setOffset=offset;
UTF16_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
if(setOffset != limit_unsafe[i]){
log_err("ERROR: UTF16_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset);
}
setOffset=offset;
U16_SET_CP_LIMIT_UNSAFE(input, setOffset);
if(setOffset != limit_unsafe[i]){
/* report the macro that was actually exercised */
log_err("ERROR: U16_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset);
}
}
setOffset=offset;
UTF16_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR);
U16_SET_CP_LIMIT(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR);
if(setOffset != limit_safe[i]){
log_err("ERROR: UTF16_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset);
log_err("ERROR: U16_SET_CHAR_LIMIT failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset);
}
i++;
}
}
@ -487,6 +643,67 @@ static void TestAppendChar(){
}
static void TestAppend() {
static const UChar32 codePoints[]={
0x61, 0xdf, 0x901, 0x3040,
0xac00, 0xd800, 0xdbff, 0xdcde,
0xdffd, 0xe000, 0xffff, 0x10000,
0x12345, 0xe0021, 0x10ffff, 0x110000,
0x234567, 0x7fffffff, -1, -1000,
0, 0x400
};
static const UChar expectUnsafe[]={
0x61, 0xdf, 0x901, 0x3040,
0xac00, 0xd800, 0xdbff, 0xdcde,
0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00,
0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */
/* none from this line */
0, 0x400
}, expectSafe[]={
0x61, 0xdf, 0x901, 0x3040,
0xac00, 0xd800, 0xdbff, 0xdcde,
0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00,
0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */
/* none from this line */
0, 0x400
};
UChar buffer[100];
UChar32 c;
int32_t i, length;
UBool isError, expectIsError, wrongIsError;
length=0;
for(i=0; i<LENGTHOF(codePoints); ++i) {
c=codePoints[i];
if(c<0 || 0x10ffff<c) {
continue; /* skip non-code points for U16_APPEND_UNSAFE */
}
U16_APPEND_UNSAFE(buffer, length, c);
}
if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length*U_SIZEOF_UCHAR)) {
log_err("U16_APPEND_UNSAFE did not generate the expected output\n");
}
length=0;
wrongIsError=FALSE;
for(i=0; i<LENGTHOF(codePoints); ++i) {
c=codePoints[i];
expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
isError=FALSE;
U16_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
wrongIsError|= isError!=expectIsError;
}
if(wrongIsError) {
log_err("U16_APPEND did not set isError correctly\n");
}
if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length*U_SIZEOF_UCHAR)) {
log_err("U16_APPEND did not generate the expected output\n");
}
}
static void TestSurrogate(){
static UChar32 s[] = {0x10000, 0x10ffff, 0x50000, 0x100000, 0x1abcd};
int i = 0;
@ -497,11 +714,11 @@ static void TestSurrogate(){
UChar firstresult = (UChar)(((s[i] - 0x10000) / 0x400) + 0xD800);
UChar secondresult = (UChar)(((s[i] - 0x10000) % 0x400) + 0xDC00);
if (first != UTF16_LEAD(s[i]) || first != firstresult) {
if (first != UTF16_LEAD(s[i]) || first != U16_LEAD(s[i]) || first != firstresult) {
log_err("Failure in first surrogate in 0x%x expected to be 0x%x\n",
s[i], firstresult);
}
if (second != UTF16_TRAIL(s[i]) || second != secondresult) {
if (second != UTF16_TRAIL(s[i]) || second != U16_TRAIL(s[i]) || second != secondresult) {
log_err("Failure in second surrogate in 0x%x expected to be 0x%x\n",
s[i], secondresult);
}
@ -515,6 +732,3 @@ static void printUChars(const UChar *uchars){
printf("%x ", *(uchars+i));
}
}

View file

@ -18,6 +18,7 @@
#include "cmemory.h"
#include "cintltst.h"
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
static void printUChars(const uint8_t *uchars, int16_t len);
@ -28,6 +29,7 @@ static void TestNextPrevChar(void);
static void TestFwdBack(void);
static void TestSetChar(void);
static void TestAppendChar(void);
static void TestAppend(void);
void addUTF8Test(TestNode** root);
@ -41,6 +43,7 @@ addUTF8Test(TestNode** root)
addTest(root, &TestFwdBack, "utf8tst/TestFwdBack" );
addTest(root, &TestSetChar, "utf8tst/TestSetChar" );
addTest(root, &TestAppendChar, "utf8tst/TestAppendChar" );
addTest(root, &TestAppend, "utf8tst/TestAppend" );
}
static void TestCodeUnitValues()
@ -52,17 +55,17 @@ static void TestCodeUnitValues()
uint8_t c=codeunit[i];
log_verbose("Testing code unit value of %x\n", c);
if(i<4){
if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c)){
if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){
log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
}
} else if(i< 8){
if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c)){
if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){
log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
}
} else if(i< 12){
if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c)){
if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
}
@ -93,7 +96,7 @@ static void TestCharLength()
UBool multiple;
for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
UChar32 c=codepoint[i+1];
if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i]){
if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
}else{
log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c) );
@ -152,11 +155,24 @@ static void TestGetChar()
log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
U8_GET_UNSAFE(input, offset, c);
if(c != result[i]){
log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
}
U8_GET(input, 0, offset, sizeof(input), c);
/* where the table expects an error value, U8_GET only has to yield a negative c */
if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
/* report the macro that was actually exercised */
log_err("ERROR: U8_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
if(c != result[i+1]){
log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
if(c != result[i+2]){
log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
@ -225,7 +241,18 @@ static void TestNextPrevChar(){
if(c != result[i]){
log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
setOffset=offset;
U8_NEXT_UNSAFE(input, setOffset, c);
if(setOffset != movedOffset[i]){
log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i], setOffset);
}
if(c != result[i]){
log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
}
}
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
if(setOffset != movedOffset[i+1]){
@ -235,6 +262,17 @@ static void TestNextPrevChar(){
if(c != result[i+1]){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
setOffset=offset;
U8_NEXT(input, setOffset, sizeof(input), c);
if(setOffset != movedOffset[i+1]){
log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+1], setOffset);
}
if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
}
setOffset=offset;
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
if(setOffset != movedOffset[i+1]){
@ -244,8 +282,10 @@ static void TestNextPrevChar(){
if(c != result[i+2]){
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
}
i=i+6;
}
i=0;
for(offset=sizeof(input); offset > 0; --offset){
setOffset=offset;
@ -257,6 +297,7 @@ static void TestNextPrevChar(){
if(c != result[i+3]){
log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
}
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
if(setOffset != movedOffset[i+4]){
@ -266,6 +307,17 @@ static void TestNextPrevChar(){
if(c != result[i+4]){
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
}
setOffset=offset;
U8_PREV(input, 0, setOffset, c);
if(setOffset != movedOffset[i+4]){
log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
offset, movedOffset[i+4], setOffset);
}
if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
}
setOffset=offset;
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
if(setOffset != movedOffset[i+5]){
@ -275,6 +327,7 @@ static void TestNextPrevChar(){
if(c != result[i+5]){
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
}
i=i+6;
}
@ -295,6 +348,7 @@ static void TestFwdBack(){
uint32_t offunsafe=0, offsafe=0;
uint32_t i=0;
while(offunsafe < sizeof(input)){
UTF8_FWD_1_UNSAFE(input, offunsafe);
@ -303,6 +357,16 @@ static void TestFwdBack(){
}
i++;
}
/*
 * Restart at offset 0: the UTF8_FWD_1_UNSAFE loop above only exits once
 * offunsafe has reached the end of input, so without this reset the
 * U8_FWD_1_UNSAFE loop below would never execute.
 */
offunsafe=0;
i=0;
while(offunsafe < sizeof(input)){
U8_FWD_1_UNSAFE(input, offunsafe);
if(offunsafe != fwd_unsafe[i]){
log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
}
i++;
}
i=0;
while(offsafe < sizeof(input)){
UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
@ -311,6 +375,16 @@ static void TestFwdBack(){
}
i++;
}
/*
 * Restart at offset 0: the UTF8_FWD_1_SAFE loop above only exits once
 * offsafe has reached the end of input, so without this reset the
 * U8_FWD_1 loop below would never execute.
 */
offsafe=0;
i=0;
while(offsafe < sizeof(input)){
U8_FWD_1(input, offsafe, sizeof(input));
if(offsafe != fwd_safe[i]){
log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
}
i++;
}
offunsafe=sizeof(input);
i=0;
while(offunsafe > 0){
@ -320,6 +394,17 @@ static void TestFwdBack(){
}
i++;
}
offunsafe=sizeof(input);
i=0;
while(offunsafe > 0){
U8_BACK_1_UNSAFE(input, offunsafe);
if(offunsafe != back_unsafe[i]){
log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
}
i++;
}
i=0;
offsafe=sizeof(input);
while(offsafe > 0){
@ -329,14 +414,34 @@ static void TestFwdBack(){
}
i++;
}
i=0;
offsafe=sizeof(input);
while(offsafe > 0){
U8_BACK_1(input, 0, offsafe);
if(offsafe != back_safe[i]){
/* report the value that was compared against (back_safe, not back_unsafe) */
log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_safe[i], offsafe);
}
i++;
}
offunsafe=0;
offsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
if(offunsafe != fwd_N_unsafe[i]){
log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
}
}
offunsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
if(offunsafe != fwd_N_unsafe[i]){
log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
}
}
offsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
if(offsafe != fwd_N_safe[i]){
@ -344,20 +449,47 @@ static void TestFwdBack(){
}
}
offsafe=0;
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
if(offsafe != fwd_N_safe[i]){
log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
}
}
offunsafe=sizeof(input);
offsafe=sizeof(input);
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
if(offunsafe != back_N_unsafe[i]){
log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
}
}
offunsafe=sizeof(input);
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
if(offunsafe != back_N_unsafe[i]){
log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
}
}
offsafe=sizeof(input);
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
if(offsafe != back_N_safe[i]){
log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
}
}
offsafe=sizeof(input);
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
U8_BACK_N(input, 0, offsafe, Nvalue[i]);
if(offsafe != back_N_safe[i]){
log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
}
}
}
static void TestSetChar(){
@ -380,23 +512,51 @@ static void TestSetChar(){
if(setOffset != start_unsafe[i]){
log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
}
setOffset=offset;
U8_SET_CP_START_UNSAFE(input, setOffset);
if(setOffset != start_unsafe[i]){
log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
}
setOffset=offset;
UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
if(setOffset != start_safe[i]){
log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
}
setOffset=offset;
U8_SET_CP_START(input, 0, setOffset);
if(setOffset != start_safe[i]){
log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
}
if (offset != 0) { /* Can't have it go off the end of the array */
setOffset=offset;
UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
if(setOffset != limit_unsafe[i]){
log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
}
setOffset=offset;
U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
if(setOffset != limit_unsafe[i]){
log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
}
}
setOffset=offset;
UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
if(setOffset != limit_safe[i]){
log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
}
setOffset=offset;
U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
if(setOffset != limit_safe[i]){
log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
}
i++;
}
}
@ -451,7 +611,7 @@ static void TestAppendChar(){
8,
9,
/*offse-moved-to(safe)*/
/*offset-moved-to(safe)*/
4, /*for append-pos: 0, CHAR 0x10401*/
3,
4,
@ -570,6 +730,67 @@ static void TestAppendChar(){
}
static void TestAppend() {
static const UChar32 codePoints[]={
0x61, 0xdf, 0x901, 0x3040,
0xac00, 0xd800, 0xdbff, 0xdcde,
0xdffd, 0xe000, 0xffff, 0x10000,
0x12345, 0xe0021, 0x10ffff, 0x110000,
0x234567, 0x7fffffff, -1, -1000,
0, 0x400
};
static const uint8_t expectUnsafe[]={
0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
/* none from this line */
0, 0xd0, 0x80
}, expectSafe[]={
0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
0xea, 0xb0, 0x80, /* no surrogates */
/* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
/* none from this line */
0, 0xd0, 0x80
};
uint8_t buffer[100];
UChar32 c;
int32_t i, length;
UBool isError, expectIsError, wrongIsError;
length=0;
for(i=0; i<LENGTHOF(codePoints); ++i) {
c=codePoints[i];
if(c<0 || 0x10ffff<c) {
continue; /* skip non-code points for U8_APPEND_UNSAFE */
}
U8_APPEND_UNSAFE(buffer, length, c);
}
if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
}
length=0;
wrongIsError=FALSE;
for(i=0; i<LENGTHOF(codePoints); ++i) {
c=codePoints[i];
expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
isError=FALSE;
U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
wrongIsError|= isError!=expectIsError;
}
if(wrongIsError) {
log_err("U8_APPEND did not set isError correctly\n");
}
if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
log_err("U8_APPEND did not generate the expected output\n");
}
}
static void printUChars(const uint8_t *uchars, int16_t len){
int16_t i=0;
for(i=0; i<len; i++){