mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-13 00:43:32 +00:00
ICU-2150 simplify/improve UTF macros
X-SVN-Rev: 9930
This commit is contained in:
parent
22e1a4fe61
commit
6b1fa6036a
16 changed files with 2991 additions and 1199 deletions
icu4c/source
|
@ -582,7 +582,7 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
|
|||
int32_t inputLen = 0;
|
||||
UChar decomp[decompSize];
|
||||
|
||||
UTF_APPEND_CHAR(temp, inputLen, bufSize, comp);
|
||||
U16_APPEND_UNSAFE(temp, inputLen, comp);
|
||||
int32_t decompLen = unorm_getDecomposition(comp, FALSE, decomp, decompSize);
|
||||
if(decompLen < 0) {
|
||||
decompLen = -decompLen;
|
||||
|
@ -597,7 +597,9 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
|
|||
UChar32 decompCp;
|
||||
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
|
||||
|
||||
int32_t i = 0;
|
||||
int32_t i;
|
||||
UBool overflow = FALSE;
|
||||
|
||||
i = segmentPos;
|
||||
while(i < segLen) {
|
||||
UTF_NEXT_CHAR(segment, i, segLen, cp);
|
||||
|
@ -620,7 +622,19 @@ Hashtable *CanonicalIterator::extract(UChar32 comp, const UChar *segment, int32_
|
|||
|
||||
// brute force approach
|
||||
|
||||
UTF_APPEND_CHAR(buff, bufLen, bufSize, cp);
|
||||
U16_APPEND(buff, bufLen, bufSize, cp, overflow);
|
||||
|
||||
if(overflow) {
|
||||
/*
|
||||
* ### TODO handle buffer overflow
|
||||
* The buffer is large, but an overflow may still happen with
|
||||
* unusual input (many combining marks?).
|
||||
* Reallocate buffer and continue.
|
||||
* markus 20020929
|
||||
*/
|
||||
|
||||
overflow = FALSE;
|
||||
}
|
||||
|
||||
/* TODO: optimize
|
||||
// since we know that the classes are monotonically increasing, after zero
|
||||
|
|
|
@ -3100,6 +3100,10 @@ InputPath=.\unicode\utf8.h
|
|||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\utf_old.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\util.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
|
|
@ -141,7 +141,7 @@ ucnv_getUChar32KeepOverflow(UConverter *cnv, const UChar *buffer, int32_t length
|
|||
|
||||
/* get the first code point in the buffer */
|
||||
i=0;
|
||||
UTF_NEXT_CHAR_SAFE(buffer, i, length, c, FALSE);
|
||||
UTF_NEXT_CHAR(buffer, i, length, c);
|
||||
if(i<length) {
|
||||
/* there are UChars left in the buffer that need to go into the overflow buffer */
|
||||
UChar *overflow=cnv->UCharErrorBuffer;
|
||||
|
|
|
@ -156,7 +156,7 @@ u_strstr(const UChar *s, const UChar *substring);
|
|||
* but u_strchr32() will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that UTF_GET_CHAR(u_strchr32(c))==c.
|
||||
* This behavior ensures that U16_GET(u_strchr32(c))==c.
|
||||
*
|
||||
* @param s The string to search.
|
||||
* @param c The code point (0..0x10ffff) to find.
|
||||
|
@ -628,7 +628,7 @@ u_memchr(const UChar *src, UChar ch, int32_t count);
|
|||
* but u_memchr32() will find neither because they
|
||||
* combine to the code point U+10000.
|
||||
* Either function will find U+d800 in "a\ud800b".
|
||||
* This behavior ensures that UTF_GET_CHAR(u_memchr32(c))==c.
|
||||
* This behavior ensures that U16_GET(u_memchr32(c))==c.
|
||||
*
|
||||
* @param src string to search in
|
||||
* @param ch character to find
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2001, International Business Machines
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -15,77 +15,89 @@
|
|||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: UChar and UChar32 data types and UTF macros for C Unicode string handling
|
||||
*
|
||||
* <p>This file defines the UChar and UChar32 data types for Unicode code units
|
||||
* and code points, as well as macros for efficiently getting code points
|
||||
* in and out of a string.</p>
|
||||
*
|
||||
* <p>utf.h is included by utypes.h and itself includes the utfXX.h after some
|
||||
* common definitions. Those files define the macros for each UTF-size.</p>
|
||||
*
|
||||
* <p>The original concept for these files was for ICU to allow
|
||||
* in principle to set which UTF (UTF-8/16/32) is used internally
|
||||
* by defining UTF_SIZE to either 8, 16, or 32. utf.h would then define the UChar type
|
||||
* accordingly. UTF-16 was the default.</p>
|
||||
*
|
||||
* <p>This concept has been abandoned.
|
||||
* A lot of the ICU source code — especially low-level code like
|
||||
* conversion, normalization, and collation — assumes UTF-16, so
|
||||
* utf.h enforces the default of UTF-16.
|
||||
* The UTF-8 and UTF-32 macros remain for now for completeness and backward compatibility.</p>
|
||||
*
|
||||
* <p>Accordingly, utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
|
||||
* UChar is defined to be exactly wchar_t, otherwise uint16_t.</p>
|
||||
*
|
||||
* <p>UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
|
||||
* Unicode code point (Unicode scalar value, 0..0x10ffff).
|
||||
* Before ICU 2.4, the definition of UChar32 was platform-dependent, similar to
|
||||
* the definition of UChar. For details see the documentation for UChar32 itself.</p>
|
||||
*
|
||||
* <p>utf.h also defines a number of C macros for handling single Unicode code points and
|
||||
* for using UTF Unicode strings. It includes utf8.h, utf16.h, and utf32.h for the actual
|
||||
* implementations of those macros and then aliases one set of them (for UTF-16) for general use.
|
||||
* The UTF-specific macros have the UTF size in the macro name prefixes (UTF16_...), while
|
||||
* the general alias macros always begin with UTF_...</p>
|
||||
*
|
||||
* <p>Many string operations can be done with or without error checking.
|
||||
* Where such a distinction is useful, there are two versions of the macros, "unsafe" and "safe"
|
||||
* ones with ..._UNSAFE and ..._SAFE suffixes. The unsafe macros are fast but may cause
|
||||
* program failures if the strings are not well-formed. The safe macros have an additional, boolean
|
||||
* parameter "strict". If strict is FALSE, then only illegal sequences are detected.
|
||||
* Otherwise, irregular sequences and non-characters are detected as well (like single surrogates).
|
||||
* Safe macros return special error code points for illegal/irregular sequences:
|
||||
* Typically, U+ffff, or values that would result in a code unit sequence of the same length
|
||||
* as the erroneous input sequence.<br>
|
||||
* Note that _UNSAFE macros have fewer parameters: They do not have the strictness parameter, and
|
||||
* they do not have start/length parameters for boundary checking.</p>
|
||||
*
|
||||
* <p>Here, the macros are aliased in two steps:
|
||||
* In the first step, the UTF-specific macros with UTF16_ prefix and _UNSAFE and _SAFE suffixes are
|
||||
* aliased according to the UTF_SIZE to macros with UTF_ prefix and the same suffixes and signatures.
|
||||
* Then, in a second step, the default, general alias macros are set to use either the unsafe or
|
||||
* the safe/not strict (default) or the safe/strict macro;
|
||||
* these general macros do not have a strictness parameter.</p>
|
||||
*
|
||||
* <p>It is possible to change the default choice for the general alias macros to be unsafe, safe/not strict or safe/strict.
|
||||
* The default is safe/not strict. It is not recommended to select the unsafe macros as the basis for
|
||||
* Unicode string handling in ICU! To select this, define UTF_SAFE, UTF_STRICT, or UTF_UNSAFE.</p>
|
||||
*
|
||||
* <p>For general use, one should use the default, general macros with UTF_ prefix and no _SAFE/_UNSAFE suffix.
|
||||
* Only in some cases it may be necessary to control the choice of macro directly and use a less generic alias.
|
||||
* For example, if it can be assumed that a string is well-formed and the index will stay within the bounds,
|
||||
* then the _UNSAFE version may be used.
|
||||
* If a UTF-8 string is to be processed, then the macros with UTF8_ prefixes need to be used.</p>
|
||||
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
|
||||
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||
* bodies and all macro statements should be terminated with semicolon.</p>
|
||||
*/
|
||||
* \file
|
||||
* \brief C API: UChar and UChar32 data types and code point macros
|
||||
*
|
||||
* This file defines the UChar and UChar32 data types for Unicode code units
|
||||
* and code points, as well as macros for checking whether a code point is
|
||||
* a surrogate or a non-character.
|
||||
*
|
||||
* utf.h is included by utypes.h and itself includes utf8.h and utf16.h after some
|
||||
* common definitions. Those files define macros for efficiently getting code points
|
||||
* in and out of UTF-8/16 strings.
|
||||
* utf16.h macros have "U16_" prefixes.
|
||||
* utf8.h defines similar macros with "U8_" prefixes for UTF-8 string handling.
|
||||
*
|
||||
* ICU processes 16-bit Unicode strings.
|
||||
* Most of the time, such strings are well-formed UTF-16.
|
||||
* Single, unpaired surrogates must be handled as well, and are treated in ICU
|
||||
* like regular code points where possible.
|
||||
* (Pairs of surrogate code points are indistinguishable from supplementary
|
||||
* code points encoded as pairs of surrogate code units.)
|
||||
*
|
||||
* In fact, almost all Unicode code points in normal text (>99%)
|
||||
* are on the BMP (<=U+ffff) and even <=U+d7ff.
|
||||
* ICU functions handle supplementary code points (U+10000..U+10ffff)
|
||||
* but are optimized for the much more frequently occurring BMP code points.
|
||||
*
|
||||
* utf.h defines UChar to be an unsigned 16-bit integer. If this matches wchar_t, then
|
||||
* UChar is defined to be exactly wchar_t, otherwise uint16_t.
|
||||
*
|
||||
* UChar32 is defined to be a signed 32-bit integer (int32_t), large enough for a 21-bit
|
||||
* Unicode code point (Unicode scalar value, 0..0x10ffff).
|
||||
* Before ICU 2.4, the definition of UChar32 was platform-dependent, similar to
|
||||
* the definition of UChar. For details see the documentation for UChar32 itself.
|
||||
*
|
||||
* utf.h also defines a small number of C macros for single Unicode code points.
|
||||
* These are simple checks for surrogates and non-characters.
|
||||
* For actual Unicode character properties see uchar.h.
|
||||
*
|
||||
* By default, string operations must be done with error checking in case
|
||||
* a string is not well-formed UTF-16.
|
||||
* The macros will detect if a surrogate code unit is unpaired
|
||||
* (lead unit without trail unit or vice versa) and just return the unit itself
|
||||
* as the code point.
|
||||
* (It is an accidental property of Unicode and UTF-16 that all
|
||||
* malformed sequences can be expressed unambiguously with a distinct subrange
|
||||
* of Unicode code points.)
|
||||
*
|
||||
* When it is safe to assume that text is well-formed UTF-16
|
||||
* (does not contain single, unpaired surrogates), then one can use
|
||||
* U16_..._UNSAFE macros.
|
||||
* These do not check for proper code unit sequences or truncated text and may
|
||||
* yield wrong results or even cause a crash if they are used with "malformed"
|
||||
* text.
|
||||
* In practice, U16_..._UNSAFE macros will produce slightly less code but
|
||||
* should not be faster because the processing is only different when a
|
||||
* surrogate code unit is detected, which will be rare.
|
||||
*
|
||||
* Similarly for UTF-8, there are "safe" macros without a suffix,
|
||||
* and U8_..._UNSAFE versions.
|
||||
* The performance differences are much larger here because UTF-8 provides so
|
||||
* many opportunities for malformed sequences.
|
||||
* The unsafe UTF-8 macros are entirely implemented inside the macro definitions
|
||||
* and are fast, while the safe UTF-8 macros call functions for all but the
|
||||
* trivial (ASCII) cases.
|
||||
*
|
||||
* Unlike with UTF-16, malformed sequences cannot be expressed with distinct
|
||||
* code point values (0..U+10ffff). They are indicated with negative values instead.
|
||||
*
|
||||
* For more information see the ICU User Guide Strings chapter
|
||||
* (http://oss.software.ibm.com/icu/userguide/).
|
||||
*
|
||||
* <em>Usage:</em>
|
||||
* ICU coding guidelines for if() statements should be followed when using these macros.
|
||||
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||
* bodies and all macro statements should be terminated with semicolon.
|
||||
*
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
|
||||
#ifndef __UTF_H__
|
||||
#define __UTF_H__
|
||||
|
||||
/* wchar_t-related definitions ---------------------------------------------- */
|
||||
|
||||
/*
|
||||
* ANSI C headers:
|
||||
* stddef.h defines wchar_t
|
||||
|
@ -94,18 +106,11 @@
|
|||
#include <stddef.h>
|
||||
/* include the utfXX.h after the following definitions */
|
||||
|
||||
/* If there is no compiler option for the preferred UTF size, then default to UTF-16. */
|
||||
#ifndef UTF_SIZE
|
||||
/** Number of bits in a Unicode string code unit, same as x in UTF-x (8, 16, or 32). */
|
||||
# define UTF_SIZE 16
|
||||
#endif
|
||||
|
||||
/** Number of bytes in a UChar (sizeof(UChar)). */
|
||||
#define U_SIZEOF_UCHAR (UTF_SIZE>>3)
|
||||
|
||||
/*!
|
||||
* \def U_SIZEOF_WCHAR_T
|
||||
* U_SIZEOF_WCHAR_T==sizeof(wchar_t).
|
||||
*
|
||||
* @stable
|
||||
*/
|
||||
#ifndef U_HAVE_WCHAR_H
|
||||
# define U_HAVE_WCHAR_H 1
|
||||
|
@ -120,10 +125,14 @@
|
|||
/*!
|
||||
* \def U_WCHAR_IS_UTF16
|
||||
* Defined if wchar_t uses UTF-16.
|
||||
*
|
||||
* @stable
|
||||
*/
|
||||
/*!
|
||||
* \def U_WCHAR_IS_UTF32
|
||||
* Defined if wchar_t uses UTF-32.
|
||||
*
|
||||
* @stable
|
||||
*/
|
||||
#if !defined(U_WCHAR_IS_UTF16) && !defined(U_WCHAR_IS_UTF32)
|
||||
# ifdef __STDC_ISO_10646__
|
||||
|
@ -145,139 +154,10 @@
|
|||
# endif
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Define UChar32 as a type for single Unicode code points.
|
||||
* UChar32 is a signed 32-bit integer.
|
||||
*
|
||||
* The Unicode code point range is 0..0x10ffff.
|
||||
* All other values (negative or >=0x110000) are illegal as Unicode code points.
|
||||
* They may be used as sentinel values to indicate "done", "error"
|
||||
* or similar non-code point conditions.
|
||||
*
|
||||
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
|
||||
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
|
||||
* or else to be uint32_t.
|
||||
* That is, the definition of UChar32 was platform-dependent.
|
||||
*
|
||||
* @see UTF_SENTINEL
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
typedef int32_t UChar32;
|
||||
/* UChar and UChar32 definitions -------------------------------------------- */
|
||||
|
||||
/**
|
||||
* Unicode string and array offset and index type.
|
||||
* ICU always counts Unicode code units (UChars) for
|
||||
* string offsets, indexes, and lengths, not Unicode code points.
|
||||
*
|
||||
* @deprecated Use int32_t directly. UTextOffset to be removed after 2003-mar.
|
||||
*/
|
||||
typedef int32_t UTextOffset;
|
||||
|
||||
/* Specify which macro versions are the default ones - safe or fast. */
|
||||
#if !defined(UTF_SAFE) && !defined(UTF_STRICT) && !defined(UTF_UNSAFE)
|
||||
/**
|
||||
* The default choice for general Unicode string macros is to use the ..._SAFE macro implementations
|
||||
* with strict=FALSE. See the utf.h file description.
|
||||
*/
|
||||
# define UTF_SAFE
|
||||
#endif
|
||||
|
||||
/* internal definitions ----------------------------------------------------- */
|
||||
|
||||
/**
|
||||
* <p>UTF8_ERROR_VALUE_1 and UTF8_ERROR_VALUE_2 are special error values for UTF-8,
|
||||
* which need 1 or 2 bytes in UTF-8:<br>
|
||||
* U+0015 = NAK = Negative Acknowledge, C0 control character<br>
|
||||
* U+009f = highest C1 control character</p>
|
||||
*
|
||||
* <p>These are used by ("safe") UTF-8 macros so that they can return an error value
|
||||
* that needs the same number of code units (bytes) as were seen by
|
||||
* a macro. They should be tested with UTF_IS_ERROR() or UTF_IS_VALID().</p>
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
#define UTF8_ERROR_VALUE_1 0x15
|
||||
/**
|
||||
* See documentation on UTF8_ERROR_VALUE_1 for details.
|
||||
*/
|
||||
#define UTF8_ERROR_VALUE_2 0x9f
|
||||
|
||||
/**
|
||||
* Error value for all UTFs. This code point value will be set by macros with error
|
||||
* checking if an error is detected.
|
||||
*/
|
||||
#define UTF_ERROR_VALUE 0xffff
|
||||
|
||||
/* single-code point definitions -------------------------------------------- */
|
||||
|
||||
/**
|
||||
* This value is intended for sentinel values for APIs that
|
||||
* (take or) return single code points (UChar32).
|
||||
* It is outside of the Unicode code point range 0..0x10ffff.
|
||||
*
|
||||
* For example, a "done" or "error" value in a new API
|
||||
* could be indicated with UTF_SENTINEL.
|
||||
*
|
||||
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
|
||||
* values, mostly 0xffff.
|
||||
* Those may need to be distinguished from
|
||||
* actual U+ffff text contents by calling functions like
|
||||
* CharacterIterator::hasNext() or UnicodeString::length().
|
||||
*
|
||||
* @see UChar32
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF_SENTINEL (-1)
|
||||
|
||||
/** Is this code unit or code point a surrogate (U+d800..U+dfff)? */
|
||||
#define UTF_IS_SURROGATE(uchar) (((uchar)&0xfffff800)==0xd800)
|
||||
|
||||
/**
|
||||
* Is a given 32-bit code point a Unicode noncharacter?
|
||||
*/
|
||||
#define UTF_IS_UNICODE_NONCHAR(c) \
|
||||
((c)>=0xfdd0 && \
|
||||
((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
|
||||
(uint32_t)(c)<=0x10ffff)
|
||||
|
||||
/**
|
||||
* Is a given 32-bit code point/Unicode scalar value
|
||||
* actually a valid Unicode (abstract) character?
|
||||
*
|
||||
* Code points that are not characters include:
|
||||
* - single surrogate code points (U+d800..U+dfff, 2048 code points)
|
||||
* - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
|
||||
* - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
|
||||
* - values above U+10ffff, the highest Unicode code point value
|
||||
*
|
||||
* This means that all code points below U+d800 are character code points,
|
||||
* and that boundary is tested first for performance.
|
||||
*/
|
||||
#define UTF_IS_UNICODE_CHAR(c) \
|
||||
((uint32_t)(c)<0xd800 || \
|
||||
((uint32_t)(c)>0xdfff && \
|
||||
(uint32_t)(c)<=0x10ffff && \
|
||||
!UTF_IS_UNICODE_NONCHAR(c)))
|
||||
|
||||
/**
|
||||
* Is a given 32-bit code an error value
|
||||
* as returned by one of the macros for any UTF?
|
||||
*/
|
||||
#define UTF_IS_ERROR(c) \
|
||||
(((c)&0xfffe)==0xfffe || (c)==UTF8_ERROR_VALUE_1 || (c)==UTF8_ERROR_VALUE_2)
|
||||
|
||||
/** This is a combined macro: Is c a valid Unicode value _and_ not an error code? */
|
||||
#define UTF_IS_VALID(c) \
|
||||
(UTF_IS_UNICODE_CHAR(c) && \
|
||||
(c)!=UTF8_ERROR_VALUE_1 && (c)!=UTF8_ERROR_VALUE_2)
|
||||
|
||||
/* include the utfXX.h ------------------------------------------------------ */
|
||||
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/utf32.h"
|
||||
|
||||
/* Define types and macros according to the selected UTF size. -------------- */
|
||||
/** Number of bytes in a UChar. @stable */
|
||||
#define U_SIZEOF_UCHAR 2
|
||||
|
||||
/*!
|
||||
* \var UChar
|
||||
|
@ -290,262 +170,127 @@ typedef int32_t UTextOffset;
|
|||
* @stable
|
||||
*/
|
||||
|
||||
#if UTF_SIZE==8
|
||||
|
||||
# error UTF-8 is not implemented, undefine UTF_SIZE or define it to 16
|
||||
|
||||
/*
|
||||
* ANSI C header:
|
||||
* limits.h defines CHAR_MAX
|
||||
*/
|
||||
# include <limits.h>
|
||||
|
||||
/* Define UChar to be compatible with char if possible. */
|
||||
# if CHAR_MAX>=255
|
||||
typedef char UChar;
|
||||
# else
|
||||
typedef uint8_t UChar;
|
||||
# endif
|
||||
|
||||
#elif UTF_SIZE==16
|
||||
|
||||
/* Define UChar to be compatible with wchar_t if possible. */
|
||||
# if U_SIZEOF_WCHAR_T==2
|
||||
typedef wchar_t UChar;
|
||||
# else
|
||||
typedef uint16_t UChar;
|
||||
# endif
|
||||
|
||||
/** Does this code unit alone encode a code point? */
|
||||
# define UTF_IS_SINGLE(uchar) UTF16_IS_SINGLE(uchar)
|
||||
/** Is this code unit the first one of several? */
|
||||
# define UTF_IS_LEAD(uchar) UTF16_IS_LEAD(uchar)
|
||||
/** Is this code unit one of several but not the first one? */
|
||||
# define UTF_IS_TRAIL(uchar) UTF16_IS_TRAIL(uchar)
|
||||
|
||||
/** Does this code point require multiple code units? */
|
||||
# define UTF_NEED_MULTIPLE_UCHAR(c) UTF16_NEED_MULTIPLE_UCHAR(c)
|
||||
/** How many code units are used to encode this code point? */
|
||||
# define UTF_CHAR_LENGTH(c) UTF16_CHAR_LENGTH(c)
|
||||
/** How many code units are used at most for any Unicode code point? */
|
||||
# define UTF_MAX_CHAR_LENGTH UTF16_MAX_CHAR_LENGTH
|
||||
/** Estimate the number of code units for a string based on the number of UTF-16 code units. */
|
||||
# define UTF_ARRAY_SIZE(size) UTF16_ARRAY_SIZE(size)
|
||||
|
||||
/** See file documentation and UTF_GET_CHAR. */
|
||||
# define UTF_GET_CHAR_UNSAFE(s, i, c) UTF16_GET_CHAR_UNSAFE(s, i, c)
|
||||
/** See file documentation and UTF_GET_CHAR. */
|
||||
# define UTF_GET_CHAR_SAFE(s, start, i, length, c, strict) UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict)
|
||||
|
||||
/** See file documentation and UTF_NEXT_CHAR. */
|
||||
# define UTF_NEXT_CHAR_UNSAFE(s, i, c) UTF16_NEXT_CHAR_UNSAFE(s, i, c)
|
||||
/** See file documentation and UTF_NEXT_CHAR. */
|
||||
# define UTF_NEXT_CHAR_SAFE(s, i, length, c, strict) UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict)
|
||||
|
||||
/** See file documentation and UTF_APPEND_CHAR. */
|
||||
# define UTF_APPEND_CHAR_UNSAFE(s, i, c) UTF16_APPEND_CHAR_UNSAFE(s, i, c)
|
||||
/** See file documentation and UTF_APPEND_CHAR. */
|
||||
# define UTF_APPEND_CHAR_SAFE(s, i, length, c) UTF16_APPEND_CHAR_SAFE(s, i, length, c)
|
||||
|
||||
/** See file documentation and UTF_FWD_1. */
|
||||
# define UTF_FWD_1_UNSAFE(s, i) UTF16_FWD_1_UNSAFE(s, i)
|
||||
/** See file documentation and UTF_FWD_1. */
|
||||
# define UTF_FWD_1_SAFE(s, i, length) UTF16_FWD_1_SAFE(s, i, length)
|
||||
|
||||
/** See file documentation and UTF_FWD_N. */
|
||||
# define UTF_FWD_N_UNSAFE(s, i, n) UTF16_FWD_N_UNSAFE(s, i, n)
|
||||
/** See file documentation and UTF_FWD_N. */
|
||||
# define UTF_FWD_N_SAFE(s, i, length, n) UTF16_FWD_N_SAFE(s, i, length, n)
|
||||
|
||||
/** See file documentation and UTF_SET_CHAR_START. */
|
||||
# define UTF_SET_CHAR_START_UNSAFE(s, i) UTF16_SET_CHAR_START_UNSAFE(s, i)
|
||||
/** See file documentation and UTF_SET_CHAR_START. */
|
||||
# define UTF_SET_CHAR_START_SAFE(s, start, i) UTF16_SET_CHAR_START_SAFE(s, start, i)
|
||||
|
||||
/** See file documentation and UTF_PREV_CHAR. */
|
||||
# define UTF_PREV_CHAR_UNSAFE(s, i, c) UTF16_PREV_CHAR_UNSAFE(s, i, c)
|
||||
/** See file documentation and UTF_PREV_CHAR. */
|
||||
# define UTF_PREV_CHAR_SAFE(s, start, i, c, strict) UTF16_PREV_CHAR_SAFE(s, start, i, c, strict)
|
||||
|
||||
/** See file documentation and UTF_BACK_1. */
|
||||
# define UTF_BACK_1_UNSAFE(s, i) UTF16_BACK_1_UNSAFE(s, i)
|
||||
/** See file documentation and UTF_BACK_1. */
|
||||
# define UTF_BACK_1_SAFE(s, start, i) UTF16_BACK_1_SAFE(s, start, i)
|
||||
|
||||
/** See file documentation and UTF_BACK_N. */
|
||||
# define UTF_BACK_N_UNSAFE(s, i, n) UTF16_BACK_N_UNSAFE(s, i, n)
|
||||
/** See file documentation and UTF_BACK_N. */
|
||||
# define UTF_BACK_N_SAFE(s, start, i, n) UTF16_BACK_N_SAFE(s, start, i, n)
|
||||
|
||||
/** See file documentation and UTF_SET_CHAR_LIMIT. */
|
||||
# define UTF_SET_CHAR_LIMIT_UNSAFE(s, i) UTF16_SET_CHAR_LIMIT_UNSAFE(s, i)
|
||||
/** See file documentation and UTF_SET_CHAR_LIMIT. */
|
||||
# define UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length) UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length)
|
||||
|
||||
#elif UTF_SIZE==32
|
||||
|
||||
# error UTF-32 is not implemented, undefine UTF_SIZE or define it to 16
|
||||
|
||||
typedef UChar32 UChar;
|
||||
|
||||
/* Define UChar to be compatible with wchar_t if possible. */
|
||||
#if U_SIZEOF_WCHAR_T==2
|
||||
typedef wchar_t UChar;
|
||||
#else
|
||||
# error UTF_SIZE must be undefined or one of { 8, 16, 32 } - only 16 is implemented
|
||||
typedef uint16_t UChar;
|
||||
#endif
|
||||
|
||||
/* Define the default macros for handling UTF characters. ------------------- */
|
||||
/**
|
||||
* Define UChar32 as a type for single Unicode code points.
|
||||
* UChar32 is a signed 32-bit integer (same as int32_t).
|
||||
*
|
||||
* The Unicode code point range is 0..0x10ffff.
|
||||
* All other values (negative or >=0x110000) are illegal as Unicode code points.
|
||||
* They may be used as sentinel values to indicate "done", "error"
|
||||
* or similar non-code point conditions.
|
||||
*
|
||||
* Before ICU 2.4 (Jitterbug 2146), UChar32 was defined
|
||||
* to be wchar_t if that is 32 bits wide (wchar_t may be signed or unsigned)
|
||||
* or else to be uint32_t.
|
||||
* That is, the definition of UChar32 was platform-dependent.
|
||||
*
|
||||
* @see U_SENTINEL
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
typedef int32_t UChar32;
|
||||
|
||||
/* single-code point definitions -------------------------------------------- */
|
||||
|
||||
/**
|
||||
* \def UTF_GET_CHAR(s, start, i, length, c)
|
||||
* This value is intended for sentinel values for APIs that
|
||||
* (take or) return single code points (UChar32).
|
||||
* It is outside of the Unicode code point range 0..0x10ffff.
|
||||
*
|
||||
* For example, a "done" or "error" value in a new API
|
||||
* could be indicated with U_SENTINEL.
|
||||
*
|
||||
* Set c to the code point that contains the code unit i.
|
||||
* i could point to the first, the last, or an intermediate code unit.
|
||||
* i is not modified.
|
||||
* \pre 0<=i<length
|
||||
* ICU APIs designed before ICU 2.4 usually define service-specific "done"
|
||||
* values, mostly 0xffff.
|
||||
* Those may need to be distinguished from
|
||||
* actual U+ffff text contents by calling functions like
|
||||
* CharacterIterator::hasNext() or UnicodeString::length().
|
||||
*
|
||||
* @return -1
|
||||
* @see UChar32
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U_SENTINEL (-1)
|
||||
|
||||
/**
|
||||
* \def UTF_NEXT_CHAR(s, i, length, c)
|
||||
*
|
||||
* Set c to the code point that starts at code unit i
|
||||
* and advance i to beyond the code units of this code point (post-increment).
|
||||
* i must point to the first code unit of a code point.
|
||||
* \pre 0<=i<length
|
||||
* \post 0<i<=length
|
||||
* Is this code point a Unicode noncharacter?
|
||||
* @param c 32-bit code point
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U_IS_UNICODE_NONCHAR(c) \
|
||||
((c)>=0xfdd0 && \
|
||||
((uint32_t)(c)<=0xfdef || ((c)&0xfffe)==0xfffe) && \
|
||||
(uint32_t)(c)<=0x10ffff)
|
||||
|
||||
/**
|
||||
* \def UTF_APPEND_CHAR(s, i, length, c)
|
||||
* Is c a Unicode code point value (0..U+10ffff)
|
||||
* that can be assigned a character?
|
||||
*
|
||||
* Append the code units of code point c to the string at index i
|
||||
* and advance i to beyond the new code units (post-increment).
|
||||
* The code units beginning at index i will be overwritten.
|
||||
* \pre 0<=c<=0x10ffff
|
||||
* \pre 0<=i<length
|
||||
* \post 0<i<=length
|
||||
* Code points that are not characters include:
|
||||
* - single surrogate code points (U+d800..U+dfff, 2048 code points)
|
||||
* - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
|
||||
* - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
|
||||
* - values above U+10ffff, the highest Unicode code point value
|
||||
*
|
||||
* This means that all code points below U+d800 are character code points,
|
||||
* and that boundary is tested first for performance.
|
||||
*
|
||||
* @param c 32-bit code point
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U_IS_UNICODE_CHAR(c) \
|
||||
((uint32_t)(c)<0xd800 || \
|
||||
((uint32_t)(c)>0xdfff && \
|
||||
(uint32_t)(c)<=0x10ffff && \
|
||||
!U_IS_UNICODE_NONCHAR(c)))
|
||||
|
||||
/**
|
||||
* \def UTF_FWD_1(s, i, length)
|
||||
*
|
||||
* Advance i to beyond the code units of the code point that begins at i.
|
||||
* I.e., advance i by one code point.
|
||||
* i must point to the first code unit of a code point.
|
||||
* \pre 0<=i<length
|
||||
* \post 0<i<=length
|
||||
* Is this code point a lead surrogate (U+d800..U+dbff)?
|
||||
* @param c 32-bit code point
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
|
||||
|
||||
/**
|
||||
* \def UTF_FWD_N(s, i, length, n)
|
||||
*
|
||||
* Advance i to beyond the code units of the n code points where the first one begins at i.
|
||||
* I.e., advance i by n code points.
|
||||
* i must point to the first code unit of a code point.
|
||||
* \pre 0<=i<length
|
||||
* \post 0<i<=length
|
||||
* Is this code point a trail surrogate (U+dc00..U+dfff)?
|
||||
* @param c 32-bit code point
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
|
||||
|
||||
/**
|
||||
* \def UTF_SET_CHAR_START(s, start, i)
|
||||
*
|
||||
* Take the random-access index i and adjust it so that it points to the beginning
|
||||
* of a code point.
|
||||
* The input index points to any code unit of a code point and is moved to point to
|
||||
* the first code unit of the same code point. i is never incremented.
|
||||
* This can be used to start an iteration with UTF_NEXT_CHAR() from a random index.
|
||||
* \pre start<=i<length
|
||||
* \post start<=i<length
|
||||
* Is this code point a surrogate (U+d800..U+dfff)?
|
||||
* @param c 32-bit code point
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U_IS_SURROGATE(c) (((c)&0xfffff800)==0xd800)
|
||||
|
||||
/**
|
||||
* \def UTF_PREV_CHAR(s, start, i, c)
|
||||
*
|
||||
* Set c to the code point that has code units before i
|
||||
* and move i backward (towards the beginning of the string)
|
||||
* to the first code unit of this code point (pre-decrement).
|
||||
* i must point to the first code unit after the last unit of a code point (i==length is allowed).
|
||||
* \pre start<i<=length
|
||||
* \post start<=i<length
|
||||
* Assuming c is a surrogate code point (U_IS_SURROGATE(c)),
|
||||
* is it a lead surrogate?
|
||||
* @param c 32-bit code point
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
|
||||
|
||||
/**
|
||||
* \def UTF_BACK_1(s, start, i)
|
||||
*
|
||||
* Move i backward (towards the beginning of the string)
|
||||
* to the first code unit of the code point that has code units before i.
|
||||
* I.e., move i backward by one code point.
|
||||
* i must point to the first code unit after the last unit of a code point (i==length is allowed).
|
||||
* \pre start<i<=length
|
||||
* \post start<=i<length
|
||||
*/
|
||||
/* include the utfXX.h ------------------------------------------------------ */
|
||||
|
||||
/**
|
||||
* \def UTF_BACK_N(s, start, i, n)
|
||||
*
|
||||
* Move i backward (towards the beginning of the string)
|
||||
* to the first code unit of the n code points that have code units before i.
|
||||
* I.e., move i backward by n code points.
|
||||
* i must point to the first code unit after the last unit of a code point (i==length is allowed).
|
||||
* \pre start<i<=length
|
||||
* \post start<=i<length
|
||||
*/
|
||||
#include "unicode/utf8.h"
|
||||
#include "unicode/utf16.h"
|
||||
|
||||
/**
|
||||
* \def UTF_SET_CHAR_LIMIT(s, start, i, length)
|
||||
*
|
||||
* Take the random-access index i and adjust it so that it points beyond
|
||||
* a code point. The input index points beyond any code unit
|
||||
* of a code point and is moved to point beyond the last code unit of the same
|
||||
* code point. i is never decremented.
|
||||
* This can be used to start an iteration with UTF_PREV_CHAR() from a random index.
|
||||
* \pre start<i<=length
|
||||
* \post start<i<=length
|
||||
*/
|
||||
|
||||
#ifdef UTF_SAFE
|
||||
|
||||
# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, FALSE)
|
||||
|
||||
# define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, FALSE)
|
||||
# define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c)
|
||||
# define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length)
|
||||
# define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n)
|
||||
# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i)
|
||||
|
||||
# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, FALSE)
|
||||
# define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i)
|
||||
# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n)
|
||||
# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)
|
||||
|
||||
#elif defined(UTF_STRICT)
|
||||
|
||||
# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_SAFE(s, start, i, length, c, TRUE)
|
||||
|
||||
# define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_SAFE(s, i, length, c, TRUE)
|
||||
# define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_SAFE(s, i, length, c)
|
||||
# define UTF_FWD_1(s, i, length) UTF_FWD_1_SAFE(s, i, length)
|
||||
# define UTF_FWD_N(s, i, length, n) UTF_FWD_N_SAFE(s, i, length, n)
|
||||
# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_SAFE(s, start, i)
|
||||
|
||||
# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_SAFE(s, start, i, c, TRUE)
|
||||
# define UTF_BACK_1(s, start, i) UTF_BACK_1_SAFE(s, start, i)
|
||||
# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_SAFE(s, start, i, n)
|
||||
# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_SAFE(s, start, i, length)
|
||||
|
||||
#else /* UTF_UNSAFE */
|
||||
|
||||
# define UTF_GET_CHAR(s, start, i, length, c) UTF_GET_CHAR_UNSAFE(s, i, c)
|
||||
|
||||
# define UTF_NEXT_CHAR(s, i, length, c) UTF_NEXT_CHAR_UNSAFE(s, i, c)
|
||||
# define UTF_APPEND_CHAR(s, i, length, c) UTF_APPEND_CHAR_UNSAFE(s, i, c)
|
||||
# define UTF_FWD_1(s, i, length) UTF_FWD_1_UNSAFE(s, i)
|
||||
# define UTF_FWD_N(s, i, length, n) UTF_FWD_N_UNSAFE(s, i, n)
|
||||
# define UTF_SET_CHAR_START(s, start, i) UTF_SET_CHAR_START_UNSAFE(s, i)
|
||||
|
||||
# define UTF_PREV_CHAR(s, start, i, c) UTF_PREV_CHAR_UNSAFE(s, i, c)
|
||||
# define UTF_BACK_1(s, start, i) UTF_BACK_1_UNSAFE(s, i)
|
||||
# define UTF_BACK_N(s, start, i, n) UTF_BACK_N_UNSAFE(s, i, n)
|
||||
# define UTF_SET_CHAR_LIMIT(s, start, i, length) UTF_SET_CHAR_LIMIT_UNSAFE(s, i)
|
||||
|
||||
#endif
|
||||
/* utf_old.h contains deprecated, pre-ICU 2.4 definitions */
|
||||
#include "unicode/utf_old.h"
|
||||
|
||||
#endif
|
||||
|
|
|
@ -15,141 +15,265 @@
|
|||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: UTF-16 macros
|
||||
*
|
||||
* This file defines macros to deal with UTF-16 code units and code points.
|
||||
* "Safe" macros check for length overruns and illegal sequences, and
|
||||
* also for irregular sequences when the strict option is set.
|
||||
* "Unsafe" macros are designed for maximum speed.
|
||||
* utf16.h is included by utf.h after unicode/umachine.h
|
||||
* and some common definitions.</p>
|
||||
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
|
||||
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||
* bodies and all macro statements should be terminated with semicolon.</p>
|
||||
*/
|
||||
* \file
|
||||
* \brief C API: 16-bit Unicode handling macros
|
||||
*
|
||||
* This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
|
||||
* utf16.h is included by utf.h after unicode/umachine.h
|
||||
* and some common definitions.
|
||||
*
|
||||
* For more information see utf.h and the ICU User Guide Strings chapter
|
||||
* (http://oss.software.ibm.com/icu/userguide/).
|
||||
*
|
||||
* <em>Usage:</em>
|
||||
* ICU coding guidelines for if() statements should be followed when using these macros.
|
||||
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||
* bodies and all macro statements should be terminated with semicolon.
|
||||
*/
|
||||
|
||||
/* utf.h must be included first. */
|
||||
#ifndef __UTF_H__
|
||||
# include "unicode/utf.h"
|
||||
#endif
|
||||
|
||||
#ifndef __UTF16_H__
|
||||
#define __UTF16_H__
|
||||
|
||||
/* single-code point definitions -------------------------------------------- */
|
||||
|
||||
/* handle surrogate pairs */
|
||||
#define UTF_IS_FIRST_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xd800)
|
||||
#define UTF_IS_SECOND_SURROGATE(uchar) (((uchar)&0xfffffc00)==0xdc00)
|
||||
|
||||
#define UTF_IS_SURROGATE_FIRST(c) (((c)&0x400)==0)
|
||||
|
||||
/** Get the UTF-32 value directly from the surrogate pseudo-characters */
|
||||
#define UTF_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
|
||||
|
||||
#define UTF16_GET_PAIR_VALUE(first, second) \
|
||||
(((first)<<10UL)+(second)-UTF_SURROGATE_OFFSET)
|
||||
|
||||
/* get the first and second surrogates for a supplementary code point */
|
||||
/**
|
||||
* Takes a supplementary code point (0x10000..0x10ffff)
|
||||
* and computes the first surrogate (0xd800..0xdbff)
|
||||
* for UTF-16 encoding.
|
||||
* Does this code unit alone encode a code point (BMP, not a surrogate)?
|
||||
* @param c 16-bit code unit
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF_FIRST_SURROGATE(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
|
||||
#define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
|
||||
|
||||
/**
|
||||
* Takes a supplementary code point (0x10000..0x10ffff)
|
||||
* and computes the second surrogate (0xdc00..0xdfff)
|
||||
* for UTF-16 encoding.
|
||||
* Is this code unit a lead surrogate (U+d800..U+dbff)?
|
||||
* @param c 16-bit code unit
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF_SECOND_SURROGATE(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
|
||||
|
||||
/** alias for UTF_FIRST_SURROGATE */
|
||||
#define UTF16_LEAD(supplementary) UTF_FIRST_SURROGATE(supplementary)
|
||||
|
||||
/** alias for UTF_SECOND_SURROGATE */
|
||||
#define UTF16_TRAIL(supplementary) UTF_SECOND_SURROGATE(supplementary)
|
||||
|
||||
/* classes of code unit values */
|
||||
#define UTF16_IS_SINGLE(uchar) !UTF_IS_SURROGATE(uchar)
|
||||
#define UTF16_IS_LEAD(uchar) UTF_IS_FIRST_SURROGATE(uchar)
|
||||
#define UTF16_IS_TRAIL(uchar) UTF_IS_SECOND_SURROGATE(uchar)
|
||||
|
||||
/* number of code units per code point */
|
||||
#define UTF16_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0xffff)
|
||||
#define UTF16_CHAR_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
|
||||
#define UTF16_MAX_CHAR_LENGTH 2
|
||||
|
||||
/* average number of code units compared to UTF-16 */
|
||||
#define UTF16_ARRAY_SIZE(size) (size)
|
||||
#define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
|
||||
|
||||
/**
|
||||
* Get a single code point from an offset that points to any
|
||||
* of the code units that belong to that code point.
|
||||
* Assume 0<=i<length.
|
||||
* Is this code unit a trail surrogate (U+dc00..U+dfff)?
|
||||
* @param c 16-bit code unit
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
|
||||
|
||||
/**
|
||||
* Is this code unit a surrogate (U+d800..U+dfff)?
|
||||
* @param c 16-bit code unit
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
|
||||
|
||||
/**
|
||||
* Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
|
||||
* is it a lead surrogate?
|
||||
* @param c 16-bit code unit
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
|
||||
|
||||
/**
|
||||
* Helper constant for U16_GET_SUPPLEMENTARY.
|
||||
* @internal
|
||||
*/
|
||||
#define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
|
||||
|
||||
/**
|
||||
* Get a supplementary code point value (U+10000..U+10ffff)
|
||||
* from its lead and trail surrogates.
|
||||
* The result is undefined if the input values are not
|
||||
* lead and trail surrogates.
|
||||
*
|
||||
* This could be used for iteration together with
|
||||
* UTF16_CHAR_LENGTH() and UTF_IS_ERROR(),
|
||||
* but the use of UTF16_NEXT_CHAR_[UN]SAFE() and
|
||||
* UTF16_PREV_CHAR_[UN]SAFE() is more efficient for that.
|
||||
* @param lead lead surrogate (U+d800..U+dbff)
|
||||
* @param trail trail surrogate (U+dc00..U+dfff)
|
||||
* @return supplementary code point (U+10000..U+10ffff)
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF16_GET_CHAR_UNSAFE(s, i, c) { \
|
||||
#define U16_GET_SUPPLEMENTARY(lead, trail) \
|
||||
(((lead)<<10UL)+(trail)-U16_SURROGATE_OFFSET)
|
||||
|
||||
|
||||
/**
|
||||
* Get the lead surrogate (0xd800..0xdbff) for a
|
||||
* supplementary code point (0x10000..0x10ffff).
|
||||
* @param supplementary 32-bit code point (U+10000..U+10ffff)
|
||||
* @return lead surrogate (U+d800..U+dbff) for supplementary
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
|
||||
|
||||
/**
|
||||
* Get the trail surrogate (0xdc00..0xdfff) for a
|
||||
* supplementary code point (0x10000..0x10ffff).
|
||||
* @param supplementary 32-bit code point (U+10000..U+10ffff)
|
||||
* @return trail surrogate (U+dc00..U+dfff) for supplementary
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
|
||||
|
||||
/**
|
||||
* How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
|
||||
* The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
|
||||
* @param c 32-bit code point
|
||||
* @return 1 or 2
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
|
||||
|
||||
/**
|
||||
* The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
|
||||
* @return 2
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_MAX_LENGTH 2
|
||||
|
||||
/**
|
||||
* Get a code point from a string at a random-access offset,
|
||||
* without changing the offset.
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* The offset may point to either the lead or trail surrogate unit
|
||||
* for a supplementary code point, in which case the macro will read
|
||||
* the adjacent matching surrogate as well.
|
||||
* The result is undefined if the offset points to a single, unpaired surrogate.
|
||||
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @param c output UChar32 variable
|
||||
* @see U16_GET
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_GET_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[i]; \
|
||||
if(UTF_IS_SURROGATE(c)) { \
|
||||
if(UTF_IS_SURROGATE_FIRST(c)) { \
|
||||
(c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)+1]); \
|
||||
if(U16_IS_SURROGATE(c)) { \
|
||||
if(U16_IS_SURROGATE_LEAD(c)) { \
|
||||
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
|
||||
} else { \
|
||||
(c)=UTF16_GET_PAIR_VALUE((s)[(i)-1], (c)); \
|
||||
(c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
|
||||
/**
|
||||
* Get a code point from a string at a random-access offset,
|
||||
* without changing the offset.
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* The offset may point to either the lead or trail surrogate unit
|
||||
* for a supplementary code point, in which case the macro will read
|
||||
* the adjacent matching surrogate as well.
|
||||
* If the offset points to a single, unpaired surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i<length
|
||||
* @param length string length
|
||||
* @param c output UChar32 variable
|
||||
* @see U16_GET_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_GET(s, start, i, length, c) { \
|
||||
(c)=(s)[i]; \
|
||||
if(UTF_IS_SURROGATE(c)) { \
|
||||
if(U16_IS_SURROGATE(c)) { \
|
||||
uint16_t __c2; \
|
||||
if(UTF_IS_SURROGATE_FIRST(c)) { \
|
||||
if((i)+1<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)+1])) { \
|
||||
(c)=UTF16_GET_PAIR_VALUE((c), __c2); \
|
||||
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
|
||||
} else if(strict) {\
|
||||
/* unmatched first surrogate */ \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
if(U16_IS_SURROGATE_LEAD(c)) { \
|
||||
if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
|
||||
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||
} \
|
||||
} else { \
|
||||
if((i)-1>=(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
|
||||
(c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
|
||||
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
|
||||
} else if(strict) {\
|
||||
/* unmatched second surrogate */ \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
|
||||
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
|
||||
} \
|
||||
} \
|
||||
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
/* definitions with forward iteration --------------------------------------- */
|
||||
|
||||
/*
|
||||
* all the macros that go forward assume that
|
||||
* the initial offset is 0<=i<length;
|
||||
* they update the offset
|
||||
*/
|
||||
|
||||
/* fast versions, no error-checking */
|
||||
|
||||
/**
|
||||
* Get a single code point from an offset that points to the first
|
||||
* of the code units that belong to that code point.
|
||||
* Assume 0<=i<length.
|
||||
* Get a code point from a string at a code point boundary offset,
|
||||
* and advance the offset to the next code point boundary.
|
||||
* (Post-incrementing forward iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* The offset may point to the lead surrogate unit
|
||||
* for a supplementary code point, in which case the macro will read
|
||||
* the following trail surrogate as well.
|
||||
* If the offset points to a trail surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* The result is undefined if the offset points to a single, unpaired lead surrogate.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @param c output UChar32 variable
|
||||
* @see U16_NEXT
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF16_NEXT_CHAR_UNSAFE(s, i, c) { \
|
||||
#define U16_NEXT_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
if(UTF_IS_FIRST_SURROGATE(c)) { \
|
||||
(c)=UTF16_GET_PAIR_VALUE((c), (s)[(i)++]); \
|
||||
if(U16_IS_LEAD(c)) { \
|
||||
(c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_APPEND_CHAR_UNSAFE(s, i, c) { \
|
||||
/**
|
||||
* Get a code point from a string at a code point boundary offset,
|
||||
* and advance the offset to the next code point boundary.
|
||||
* (Post-incrementing forward iteration.)
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* The offset may point to the lead surrogate unit
|
||||
* for a supplementary code point, in which case the macro will read
|
||||
* the following trail surrogate as well.
|
||||
* If the offset points to a trail surrogate or
|
||||
* to a single, unpaired lead surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset, i<length
|
||||
* @param length string length
|
||||
* @param c output UChar32 variable
|
||||
* @see U16_NEXT_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_NEXT(s, i, length, c) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
if(U16_IS_LEAD(c)) { \
|
||||
uint16_t __c2; \
|
||||
if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
|
||||
++(i); \
|
||||
(c)=U16_GET_SUPPLEMENTARY((c), __c2); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a code point to a string, overwriting 1 or 2 code units.
|
||||
* The offset points to the current end of the string contents
|
||||
* and is advanced (post-increment).
|
||||
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
|
||||
* Otherwise, the result is undefined.
|
||||
*
|
||||
* @param s UChar * string buffer (written to, so it must not be const)
|
||||
* @param i string offset
|
||||
* @param c code point to append
|
||||
* @see U16_APPEND
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_APPEND_UNSAFE(s, i, c) { \
|
||||
if((uint32_t)(c)<=0xffff) { \
|
||||
(s)[(i)++]=(uint16_t)(c); \
|
||||
} else { \
|
||||
|
@ -158,178 +282,323 @@
|
|||
} \
|
||||
}
|
||||
|
||||
#define UTF16_FWD_1_UNSAFE(s, i) { \
|
||||
if(UTF_IS_FIRST_SURROGATE((s)[(i)++])) { \
|
||||
/**
|
||||
* Append a code point to a string, overwriting 1 or 2 code units.
|
||||
* The offset points to the current end of the string contents
|
||||
* and is advanced (post-increment).
|
||||
* "Safe" macro, checks for a valid code point.
|
||||
* If a surrogate pair is written, checks for sufficient space in the string.
|
||||
* If the code point is not valid or a trail surrogate does not fit,
|
||||
* then isError is set to TRUE.
|
||||
*
|
||||
* @param s UChar * string buffer (written to, so it must not be const)
|
||||
* @param i string offset, i<capacity
|
||||
* @param capacity size of the string buffer
|
||||
* @param c code point to append
|
||||
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
|
||||
* @see U16_APPEND_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_APPEND(s, i, capacity, c, isError) { \
|
||||
if((uint32_t)(c)<=0xffff) { \
|
||||
(s)[(i)++]=(uint16_t)(c); \
|
||||
} else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
|
||||
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
|
||||
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
|
||||
} else /* c>0x10ffff or not enough space */ { \
|
||||
(isError)=TRUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance the string offset from one code point boundary to the next.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @see U16_FWD_1
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_FWD_1_UNSAFE(s, i) { \
|
||||
if(U16_IS_LEAD((s)[(i)++])) { \
|
||||
++(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_FWD_N_UNSAFE(s, i, n) { \
|
||||
/**
|
||||
* Advance the string offset from one code point boundary to the next.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset, i<length
|
||||
* @param length string length
|
||||
* @see U16_FWD_1_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_FWD_1(s, i, length) { \
|
||||
if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance the string offset from one code point boundary to the n-th next one,
|
||||
* i.e., move forward by n code points.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @param n number of code points to skip
|
||||
* @see U16_FWD_N
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_FWD_N_UNSAFE(s, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0) { \
|
||||
UTF16_FWD_1_UNSAFE(s, i); \
|
||||
U16_FWD_1_UNSAFE(s, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a random-access offset and adjust it so that
|
||||
* it points to the beginning of a Unicode character.
|
||||
* The offset that is passed in points to
|
||||
* any code unit of a code point
|
||||
* and will point to the first code unit after
|
||||
* the macro invocation.
|
||||
* Never increments the offset.
|
||||
* Advance the string offset from one code point boundary to the n-th next one,
|
||||
* i.e., move forward by n code points.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset, i<length
|
||||
* @param length string length
|
||||
* @param n number of code points to skip
|
||||
* @see U16_FWD_N_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF16_SET_CHAR_START_UNSAFE(s, i) { \
|
||||
if(UTF_IS_SECOND_SURROGATE((s)[i])) { \
|
||||
--(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
/* safe versions with error-checking and optional regularity-checking */
|
||||
|
||||
#define UTF16_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
if(UTF_IS_FIRST_SURROGATE(c)) { \
|
||||
uint16_t __c2; \
|
||||
if((i)<(length) && UTF_IS_SECOND_SURROGATE(__c2=(s)[(i)])) { \
|
||||
++(i); \
|
||||
(c)=UTF16_GET_PAIR_VALUE((c), __c2); \
|
||||
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
|
||||
} else if(strict) {\
|
||||
/* unmatched first surrogate */ \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
|
||||
/* unmatched second surrogate or other non-character */ \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_APPEND_CHAR_SAFE(s, i, length, c) { \
|
||||
if((uint32_t)(c)<=0xffff) { \
|
||||
(s)[(i)++]=(uint16_t)(c); \
|
||||
} else if((uint32_t)(c)<=0x10ffff) { \
|
||||
if((i)+1<(length)) { \
|
||||
(s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
|
||||
(s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
|
||||
} else /* not enough space */ { \
|
||||
(s)[(i)++]=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
} else /* c>0x10ffff, write error value */ { \
|
||||
(s)[(i)++]=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_FWD_1_SAFE(s, i, length) { \
|
||||
if(UTF_IS_FIRST_SURROGATE((s)[(i)++]) && (i)<(length) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
|
||||
++(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_FWD_N_SAFE(s, i, length, n) { \
|
||||
#define U16_FWD_N(s, i, length, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0 && (i)<(length)) { \
|
||||
UTF16_FWD_1_SAFE(s, i, length); \
|
||||
U16_FWD_1(s, i, length); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_SET_CHAR_START_SAFE(s, start, i) { \
|
||||
if(UTF_IS_SECOND_SURROGATE((s)[i]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary
|
||||
* at the start of a code point.
|
||||
* If the offset points to the trail surrogate of a surrogate pair,
|
||||
* then the offset is decremented.
|
||||
* Otherwise, it is not modified.
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @see U16_SET_CP_START
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_SET_CP_START_UNSAFE(s, i) { \
|
||||
if(U16_IS_TRAIL((s)[i])) { \
|
||||
--(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary
|
||||
* at the start of a code point.
|
||||
* If the offset points to the trail surrogate of a surrogate pair,
|
||||
* then the offset is decremented.
|
||||
* Otherwise, it is not modified.
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i
|
||||
* @see U16_SET_CP_START_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_SET_CP_START(s, start, i) { \
|
||||
if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
|
||||
--(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
/* definitions with backward iteration -------------------------------------- */
|
||||
|
||||
/*
|
||||
* all the macros that go backward assume that
|
||||
* the valid buffer range starts at offset 0
|
||||
* and that the initial offset is 0<i<=length;
|
||||
* they update the offset
|
||||
*/
|
||||
|
||||
/* fast versions, no error-checking */
|
||||
|
||||
/**
|
||||
* Get a single code point from an offset that points behind the last
|
||||
* of the code units that belong to that code point.
|
||||
* Assume 0<=i<length.
|
||||
* Move the string offset from one code point boundary to the previous one
|
||||
* and get the code point between them.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* The input offset may be the same as the string length.
|
||||
* If the offset is behind a trail surrogate unit
|
||||
* for a supplementary code point, then the macro will read
|
||||
* the preceding lead surrogate as well.
|
||||
* If the offset is behind a lead surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
* The result is undefined if the offset is behind a single, unpaired trail surrogate.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @param c output UChar32 variable
|
||||
* @see U16_PREV
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF16_PREV_CHAR_UNSAFE(s, i, c) { \
|
||||
#define U16_PREV_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[--(i)]; \
|
||||
if(UTF_IS_SECOND_SURROGATE(c)) { \
|
||||
(c)=UTF16_GET_PAIR_VALUE((s)[--(i)], (c)); \
|
||||
if(U16_IS_TRAIL(c)) { \
|
||||
(c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_BACK_1_UNSAFE(s, i) { \
|
||||
if(UTF_IS_SECOND_SURROGATE((s)[--(i)])) { \
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one
|
||||
* and get the code point between them.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* The input offset may be the same as the string length.
|
||||
* If the offset is behind a trail surrogate unit
|
||||
* for a supplementary code point, then the macro will read
|
||||
* the preceding lead surrogate as well.
|
||||
* If the offset is behind a lead surrogate or behind a single, unpaired
|
||||
* trail surrogate, then that itself
|
||||
* will be returned as the code point.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<i (may be equal to the string length)
|
||||
* @param c output UChar32 variable
|
||||
* @see U16_PREV_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_PREV(s, start, i, c) { \
|
||||
(c)=(s)[--(i)]; \
|
||||
if(U16_IS_TRAIL(c)) { \
|
||||
uint16_t __c2; \
|
||||
if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
|
||||
--(i); \
|
||||
(c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @see U16_BACK_1
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_BACK_1_UNSAFE(s, i) { \
|
||||
if(U16_IS_TRAIL((s)[--(i)])) { \
|
||||
--(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_BACK_N_UNSAFE(s, i, n) { \
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<i
|
||||
* @see U16_BACK_1_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_BACK_1(s, start, i) { \
|
||||
if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
|
||||
--(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the n-th one before it,
|
||||
* i.e., move backward by n code points.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @param n number of code points to skip
|
||||
* @see U16_BACK_N
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_BACK_N_UNSAFE(s, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0) { \
|
||||
UTF16_BACK_1_UNSAFE(s, i); \
|
||||
U16_BACK_1_UNSAFE(s, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Set a random-access offset and adjust it so that
|
||||
* it points after the end of a Unicode character.
|
||||
* The offset that is passed in points behind
|
||||
* any code unit of a code point
|
||||
* and will point behind the last code unit after
|
||||
* the macro invocation.
|
||||
* Never decrements the offset.
|
||||
* Move the string offset from one code point boundary to the n-th one before it,
|
||||
* i.e., move backward by n code points.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i
|
||||
* @param n number of code points to skip
|
||||
* @see U16_BACK_N_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF16_SET_CHAR_LIMIT_UNSAFE(s, i) { \
|
||||
if(UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
|
||||
#define U16_BACK_N(s, start, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0 && (i)>(start)) { \
|
||||
U16_BACK_1(s, start, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary after a code point.
|
||||
* If the offset is behind the lead surrogate of a surrogate pair,
|
||||
* then the offset is incremented.
|
||||
* Otherwise, it is not modified.
|
||||
* The input offset may be the same as the string length.
|
||||
* "Unsafe" macro, assumes well-formed UTF-16.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param i string offset
|
||||
* @see U16_SET_CP_LIMIT
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
|
||||
if(U16_IS_LEAD((s)[(i)-1])) { \
|
||||
++(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
/* safe versions with error-checking and optional regularity-checking */
|
||||
|
||||
#define UTF16_PREV_CHAR_SAFE(s, start, i, c, strict) { \
|
||||
(c)=(s)[--(i)]; \
|
||||
if(UTF_IS_SECOND_SURROGATE(c)) { \
|
||||
uint16_t __c2; \
|
||||
if((i)>(start) && UTF_IS_FIRST_SURROGATE(__c2=(s)[(i)-1])) { \
|
||||
--(i); \
|
||||
(c)=UTF16_GET_PAIR_VALUE(__c2, (c)); \
|
||||
/* strict: ((c)&0xfffe)==0xfffe is caught by UTF_IS_ERROR() and UTF_IS_UNICODE_CHAR() */ \
|
||||
} else if(strict) {\
|
||||
/* unmatched second surrogate */ \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) { \
|
||||
/* unmatched first surrogate or other non-character */ \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_BACK_1_SAFE(s, start, i) { \
|
||||
if(UTF_IS_SECOND_SURROGATE((s)[--(i)]) && (i)>(start) && UTF_IS_FIRST_SURROGATE((s)[(i)-1])) { \
|
||||
--(i); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_BACK_N_SAFE(s, start, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0 && (i)>(start)) { \
|
||||
UTF16_BACK_1_SAFE(s, start, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF16_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
|
||||
if((start)<(i) && (i)<(length) && UTF_IS_FIRST_SURROGATE((s)[(i)-1]) && UTF_IS_SECOND_SURROGATE((s)[i])) { \
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary after a code point.
|
||||
* If the offset is behind the lead surrogate of a surrogate pair,
|
||||
* then the offset is incremented.
|
||||
* Otherwise, it is not modified.
|
||||
* The input offset may be the same as the string length.
|
||||
* "Safe" macro, handles unpaired surrogates and checks for string boundaries.
|
||||
*
|
||||
* @param s const UChar * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i<=length
|
||||
* @param length string length
|
||||
* @see U16_SET_CP_LIMIT_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U16_SET_CP_LIMIT(s, start, i, length) { \
|
||||
if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
} \
|
||||
}
|
||||
|
|
|
@ -14,146 +14,10 @@
|
|||
* created by: Markus W. Scherer
|
||||
*/
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: UTF-32 macros
|
||||
*
|
||||
* This file defines macros to deal with UTF-32 code units and code points.
|
||||
* Signatures and semantics are the same as for the similarly named macros
|
||||
* in utf16.h.
|
||||
* utf32.h is included by utf.h after unicode/umachine.h</p>
|
||||
* and some common definitions.
|
||||
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
|
||||
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||
* bodies and all macro statements should be terminated with semicolon.</p>
|
||||
*/
|
||||
|
||||
#ifndef __UTF32_H__
|
||||
#define __UTF32_H__
|
||||
|
||||
/* internal definitions ----------------------------------------------------- */
|
||||
|
||||
#define UTF32_IS_SAFE(c, strict) \
|
||||
(!(strict) ? \
|
||||
(uint32_t)(c)<=0x10ffff : \
|
||||
UTF_IS_UNICODE_CHAR(c))
|
||||
|
||||
/*
|
||||
* For the semantics of all of these macros, see utf16.h.
|
||||
* The UTF-32 versions are trivial because any code point is
|
||||
* encoded using exactly one code unit.
|
||||
* \file
|
||||
* \brief C API: UTF-32 macros
|
||||
*
|
||||
* This file is deprecated and its contents moved to utf_old.h.
|
||||
* See utf_old.h and Jitterbug 2150 and its discussion on the ICU mailing list
|
||||
* in September 2002.
|
||||
*/
|
||||
|
||||
/* single-code point definitions -------------------------------------------- */
|
||||
|
||||
/* classes of code unit values */
|
||||
#define UTF32_IS_SINGLE(uchar) 1
|
||||
#define UTF32_IS_LEAD(uchar) 0
|
||||
#define UTF32_IS_TRAIL(uchar) 0
|
||||
|
||||
/* number of code units per code point */
|
||||
#define UTF32_NEED_MULTIPLE_UCHAR(c) 0
|
||||
#define UTF32_CHAR_LENGTH(c) 1
|
||||
#define UTF32_MAX_CHAR_LENGTH 1
|
||||
|
||||
/* average number of code units compared to UTF-16 */
|
||||
#define UTF32_ARRAY_SIZE(size) (size)
|
||||
|
||||
#define UTF32_GET_CHAR_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[i]; \
|
||||
}
|
||||
|
||||
#define UTF32_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
|
||||
(c)=(s)[i]; \
|
||||
if(!UTF32_IS_SAFE(c, strict)) { \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
/* definitions with forward iteration --------------------------------------- */
|
||||
|
||||
#define UTF32_NEXT_CHAR_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
}
|
||||
|
||||
#define UTF32_APPEND_CHAR_UNSAFE(s, i, c) { \
|
||||
(s)[(i)++]=(c); \
|
||||
}
|
||||
|
||||
#define UTF32_FWD_1_UNSAFE(s, i) { \
|
||||
++(i); \
|
||||
}
|
||||
|
||||
#define UTF32_FWD_N_UNSAFE(s, i, n) { \
|
||||
(i)+=(n); \
|
||||
}
|
||||
|
||||
#define UTF32_SET_CHAR_START_UNSAFE(s, i) { \
|
||||
}
|
||||
|
||||
#define UTF32_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
if(!UTF32_IS_SAFE(c, strict)) { \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF32_APPEND_CHAR_SAFE(s, i, length, c) { \
|
||||
if((uint32_t)(c)<=0x10ffff) { \
|
||||
(s)[(i)++]=(c); \
|
||||
} else /* c>0x10ffff, write 0xfffd */ { \
|
||||
(s)[(i)++]=0xfffd; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF32_FWD_1_SAFE(s, i, length) { \
|
||||
++(i); \
|
||||
}
|
||||
|
||||
#define UTF32_FWD_N_SAFE(s, i, length, n) { \
|
||||
if(((i)+=(n))>(length)) { \
|
||||
(i)=(length); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF32_SET_CHAR_START_SAFE(s, start, i) { \
|
||||
}
|
||||
|
||||
/* definitions with backward iteration -------------------------------------- */
|
||||
|
||||
#define UTF32_PREV_CHAR_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[--(i)]; \
|
||||
}
|
||||
|
||||
#define UTF32_BACK_1_UNSAFE(s, i) { \
|
||||
--(i); \
|
||||
}
|
||||
|
||||
#define UTF32_BACK_N_UNSAFE(s, i, n) { \
|
||||
(i)-=(n); \
|
||||
}
|
||||
|
||||
#define UTF32_SET_CHAR_LIMIT_UNSAFE(s, i) { \
|
||||
}
|
||||
|
||||
#define UTF32_PREV_CHAR_SAFE(s, start, i, c, strict) { \
|
||||
(c)=(s)[--(i)]; \
|
||||
if(!UTF32_IS_SAFE(c, strict)) { \
|
||||
(c)=UTF_ERROR_VALUE; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF32_BACK_1_SAFE(s, start, i) { \
|
||||
--(i); \
|
||||
}
|
||||
|
||||
#define UTF32_BACK_N_SAFE(s, start, i, n) { \
|
||||
(i)-=(n); \
|
||||
if((i)<(start)) { \
|
||||
(i)=(start); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF32_SET_CHAR_LIMIT_SAFE(s, i, length) { \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -15,23 +15,25 @@
|
|||
*/
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: UTF-8 macros
|
||||
*
|
||||
* This file defines macros to deal with UTF-8 code units and code points.
|
||||
* Signatures and semantics are the same as for the similarly named macros
|
||||
* in utf16.h.
|
||||
* utf8.h is included by utf.h after unicode/umachine.h
|
||||
* and some common definitions.</p>
|
||||
* <p><b>Usage:</b> ICU coding guidelines for if() statements should be followed when using these macros.
|
||||
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||
* bodies and all macro statements should be terminated with semicolon.</p>
|
||||
*/
|
||||
|
||||
* \file
|
||||
* \brief C API: 8-bit Unicode handling macros
|
||||
*
|
||||
* This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings.
|
||||
* utf8.h is included by utf.h after unicode/umachine.h
|
||||
* and some common definitions.
|
||||
*
|
||||
* For more information see utf.h and the ICU User Guide Strings chapter
|
||||
* (http://oss.software.ibm.com/icu/userguide/).
|
||||
*
|
||||
* <em>Usage:</em>
|
||||
* ICU coding guidelines for if() statements should be followed when using these macros.
|
||||
* Compound statements (curly braces {}) must be used for if-else-while...
|
||||
* bodies and all macro statements should be terminated with semicolon.
|
||||
*/
|
||||
|
||||
/* utf.h must be included first. */
|
||||
#ifndef __UTF_H__
|
||||
# include "unicode/utf.h"
|
||||
# include "unicode/utf.h"
|
||||
#endif
|
||||
|
||||
#ifndef __UTF8_H__
|
||||
|
@ -39,6 +41,12 @@
|
|||
|
||||
/* internal definitions ----------------------------------------------------- */
|
||||
|
||||
/**
|
||||
* \var utf8_countTrailBytes
|
||||
* Internal array with numbers of trail bytes for any given byte used in
|
||||
* lead byte position.
|
||||
* @internal
|
||||
*/
|
||||
#ifdef U_UTF8_IMPL
|
||||
U_CAPI const uint8_t
|
||||
utf8_countTrailBytes[256];
|
||||
|
@ -48,114 +56,166 @@ utf8_countTrailBytes[256];
|
|||
#endif
|
||||
|
||||
/**
|
||||
* Count the trail bytes for a lead byte -
|
||||
* this macro should be used so that the assembler code
|
||||
* that is mentioned in utf_impl.c could be used here.
|
||||
* Count the trail bytes for a UTF-8 lead byte.
|
||||
* @internal
|
||||
*/
|
||||
#define UTF8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)leadByte])
|
||||
/* The whole argument is parenthesized before the cast so that expression arguments are reduced to one byte before indexing the 256-entry table. */
#define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[(uint8_t)(leadByte)])
|
||||
|
||||
/* use a macro here, too - there may be a simpler way with some machines */
|
||||
#define UTF8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
|
||||
/**
|
||||
* Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value.
|
||||
* @internal
|
||||
*/
|
||||
#define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
|
||||
|
||||
/**
|
||||
* Function for handling "next code point" with error-checking.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError);
|
||||
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict);
|
||||
|
||||
/**
|
||||
* Function for handling "append code point" with error-checking.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c);
|
||||
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError);
|
||||
|
||||
/**
|
||||
* Function for handling "previous code point" with error-checking.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict);
|
||||
|
||||
/**
|
||||
* Function for handling "skip backward one code point" with error-checking.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
||||
|
||||
/*
|
||||
* For the semantics of all of these macros, see utf16.h.
|
||||
* The UTF-8 macros favor sequences more the shorter they are.
|
||||
* Sometimes, only the single-byte case is covered by a macro,
|
||||
* while longer sequences are handled by a function call.
|
||||
*/
|
||||
|
||||
/* single-code point definitions -------------------------------------------- */
|
||||
|
||||
/** Is this this code point a single code unit (byte)? */
|
||||
#define UTF8_IS_SINGLE(uchar) (((uchar)&0x80)==0)
|
||||
/** Is this this code unit the lead code unit (byte) of a code point? */
|
||||
#define UTF8_IS_LEAD(uchar) ((uint8_t)((uchar)-0xc0)<0x3e)
|
||||
/** Is this this code unit a trailing code unit (byte) of a code point? */
|
||||
#define UTF8_IS_TRAIL(uchar) (((uchar)&0xc0)==0x80)
|
||||
|
||||
/** Does this scalar Unicode value need multiple code units for storage? */
|
||||
#define UTF8_NEED_MULTIPLE_UCHAR(c) ((uint32_t)(c)>0x7f)
|
||||
/**
|
||||
* Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_IS_SINGLE(c) (((c)&0x80)==0)
|
||||
|
||||
/**
|
||||
* Given the lead character, how many bytes are taken by this code point.
|
||||
* ICU does not deal with code points >0x10ffff
|
||||
* unless necessary for advancing in the byte stream.
|
||||
*
|
||||
* These length macros take into account that for values >0x10ffff
|
||||
* the "safe" append macros would write the error code point 0xffff
|
||||
* with 3 bytes.
|
||||
* Code point comparisons need to be in uint32_t because UChar32
|
||||
* may be a signed type, and negative values must be recognized.
|
||||
* Is this code unit (byte) a UTF-8 lead byte?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#if 1
|
||||
# define UTF8_CHAR_LENGTH(c) \
|
||||
((uint32_t)(c)<=0x7f ? 1 : \
|
||||
((uint32_t)(c)<=0x7ff ? 2 : \
|
||||
((uint32_t)((c)-0x10000)>0xfffff ? 3 : 4) \
|
||||
) \
|
||||
)
|
||||
#else
|
||||
# define UTF8_CHAR_LENGTH(c) \
|
||||
((uint32_t)(c)<=0x7f ? 1 : \
|
||||
((uint32_t)(c)<=0x7ff ? 2 : \
|
||||
((uint32_t)(c)<=0xffff ? 3 : \
|
||||
((uint32_t)(c)<=0x10ffff ? 4 : \
|
||||
((uint32_t)(c)<=0x3ffffff ? 5 : \
|
||||
((uint32_t)(c)<=0x7fffffff ? 6 : 3) \
|
||||
) \
|
||||
) \
|
||||
#define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e)
|
||||
|
||||
/**
|
||||
* Is this code unit (byte) a UTF-8 trail byte?
|
||||
* @param c 8-bit code unit (byte)
|
||||
* @return TRUE or FALSE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_IS_TRAIL(c) (((c)&0xc0)==0x80)
|
||||
|
||||
/**
|
||||
* How many code units (bytes) are used for the UTF-8 encoding
|
||||
* of this Unicode code point?
|
||||
* @param c 32-bit code point
|
||||
* @return 1..4, or 0 if c is a surrogate or not a Unicode code point
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_LENGTH(c) \
|
||||
((uint32_t)(c)<=0x7f ? 1 : \
|
||||
((uint32_t)(c)<=0x7ff ? 2 : \
|
||||
((uint32_t)(c)<=0xd7ff ? 3 : \
|
||||
((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \
|
||||
((uint32_t)(c)<=0xffff ? 3 : 4)\
|
||||
) \
|
||||
) \
|
||||
)
|
||||
#endif
|
||||
) \
|
||||
)
|
||||
|
||||
/** The maximum number of bytes per code point */
|
||||
#define UTF8_MAX_CHAR_LENGTH 4
|
||||
/**
|
||||
* The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
|
||||
* @return 4
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_MAX_LENGTH 4
|
||||
|
||||
/** Average number of code units compared to UTF-16 */
|
||||
#define UTF8_ARRAY_SIZE(size) ((5*(size))/2)
|
||||
|
||||
#define UTF8_GET_CHAR_UNSAFE(s, i, c) { \
|
||||
/**
|
||||
* Get a code point from a string at a random-access offset,
|
||||
* without changing the offset.
|
||||
* The offset may point to either the lead byte or one of the trail bytes
|
||||
* for a code point, in which case the macro will read all of the bytes
|
||||
* for the code point.
|
||||
* The result is undefined if the offset points to an illegal UTF-8
|
||||
* byte sequence.
|
||||
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @param c output UChar32 variable
|
||||
* @see U8_GET
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_GET_UNSAFE(s, i, c) { \
|
||||
int32_t __I=(int32_t)(i); \
|
||||
UTF8_SET_CHAR_START_UNSAFE(s, __I); \
|
||||
UTF8_NEXT_CHAR_UNSAFE(s, __I, c); \
|
||||
U8_SET_CP_START_UNSAFE(s, __I); \
|
||||
U8_NEXT_UNSAFE(s, __I, c); \
|
||||
}
|
||||
|
||||
#define UTF8_GET_CHAR_SAFE(s, start, i, length, c, strict) { \
|
||||
/**
|
||||
* Get a code point from a string at a random-access offset,
|
||||
* without changing the offset.
|
||||
* The offset may point to either the lead byte or one of the trail bytes
|
||||
* for a code point, in which case the macro will read all of the bytes
|
||||
* for the code point.
|
||||
* If the offset points to an illegal UTF-8 byte sequence, then
|
||||
* c is set to a negative value.
|
||||
* Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start starting string offset
|
||||
* @param i string offset, start<=i<length
|
||||
* @param length string length
|
||||
* @param c output UChar32 variable, set to <0 in case of an error
|
||||
* @see U8_GET_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_GET(s, start, i, length, c) { \
|
||||
int32_t __I=(int32_t)(i); \
|
||||
UTF8_SET_CHAR_START_SAFE(s, start, __I); \
|
||||
UTF8_NEXT_CHAR_SAFE(s, __I, length, c, strict); \
|
||||
U8_SET_CP_START(s, start, __I); \
|
||||
U8_NEXT(s, __I, length, c); \
|
||||
}
|
||||
|
||||
/* definitions with forward iteration --------------------------------------- */
|
||||
|
||||
/**
|
||||
* Read a Unicode scalar value from an array of UTF-8 bytes.
|
||||
* Only values <=0x10ffff are accepted, and if an error occurs,
|
||||
* then c will be set such that UTF_IS_ERROR(c).
|
||||
* The _UNSAFE macro is fast and does not check for errors.
|
||||
* The _SAFE macro checks for errors and optionally for
|
||||
* irregular sequences, too, i.e., for sequences that
|
||||
* are longer than necessary, such as <c0 80> instead of <0>.
|
||||
* The strict checks also check for non-characters.
|
||||
* Get a code point from a string at a code point boundary offset,
|
||||
* and advance the offset to the next code point boundary.
|
||||
* (Post-incrementing forward iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* The offset may point to the lead byte of a multi-byte sequence,
|
||||
* in which case the macro will read the whole sequence.
|
||||
* The result is undefined if the offset points to a trail byte
|
||||
* or an illegal UTF-8 sequence.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @param c output UChar32 variable
|
||||
* @see U8_NEXT
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF8_NEXT_CHAR_UNSAFE(s, i, c) { \
|
||||
#define U8_NEXT_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
if((uint8_t)((c)-0xc0)<0x35) { \
|
||||
uint8_t __count=UTF8_COUNT_TRAIL_BYTES(c); \
|
||||
UTF8_MASK_LEAD_BYTE(c, __count); \
|
||||
uint8_t __count=U8_COUNT_TRAIL_BYTES(c); \
|
||||
U8_MASK_LEAD_BYTE(c, __count); \
|
||||
switch(__count) { \
|
||||
/* each following branch falls through to the next one */ \
|
||||
case 3: \
|
||||
|
@ -170,7 +230,49 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
} \
|
||||
}
|
||||
|
||||
#define UTF8_APPEND_CHAR_UNSAFE(s, i, c) { \
|
||||
/**
|
||||
* Get a code point from a string at a code point boundary offset,
|
||||
* and advance the offset to the next code point boundary.
|
||||
* (Post-incrementing forward iteration.)
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* The offset may point to the lead byte of a multi-byte sequence,
|
||||
* in which case the macro will read the whole sequence.
|
||||
* If the offset points to a trail byte or an illegal UTF-8 sequence, then
|
||||
* c is set to a negative value.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset, i<length
|
||||
* @param length string length
|
||||
* @param c output UChar32 variable, set to <0 in case of an error
|
||||
* @see U8_NEXT_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_NEXT(s, i, length, c) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(U8_IS_LEAD(c)) { \
|
||||
(c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, -1); \
|
||||
} else { \
|
||||
(c)=U_SENTINEL; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Append a code point to a string, overwriting 1 to 4 bytes.
|
||||
* The offset points to the current end of the string contents
|
||||
* and is advanced (post-increment).
|
||||
* "Unsafe" macro, assumes a valid code point and sufficient space in the string.
|
||||
* Otherwise, the result is undefined.
|
||||
*
|
||||
* @param s uint8_t * string buffer
|
||||
* @param i string offset
|
||||
* @param c code point to append
|
||||
* @see U8_APPEND
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_APPEND_UNSAFE(s, i, c) { \
|
||||
if((uint32_t)(c)<=0x7f) { \
|
||||
(s)[(i)++]=(uint8_t)(c); \
|
||||
} else { \
|
||||
|
@ -189,74 +291,172 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
} \
|
||||
}
|
||||
|
||||
#define UTF8_FWD_1_UNSAFE(s, i) { \
|
||||
(i)+=1+UTF8_COUNT_TRAIL_BYTES((s)[i]); \
|
||||
}
|
||||
|
||||
#define UTF8_FWD_N_UNSAFE(s, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0) { \
|
||||
UTF8_FWD_1_UNSAFE(s, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_SET_CHAR_START_UNSAFE(s, i) { \
|
||||
while(UTF8_IS_TRAIL((s)[i])) { --(i); } \
|
||||
}
|
||||
|
||||
#define UTF8_NEXT_CHAR_SAFE(s, i, length, c, strict) { \
|
||||
(c)=(s)[(i)++]; \
|
||||
if((c)>=0x80) { \
|
||||
if(UTF8_IS_LEAD(c)) { \
|
||||
(c)=utf8_nextCharSafeBody(s, &(i), (int32_t)(length), c, strict, NULL); \
|
||||
} else { \
|
||||
(c)=UTF8_ERROR_VALUE_1; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_APPEND_CHAR_SAFE(s, i, length, c) { \
|
||||
/**
|
||||
* Append a code point to a string, overwriting 1 to 4 bytes.
|
||||
* The offset points to the current end of the string contents
|
||||
* and is advanced (post-increment).
|
||||
* "Safe" macro, checks for a valid code point.
|
||||
* If a non-ASCII code point is written, checks for sufficient space in the string.
|
||||
* If the code point is not valid or trail bytes do not fit,
|
||||
* then isError is set to TRUE.
|
||||
*
|
||||
* @param s uint8_t * string buffer
|
||||
* @param i string offset, i<length
|
||||
* @param length size (capacity) of the string buffer
|
||||
* @param c code point to append
|
||||
* @param isError output UBool set to TRUE if an error occurs, otherwise not modified
|
||||
* @see U8_APPEND_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_APPEND(s, i, length, c, isError) { \
|
||||
if((uint32_t)(c)<=0x7f) { \
|
||||
(s)[(i)++]=(uint8_t)(c); \
|
||||
} else { \
|
||||
(i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c); \
|
||||
(i)=utf8_appendCharSafeBody(s, (int32_t)(i), (int32_t)(length), c, &(isError)); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_FWD_1_SAFE(s, i, length) { \
|
||||
/**
|
||||
* Advance the string offset from one code point boundary to the next.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @see U8_FWD_1
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_1_UNSAFE(s, i) { \
|
||||
(i)+=1+U8_COUNT_TRAIL_BYTES((s)[i]); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance the string offset from one code point boundary to the next.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset, i<length
|
||||
* @param length string length
|
||||
* @see U8_FWD_1_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_1(s, i, length) { \
|
||||
uint8_t __b=(s)[(i)++]; \
|
||||
if(UTF8_IS_LEAD(__b)) { \
|
||||
uint8_t __count=UTF8_COUNT_TRAIL_BYTES(__b); \
|
||||
if(U8_IS_LEAD(__b)) { \
|
||||
uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \
|
||||
if((i)+__count>(length)) { \
|
||||
__count=(uint8_t)((length)-(i)); \
|
||||
} \
|
||||
while(__count>0 && UTF8_IS_TRAIL((s)[i])) { \
|
||||
while(__count>0 && U8_IS_TRAIL((s)[i])) { \
|
||||
++(i); \
|
||||
--__count; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_FWD_N_SAFE(s, i, length, n) { \
|
||||
/**
|
||||
* Advance the string offset from one code point boundary to the n-th next one,
|
||||
* i.e., move forward by n code points.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @param n number of code points to skip
|
||||
* @see U8_FWD_N
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_N_UNSAFE(s, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0 && (i)<(length)) { \
|
||||
UTF8_FWD_1_SAFE(s, i, length); \
|
||||
while(__N>0) { \
|
||||
U8_FWD_1_UNSAFE(s, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_SET_CHAR_START_SAFE(s, start, i) { \
|
||||
if(UTF8_IS_TRAIL((s)[(i)])) { \
|
||||
/**
|
||||
* Advance the string offset from one code point boundary to the n-th next one,
|
||||
* i.e., move forward by n code points.
|
||||
* (Post-incrementing iteration.)
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset, i<length
|
||||
* @param length string length
|
||||
* @param n number of code points to skip
|
||||
* @see U8_FWD_N_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_FWD_N(s, i, length, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0 && (i)<(length)) { \
|
||||
U8_FWD_1(s, i, length); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary
|
||||
* at the start of a code point.
|
||||
* If the offset points to a UTF-8 trail byte,
|
||||
* then the offset is moved backward to the corresponding lead byte.
|
||||
* Otherwise, it is not modified.
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @see U8_SET_CP_START
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_SET_CP_START_UNSAFE(s, i) { \
|
||||
while(U8_IS_TRAIL((s)[i])) { --(i); } \
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary
|
||||
* at the start of a code point.
|
||||
* If the offset points to a UTF-8 trail byte,
|
||||
* then the offset is moved backward to the corresponding lead byte.
|
||||
* Otherwise, it is not modified.
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i
|
||||
* @see U8_SET_CP_START_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_SET_CP_START(s, start, i) { \
|
||||
if(U8_IS_TRAIL((s)[(i)])) { \
|
||||
(i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
|
||||
} \
|
||||
}
|
||||
|
||||
/* definitions with backward iteration -------------------------------------- */
|
||||
|
||||
#define UTF8_PREV_CHAR_UNSAFE(s, i, c) { \
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one
|
||||
* and get the code point between them.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* The input offset may be the same as the string length.
|
||||
* If the offset is behind a multi-byte sequence, then the macro will read
|
||||
* the whole sequence.
|
||||
* If the offset is behind a lead byte, then that itself
|
||||
* will be returned as the code point.
|
||||
* The result is undefined if the offset is behind an illegal UTF-8 sequence.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @param c output UChar32 variable
|
||||
* @see U8_PREV
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_PREV_UNSAFE(s, i, c) { \
|
||||
(c)=(s)[--(i)]; \
|
||||
if(UTF8_IS_TRAIL(c)) { \
|
||||
if(U8_IS_TRAIL(c)) { \
|
||||
uint8_t __b, __count=1, __shift=6; \
|
||||
\
|
||||
/* c is a trail byte */ \
|
||||
|
@ -264,7 +464,7 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
for(;;) { \
|
||||
__b=(s)[--(i)]; \
|
||||
if(__b>=0xc0) { \
|
||||
UTF8_MASK_LEAD_BYTE(__b, __count); \
|
||||
U8_MASK_LEAD_BYTE(__b, __count); \
|
||||
(c)|=(UChar32)__b<<__shift; \
|
||||
break; \
|
||||
} else { \
|
||||
|
@ -276,57 +476,151 @@ utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i);
|
|||
} \
|
||||
}
|
||||
|
||||
#define UTF8_BACK_1_UNSAFE(s, i) { \
|
||||
while(UTF8_IS_TRAIL((s)[--(i)])) {} \
|
||||
}
|
||||
|
||||
#define UTF8_BACK_N_UNSAFE(s, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0) { \
|
||||
UTF8_BACK_1_UNSAFE(s, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_SET_CHAR_LIMIT_UNSAFE(s, i) { \
|
||||
UTF8_BACK_1_UNSAFE(s, i); \
|
||||
UTF8_FWD_1_UNSAFE(s, i); \
|
||||
}
|
||||
|
||||
#define UTF8_PREV_CHAR_SAFE(s, start, i, c, strict) { \
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one
|
||||
* and get the code point between them.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* The input offset may be the same as the string length.
|
||||
* If the offset is behind a multi-byte sequence, then the macro will read
|
||||
* the whole sequence.
|
||||
* If the offset is behind a lead byte, then that itself
|
||||
* will be returned as the code point.
|
||||
* If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i
|
||||
* @param length string length
|
||||
* @param c output UChar32 variable, set to <0 in case of an error
|
||||
* @see U8_PREV_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_PREV(s, start, i, c) { \
|
||||
(c)=(s)[--(i)]; \
|
||||
if((c)>=0x80) { \
|
||||
if((c)<=0xbf) { \
|
||||
(c)=utf8_prevCharSafeBody(s, start, &(i), c, strict); \
|
||||
(c)=utf8_prevCharSafeBody(s, start, &(i), c, -1); \
|
||||
} else { \
|
||||
(c)=UTF8_ERROR_VALUE_1; \
|
||||
(c)=U_SENTINEL; \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_BACK_1_SAFE(s, start, i) { \
|
||||
if(UTF8_IS_TRAIL((s)[--(i)])) { \
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @see U8_BACK_1
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_BACK_1_UNSAFE(s, i) { \
|
||||
while(U8_IS_TRAIL((s)[--(i)])) {} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the previous one.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i
|
||||
* @see U8_BACK_1_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_BACK_1(s, start, i) { \
|
||||
if(U8_IS_TRAIL((s)[--(i)])) { \
|
||||
(i)=utf8_back1SafeBody(s, start, (int32_t)(i)); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_BACK_N_SAFE(s, start, i, n) { \
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the n-th one before it,
|
||||
* i.e., move backward by n code points.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @param n number of code points to skip
|
||||
* @see U8_BACK_N
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_BACK_N_UNSAFE(s, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0 && (i)>(start)) { \
|
||||
UTF8_BACK_1_SAFE(s, start, i); \
|
||||
while(__N>0) { \
|
||||
U8_BACK_1_UNSAFE(s, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
* Need to use UTF8_FWD_1_SAFE() because UTF8_BACK_1_SAFE()
|
||||
* may have started from the middle of the sequence and not checked
|
||||
* all trail bytes.
|
||||
/**
|
||||
* Move the string offset from one code point boundary to the n-th one before it,
|
||||
* i.e., move backward by n code points.
|
||||
* (Pre-decrementing backward iteration.)
|
||||
* The input offset may be the same as the string length.
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i
|
||||
* @param n number of code points to skip
|
||||
* @see U8_BACK_N_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define UTF8_SET_CHAR_LIMIT_SAFE(s, start, i, length) { \
|
||||
#define U8_BACK_N(s, start, i, n) { \
|
||||
int32_t __N=(n); \
|
||||
while(__N>0 && (i)>(start)) { \
|
||||
U8_BACK_1(s, start, i); \
|
||||
--__N; \
|
||||
} \
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary after a code point.
|
||||
* If the offset is behind a partial multi-byte sequence,
|
||||
* then the offset is incremented to behind the whole sequence.
|
||||
* Otherwise, it is not modified.
|
||||
* The input offset may be the same as the string length.
|
||||
* "Unsafe" macro, assumes well-formed UTF-8.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param i string offset
|
||||
* @see U8_SET_CP_LIMIT
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_SET_CP_LIMIT_UNSAFE(s, i) { \
|
||||
U8_BACK_1_UNSAFE(s, i); \
|
||||
U8_FWD_1_UNSAFE(s, i); \
|
||||
}
|
||||
|
||||
/**
|
||||
* Adjust a random-access offset to a code point boundary after a code point.
|
||||
* If the offset is behind a partial multi-byte sequence,
|
||||
* then the offset is incremented to behind the whole sequence.
|
||||
* Otherwise, it is not modified.
|
||||
* The input offset may be the same as the string length.
|
||||
* "Safe" macro, checks for illegal sequences and for string boundaries.
|
||||
*
|
||||
* @param s const uint8_t * string
|
||||
* @param start starting string offset (usually 0)
|
||||
* @param i string offset, start<=i<=length
|
||||
* @param length string length
|
||||
* @see U8_SET_CP_LIMIT_UNSAFE
|
||||
* @draft ICU 2.4
|
||||
*/
|
||||
#define U8_SET_CP_LIMIT(s, start, i, length) { \
|
||||
if((start)<(i) && (i)<(length)) { \
|
||||
UTF8_BACK_1_SAFE(s, start, i); \
|
||||
UTF8_FWD_1_SAFE(s, i, length); \
|
||||
U8_BACK_1(s, start, i); \
|
||||
U8_FWD_1(s, i, length); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
|
1153
icu4c/source/common/unicode/utf_old.h
Normal file
1153
icu4c/source/common/unicode/utf_old.h
Normal file
File diff suppressed because it is too large
Load diff
|
@ -198,7 +198,8 @@ UnicodeString::UnicodeString(UChar32 ch)
|
|||
fFlags(kShortString)
|
||||
{
|
||||
int32_t i = 0;
|
||||
UTF_APPEND_CHAR(fStackBuffer, i, US_STACKBUF_SIZE, ch);
|
||||
UBool isError = FALSE;
|
||||
U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
|
||||
fLength = i;
|
||||
}
|
||||
|
||||
|
|
|
@ -157,140 +157,118 @@ u_strchr32(const UChar *s, UChar32 c) {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Match each code point in a string against each code point in the matchSet.
|
||||
* Return the index of the first string code point that
|
||||
* is (polarity==TRUE) or is not (FALSE) contained in the matchSet.
|
||||
* Return -(string length)-1 if there is no such code point.
|
||||
*/
|
||||
static int32_t
|
||||
_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) {
|
||||
int32_t matchLen, matchBMPLen, strItr, matchItr;
|
||||
UChar32 stringCh, matchCh;
|
||||
UChar c, c2;
|
||||
|
||||
/* first part of matchSet contains only BMP code points */
|
||||
matchBMPLen = 0;
|
||||
while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
|
||||
++matchBMPLen;
|
||||
}
|
||||
|
||||
/* second part of matchSet contains BMP and supplementary code points */
|
||||
matchLen = matchBMPLen;
|
||||
while(matchSet[matchLen] != 0) {
|
||||
++matchLen;
|
||||
}
|
||||
|
||||
for(strItr = 0; (c = string[strItr]) != 0;) {
|
||||
++strItr;
|
||||
if(U16_IS_SINGLE(c)) {
|
||||
if(polarity) {
|
||||
for(matchItr = 0; matchItr < matchLen; ++matchItr) {
|
||||
if(c == matchSet[matchItr]) {
|
||||
return strItr - 1; /* one matches */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for(matchItr = 0; matchItr < matchLen; ++matchItr) {
|
||||
if(c == matchSet[matchItr]) {
|
||||
goto endloop;
|
||||
}
|
||||
}
|
||||
return strItr - 1; /* none matches */
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* No need to check for string length before U16_IS_TRAIL
|
||||
* because c2 could at worst be the terminating NUL.
|
||||
*/
|
||||
if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
|
||||
++strItr;
|
||||
stringCh = U16_GET_SUPPLEMENTARY(c, c2);
|
||||
} else {
|
||||
stringCh = c; /* unpaired trail surrogate */
|
||||
}
|
||||
|
||||
if(polarity) {
|
||||
for(matchItr = matchBMPLen; matchItr < matchLen;) {
|
||||
U16_NEXT(matchSet, matchItr, matchLen, matchCh);
|
||||
if(stringCh == matchCh) {
|
||||
return strItr - U16_LENGTH(stringCh); /* one matches */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for(matchItr = matchBMPLen; matchItr < matchLen;) {
|
||||
U16_NEXT(matchSet, matchItr, matchLen, matchCh);
|
||||
if(stringCh == matchCh) {
|
||||
goto endloop;
|
||||
}
|
||||
}
|
||||
return strItr - U16_LENGTH(stringCh); /* none matches */
|
||||
}
|
||||
}
|
||||
endloop:
|
||||
/* wish C had continue with labels like Java... */;
|
||||
}
|
||||
|
||||
/* Didn't find it. */
|
||||
return -strItr-1;
|
||||
}
|
||||
|
||||
/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
|
||||
U_CAPI UChar * U_EXPORT2
|
||||
u_strpbrk(const UChar *string, const UChar *matchSet)
|
||||
{
|
||||
int32_t matchLen;
|
||||
UBool single = TRUE;
|
||||
|
||||
for (matchLen = 0; matchSet[matchLen]; matchLen++)
|
||||
{
|
||||
if (!UTF_IS_SINGLE(matchSet[matchLen]))
|
||||
{
|
||||
single = FALSE;
|
||||
}
|
||||
int32_t index = _matchFromSet(string, matchSet, TRUE);
|
||||
if(index >= 0) {
|
||||
return (UChar *)string + index;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (single)
|
||||
{
|
||||
const UChar *matchItr;
|
||||
const UChar *strItr;
|
||||
|
||||
for (strItr = string; *strItr; strItr++)
|
||||
{
|
||||
for (matchItr = matchSet; *matchItr; matchItr++)
|
||||
{
|
||||
if (*matchItr == *strItr)
|
||||
{
|
||||
return (UChar *)strItr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
int32_t matchItr;
|
||||
int32_t strItr;
|
||||
UChar32 stringCh, matchSetCh;
|
||||
int32_t stringLen = u_strlen(string);
|
||||
|
||||
for (strItr = 0; strItr < stringLen; strItr++)
|
||||
{
|
||||
UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
|
||||
for (matchItr = 0; matchItr < matchLen; matchItr++)
|
||||
{
|
||||
UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
|
||||
if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
|
||||
|| string[strItr] == UTF_ERROR_VALUE
|
||||
|| (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
|
||||
{
|
||||
return (UChar *)string + strItr;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Didn't find it. */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Search for a codepoint in a string that matches one of the matchSet codepoints. */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_strcspn(const UChar *string, const UChar *matchSet)
|
||||
{
|
||||
const UChar *foundStr = u_strpbrk(string, matchSet);
|
||||
if (foundStr == NULL)
|
||||
{
|
||||
return u_strlen(string);
|
||||
int32_t index = _matchFromSet(string, matchSet, TRUE);
|
||||
if(index >= 0) {
|
||||
return index;
|
||||
} else {
|
||||
return -index - 1; /* == u_strlen(string) */
|
||||
}
|
||||
return foundStr - string;
|
||||
}
|
||||
|
||||
/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_strspn(const UChar *string, const UChar *matchSet)
|
||||
{
|
||||
UBool single = TRUE;
|
||||
UBool match = TRUE;
|
||||
int32_t matchLen;
|
||||
int32_t retValue;
|
||||
|
||||
for (matchLen = 0; matchSet[matchLen]; matchLen++)
|
||||
{
|
||||
if (!UTF_IS_SINGLE(matchSet[matchLen]))
|
||||
{
|
||||
single = FALSE;
|
||||
}
|
||||
int32_t index = _matchFromSet(string, matchSet, FALSE);
|
||||
if(index >= 0) {
|
||||
return index;
|
||||
} else {
|
||||
return -index - 1; /* == u_strlen(string) */
|
||||
}
|
||||
|
||||
if (single)
|
||||
{
|
||||
const UChar *matchItr;
|
||||
const UChar *strItr;
|
||||
|
||||
for (strItr = string; *strItr && match; strItr++)
|
||||
{
|
||||
match = FALSE;
|
||||
for (matchItr = matchSet; *matchItr; matchItr++)
|
||||
{
|
||||
if (*matchItr == *strItr)
|
||||
{
|
||||
match = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
retValue = strItr - string - (match == FALSE);
|
||||
}
|
||||
else
|
||||
{
|
||||
int32_t matchItr;
|
||||
int32_t strItr;
|
||||
UChar32 stringCh, matchSetCh;
|
||||
int32_t stringLen = u_strlen(string);
|
||||
|
||||
for (strItr = 0; strItr < stringLen && match; strItr++)
|
||||
{
|
||||
match = FALSE;
|
||||
UTF_GET_CHAR_SAFE(string, 0, strItr, stringLen, stringCh, TRUE);
|
||||
for (matchItr = 0; matchItr < matchLen; matchItr++)
|
||||
{
|
||||
UTF_GET_CHAR_SAFE(matchSet, 0, matchItr, matchLen, matchSetCh, TRUE);
|
||||
if (stringCh == matchSetCh && (stringCh != UTF_ERROR_VALUE
|
||||
|| string[strItr] == UTF_ERROR_VALUE
|
||||
|| (matchSetCh == UTF_ERROR_VALUE && !UTF_IS_SINGLE(matchSet[matchItr]))))
|
||||
{
|
||||
match = TRUE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
retValue = strItr - (match == FALSE);
|
||||
}
|
||||
|
||||
/* Found a mismatch or didn't find it. */
|
||||
return retValue;
|
||||
}
|
||||
|
||||
/* ----- Text manipulation functions --- */
|
||||
|
|
|
@ -228,7 +228,6 @@ u_strFromUTF8(UChar *dest,
|
|||
int32_t index = 0;
|
||||
int32_t reqLength = 0;
|
||||
uint8_t* pSrc = (uint8_t*) src;
|
||||
UBool isError;
|
||||
|
||||
/* args check */
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
|
||||
|
@ -249,8 +248,8 @@ u_strFromUTF8(UChar *dest,
|
|||
if(ch <=0x7f){
|
||||
*pDest++=(UChar)ch;
|
||||
}else{
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError);
|
||||
if(isError){
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
|
||||
if(ch<0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}else if(ch<=0xFFFF){
|
||||
|
@ -272,8 +271,8 @@ u_strFromUTF8(UChar *dest,
|
|||
if(ch <= 0x7f){
|
||||
reqLength++;
|
||||
}else{
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, FALSE, &isError);
|
||||
if(isError){
|
||||
ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -1);
|
||||
if(ch<0){
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -83,7 +83,7 @@ utf8_errorValue[6]={
|
|||
};
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict, UBool *pIsError) {
|
||||
utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict) {
|
||||
int32_t i=*pi;
|
||||
uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
|
||||
if((i)+count<=(length)) {
|
||||
|
@ -118,10 +118,11 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
|||
illegal|=(trail&0xc0)^0x80;
|
||||
break;
|
||||
case 0:
|
||||
if(pIsError!=NULL) {
|
||||
*pIsError=TRUE;
|
||||
if(strict>=0) {
|
||||
return UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
return UTF8_ERROR_VALUE_1;
|
||||
/* no default branch to optimize switch() - all values are covered */
|
||||
}
|
||||
|
||||
|
@ -132,6 +133,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
|||
* Starting with Unicode 3.0.1, non-shortest forms are illegal.
|
||||
* Starting with Unicode 3.2, surrogate code points must not be
|
||||
* encoded in UTF-8, and there are no irregular sequences any more.
|
||||
*
|
||||
* U8_ macros (new in ICU 2.4) return negative values for error conditions.
|
||||
*/
|
||||
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
||||
|
@ -145,21 +148,14 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
|||
++(i);
|
||||
--count;
|
||||
}
|
||||
c=utf8_errorValue[errorCount-count];
|
||||
if(pIsError!=NULL) {
|
||||
*pIsError=TRUE;
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[errorCount-count];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
} else if((strict) && UTF_IS_UNICODE_NONCHAR(c)) {
|
||||
} else if((strict)>0 && UTF_IS_UNICODE_NONCHAR(c)) {
|
||||
/* strict: forbid non-characters like U+fffe */
|
||||
c=utf8_errorValue[count];
|
||||
if(pIsError!=NULL) {
|
||||
*pIsError=TRUE;
|
||||
}
|
||||
} else {
|
||||
/* good result */
|
||||
if(pIsError!=NULL) {
|
||||
*pIsError=FALSE;
|
||||
}
|
||||
}
|
||||
} else /* too few bytes left */ {
|
||||
/* error handling */
|
||||
|
@ -168,9 +164,10 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
|||
while((i)<(length) && UTF8_IS_TRAIL(s[i])) {
|
||||
++(i);
|
||||
}
|
||||
c=utf8_errorValue[i-i0];
|
||||
if(pIsError!=NULL) {
|
||||
*pIsError=TRUE;
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[i-i0];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
}
|
||||
*pi=i;
|
||||
|
@ -178,8 +175,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
|||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
|
||||
if((c)<=0x7ff) {
|
||||
utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError) {
|
||||
if((uint32_t)(c)<=0x7ff) {
|
||||
if((i)+1<(length)) {
|
||||
(s)[(i)++]=(uint8_t)(((c)>>6)|0xc0);
|
||||
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
|
||||
|
@ -187,7 +184,7 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
|
|||
}
|
||||
} else if((uint32_t)(c)<=0xffff) {
|
||||
/* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
|
||||
if((i)+2<(length) && !UTF_IS_SURROGATE(c)) {
|
||||
if((i)+2<(length) && !U_IS_SURROGATE(c)) {
|
||||
(s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
|
||||
(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
|
||||
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
|
||||
|
@ -203,18 +200,22 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
|
|||
}
|
||||
}
|
||||
/* c>0x10ffff or not enough space, write an error value */
|
||||
length-=i;
|
||||
if(length>0) {
|
||||
int32_t offset;
|
||||
if(length>3) {
|
||||
length=3;
|
||||
if(pIsError!=NULL) {
|
||||
*pIsError=TRUE;
|
||||
} else {
|
||||
length-=i;
|
||||
if(length>0) {
|
||||
int32_t offset;
|
||||
if(length>3) {
|
||||
length=3;
|
||||
}
|
||||
s+=i;
|
||||
offset=0;
|
||||
c=utf8_errorValue[length-1];
|
||||
UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
|
||||
i=i+offset;
|
||||
}
|
||||
s+=i;
|
||||
offset=0;
|
||||
c=utf8_errorValue[length-1];
|
||||
UTF8_APPEND_CHAR_UNSAFE(s, offset, c);
|
||||
i=i+offset;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
|
@ -229,7 +230,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
|||
for(;;) {
|
||||
if(i<=start) {
|
||||
/* no lead byte at all */
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
if(strict>=0) {
|
||||
return UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
return U_SENTINEL;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -250,7 +255,11 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
|||
if(count>=4) {
|
||||
count=3;
|
||||
}
|
||||
c=utf8_errorValue[count];
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[count];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
} else {
|
||||
/* exit with correct c */
|
||||
}
|
||||
|
@ -260,9 +269,17 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
|||
include the trail byte that we started with */
|
||||
if(count<shouldCount) {
|
||||
*pi=i;
|
||||
c=utf8_errorValue[count];
|
||||
if(strict>=0) {
|
||||
c=utf8_errorValue[count];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
} else {
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
if(strict>=0) {
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -273,12 +290,20 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
|||
shift+=6;
|
||||
} else {
|
||||
/* more than 5 trail bytes is illegal */
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
if(strict>=0) {
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* single-byte character precedes trailing bytes */
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
if(strict>=0) {
|
||||
c=UTF8_ERROR_VALUE_1;
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -942,39 +942,39 @@ static void TestCodePoint(){
|
|||
UChar32 c=codePoint[i];
|
||||
log_verbose("Testing code unit value of \\u%4X\n", c);
|
||||
if(i<6){
|
||||
if(!UTF_IS_SURROGATE(c)){
|
||||
if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){
|
||||
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(UTF_IS_VALID(c)){
|
||||
log_err("ERROR: isValid() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(UTF_IS_UNICODE_CHAR(c)){
|
||||
if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
|
||||
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(UTF_IS_ERROR(c)){
|
||||
log_err("ERROR: isError() failed for \\u%4X\n", c);
|
||||
}
|
||||
}else if(i >=6 && i<18){
|
||||
if(UTF_IS_SURROGATE(c)){
|
||||
if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
|
||||
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(!UTF_IS_VALID(c)){
|
||||
log_err("ERROR: isValid() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(!UTF_IS_UNICODE_CHAR(c)){
|
||||
if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
|
||||
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(UTF_IS_ERROR(c)){
|
||||
log_err("ERROR: isError() failed for \\u%4X\n", c);
|
||||
}
|
||||
}else if(i >=18 && i<20){
|
||||
if(UTF_IS_SURROGATE(c)){
|
||||
if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
|
||||
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(UTF_IS_VALID(c)){
|
||||
log_err("ERROR: isValid() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(!UTF_IS_UNICODE_CHAR(c)){
|
||||
if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){
|
||||
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(!UTF_IS_ERROR(c)){
|
||||
|
@ -982,13 +982,13 @@ static void TestCodePoint(){
|
|||
}
|
||||
}
|
||||
else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){
|
||||
if(UTF_IS_SURROGATE(c)){
|
||||
if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){
|
||||
log_err("ERROR: isSurrogate() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(UTF_IS_VALID(c)){
|
||||
log_err("ERROR: isValid() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(UTF_IS_UNICODE_CHAR(c)){
|
||||
if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){
|
||||
log_err("ERROR: isUnicodeChar() failed for \\u%4X\n", c);
|
||||
}
|
||||
if(!UTF_IS_ERROR(c)){
|
||||
|
@ -1018,7 +1018,7 @@ static void TestCharLength()
|
|||
UBool multiple;
|
||||
for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){
|
||||
UChar32 c=codepoint[i+1];
|
||||
if(UTF_CHAR_LENGTH(c) != codepoint[i]){
|
||||
if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){
|
||||
log_err("The no: of code units for \\u%4X:- Expected: %d Got: %d", c, codepoint[i], UTF_CHAR_LENGTH(c));
|
||||
}else{
|
||||
log_verbose("The no: of code units for \\u%4X is %d", c, UTF_CHAR_LENGTH(c));
|
||||
|
@ -1457,7 +1457,6 @@ static void TestStringFunctions()
|
|||
|
||||
static void TestStringSearching()
|
||||
{
|
||||
UChar ucharBuf[255];
|
||||
const UChar testString[] = {0x0061, 0x0062, 0x0063, 0x0064, 0x0064, 0x0061, 0};
|
||||
const UChar testSurrogateString[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0x0063, 0x0064, 0x0064, 0xdbff, 0xdfff, 0xdb00, 0xdf00, 0x0061, 0};
|
||||
const UChar surrMatchSet1[] = {0xdbff, 0xdfff, 0};
|
||||
|
@ -1467,55 +1466,67 @@ static void TestStringSearching()
|
|||
const UChar surrMatchSetBad[] = {0xdbff, 0x0061, 0};
|
||||
const UChar surrMatchSetBad2[] = {0x0061, 0xdbff, 0};
|
||||
const UChar surrMatchSetBad3[] = {0xdbff, 0x0061, 0x0062, 0xdbff, 0xdfff, 0}; /* has partial surrogate */
|
||||
const UChar
|
||||
empty[] = { 0 },
|
||||
a[] = { 0x61, 0 },
|
||||
ab[] = { 0x61, 0x62, 0 },
|
||||
ba[] = { 0x62, 0x61, 0 },
|
||||
abcd[] = { 0x61, 0x62, 0x63, 0x64, 0 },
|
||||
cd[] = { 0x63, 0x64, 0 },
|
||||
dc[] = { 0x64, 0x63, 0 },
|
||||
cdh[] = { 0x63, 0x64, 0x68, 0 },
|
||||
f[] = { 0x66, 0 },
|
||||
fg[] = { 0x66, 0x67, 0 },
|
||||
gf[] = { 0x67, 0x66, 0 };
|
||||
|
||||
log_verbose("Testing u_strpbrk()");
|
||||
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "a")) != &testString[0]) {
|
||||
if (u_strpbrk(testString, a) != &testString[0]) {
|
||||
log_err("u_strpbrk couldn't find first letter a.\n");
|
||||
}
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "dc")) != &testString[2]) {
|
||||
if (u_strpbrk(testString, dc) != &testString[2]) {
|
||||
log_err("u_strpbrk couldn't find d or c.\n");
|
||||
}
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cd")) != &testString[2]) {
|
||||
if (u_strpbrk(testString, cd) != &testString[2]) {
|
||||
log_err("u_strpbrk couldn't find c or d.\n");
|
||||
}
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "cdh")) != &testString[2]) {
|
||||
if (u_strpbrk(testString, cdh) != &testString[2]) {
|
||||
log_err("u_strpbrk couldn't find c, d or h.\n");
|
||||
}
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "f")) != NULL) {
|
||||
if (u_strpbrk(testString, f) != NULL) {
|
||||
log_err("u_strpbrk didn't return NULL for \"f\".\n");
|
||||
}
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "fg")) != NULL) {
|
||||
if (u_strpbrk(testString, fg) != NULL) {
|
||||
log_err("u_strpbrk didn't return NULL for \"fg\".\n");
|
||||
}
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "gf")) != NULL) {
|
||||
if (u_strpbrk(testString, gf) != NULL) {
|
||||
log_err("u_strpbrk didn't return NULL for \"gf\".\n");
|
||||
}
|
||||
if (u_strpbrk(testString, u_uastrcpy(ucharBuf, "")) != NULL) {
|
||||
if (u_strpbrk(testString, empty) != NULL) {
|
||||
log_err("u_strpbrk didn't return NULL for \"\".\n");
|
||||
}
|
||||
|
||||
log_verbose("Testing u_strpbrk() with surrogates");
|
||||
|
||||
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != &testSurrogateString[1]) {
|
||||
if (u_strpbrk(testSurrogateString, a) != &testSurrogateString[1]) {
|
||||
log_err("u_strpbrk couldn't find first letter a.\n");
|
||||
}
|
||||
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != &testSurrogateString[5]) {
|
||||
if (u_strpbrk(testSurrogateString, dc) != &testSurrogateString[5]) {
|
||||
log_err("u_strpbrk couldn't find d or c.\n");
|
||||
}
|
||||
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != &testSurrogateString[5]) {
|
||||
if (u_strpbrk(testSurrogateString, cd) != &testSurrogateString[5]) {
|
||||
log_err("u_strpbrk couldn't find c or d.\n");
|
||||
}
|
||||
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != &testSurrogateString[5]) {
|
||||
if (u_strpbrk(testSurrogateString, cdh) != &testSurrogateString[5]) {
|
||||
log_err("u_strpbrk couldn't find c, d or h.\n");
|
||||
}
|
||||
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != NULL) {
|
||||
if (u_strpbrk(testSurrogateString, f) != NULL) {
|
||||
log_err("u_strpbrk didn't return NULL for \"f\".\n");
|
||||
}
|
||||
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != NULL) {
|
||||
if (u_strpbrk(testSurrogateString, fg) != NULL) {
|
||||
log_err("u_strpbrk didn't return NULL for \"fg\".\n");
|
||||
}
|
||||
if (u_strpbrk(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != NULL) {
|
||||
if (u_strpbrk(testSurrogateString, gf) != NULL) {
|
||||
log_err("u_strpbrk didn't return NULL for \"gf\".\n");
|
||||
}
|
||||
if (u_strpbrk(testSurrogateString, surrMatchSet1) != &testSurrogateString[3]) {
|
||||
|
@ -1536,49 +1547,49 @@ static void TestStringSearching()
|
|||
|
||||
log_verbose("Testing u_strcspn()");
|
||||
|
||||
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "a")) != 0) {
|
||||
if (u_strcspn(testString, a) != 0) {
|
||||
log_err("u_strcspn couldn't find first letter a.\n");
|
||||
}
|
||||
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "dc")) != 2) {
|
||||
if (u_strcspn(testString, dc) != 2) {
|
||||
log_err("u_strcspn couldn't find d or c.\n");
|
||||
}
|
||||
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cd")) != 2) {
|
||||
if (u_strcspn(testString, cd) != 2) {
|
||||
log_err("u_strcspn couldn't find c or d.\n");
|
||||
}
|
||||
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "cdh")) != 2) {
|
||||
if (u_strcspn(testString, cdh) != 2) {
|
||||
log_err("u_strcspn couldn't find c, d or h.\n");
|
||||
}
|
||||
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testString)) {
|
||||
if (u_strcspn(testString, f) != u_strlen(testString)) {
|
||||
log_err("u_strcspn didn't return NULL for \"f\".\n");
|
||||
}
|
||||
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testString)) {
|
||||
if (u_strcspn(testString, fg) != u_strlen(testString)) {
|
||||
log_err("u_strcspn didn't return NULL for \"fg\".\n");
|
||||
}
|
||||
if (u_strcspn(testString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testString)) {
|
||||
if (u_strcspn(testString, gf) != u_strlen(testString)) {
|
||||
log_err("u_strcspn didn't return NULL for \"gf\".\n");
|
||||
}
|
||||
|
||||
log_verbose("Testing u_strcspn() with surrogates");
|
||||
|
||||
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "a")) != 1) {
|
||||
if (u_strcspn(testSurrogateString, a) != 1) {
|
||||
log_err("u_strcspn couldn't find first letter a.\n");
|
||||
}
|
||||
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 5) {
|
||||
if (u_strcspn(testSurrogateString, dc) != 5) {
|
||||
log_err("u_strcspn couldn't find d or c.\n");
|
||||
}
|
||||
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 5) {
|
||||
if (u_strcspn(testSurrogateString, cd) != 5) {
|
||||
log_err("u_strcspn couldn't find c or d.\n");
|
||||
}
|
||||
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "cdh")) != 5) {
|
||||
if (u_strcspn(testSurrogateString, cdh) != 5) {
|
||||
log_err("u_strcspn couldn't find c, d or h.\n");
|
||||
}
|
||||
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != u_strlen(testSurrogateString)) {
|
||||
if (u_strcspn(testSurrogateString, f) != u_strlen(testSurrogateString)) {
|
||||
log_err("u_strcspn didn't return NULL for \"f\".\n");
|
||||
}
|
||||
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "fg")) != u_strlen(testSurrogateString)) {
|
||||
if (u_strcspn(testSurrogateString, fg) != u_strlen(testSurrogateString)) {
|
||||
log_err("u_strcspn didn't return NULL for \"fg\".\n");
|
||||
}
|
||||
if (u_strcspn(testSurrogateString, u_uastrcpy(ucharBuf, "gf")) != u_strlen(testSurrogateString)) {
|
||||
if (u_strcspn(testSurrogateString, gf) != u_strlen(testSurrogateString)) {
|
||||
log_err("u_strcspn didn't return NULL for \"gf\".\n");
|
||||
}
|
||||
if (u_strcspn(testSurrogateString, surrMatchSet1) != 3) {
|
||||
|
@ -1597,25 +1608,25 @@ static void TestStringSearching()
|
|||
|
||||
log_verbose("Testing u_strspn()");
|
||||
|
||||
if (u_strspn(testString, u_uastrcpy(ucharBuf, "a")) != 1) {
|
||||
if (u_strspn(testString, a) != 1) {
|
||||
log_err("u_strspn couldn't skip first letter a.\n");
|
||||
}
|
||||
if (u_strspn(testString, u_uastrcpy(ucharBuf, "ab")) != 2) {
|
||||
if (u_strspn(testString, ab) != 2) {
|
||||
log_err("u_strspn couldn't skip a or b.\n");
|
||||
}
|
||||
if (u_strspn(testString, u_uastrcpy(ucharBuf, "ba")) != 2) {
|
||||
if (u_strspn(testString, ba) != 2) {
|
||||
log_err("u_strspn couldn't skip a or b.\n");
|
||||
}
|
||||
if (u_strspn(testString, u_uastrcpy(ucharBuf, "f")) != 0) {
|
||||
if (u_strspn(testString, f) != 0) {
|
||||
log_err("u_strspn didn't return 0 for \"f\".\n");
|
||||
}
|
||||
if (u_strspn(testString, u_uastrcpy(ucharBuf, "dc")) != 0) {
|
||||
if (u_strspn(testString, dc) != 0) {
|
||||
log_err("u_strspn couldn't find first letter a (skip d or c).\n");
|
||||
}
|
||||
if (u_strspn(testString, u_uastrcpy(ucharBuf, "abcd")) != u_strlen(testString)) {
|
||||
if (u_strspn(testString, abcd) != u_strlen(testString)) {
|
||||
log_err("u_strspn couldn't skip over the whole string.\n");
|
||||
}
|
||||
if (u_strspn(testString, u_uastrcpy(ucharBuf, "")) != 0) {
|
||||
if (u_strspn(testString, empty) != 0) {
|
||||
log_err("u_strspn should have returned 0 for empty string.\n");
|
||||
}
|
||||
|
||||
|
@ -1626,13 +1637,13 @@ static void TestStringSearching()
|
|||
if (u_strspn(testSurrogateString, surrMatchSetBad2) != 2) {
|
||||
log_err("u_strspn couldn't skip 0xdbff or a.\n");
|
||||
}
|
||||
if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "f")) != 0) {
|
||||
if (u_strspn(testSurrogateString, f) != 0) {
|
||||
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
|
||||
}
|
||||
if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "dc")) != 0) {
|
||||
if (u_strspn(testSurrogateString, dc) != 0) {
|
||||
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
|
||||
}
|
||||
if (u_strspn(testSurrogateString, u_uastrcpy(ucharBuf, "cd")) != 0) {
|
||||
if (u_strspn(testSurrogateString, cd) != 0) {
|
||||
log_err("u_strspn couldn't skip d or c (skip first letter).\n");
|
||||
}
|
||||
if (u_strspn(testSurrogateString, testSurrogateString) != u_strlen(testSurrogateString)) {
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "cintltst.h"
|
||||
#include <stdio.h>
|
||||
|
||||
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
static void printUChars(const UChar *uchars);
|
||||
|
||||
|
@ -31,6 +32,7 @@ static void TestNextPrevChar(void);
|
|||
static void TestFwdBack(void);
|
||||
static void TestSetChar(void);
|
||||
static void TestAppendChar(void);
|
||||
static void TestAppend(void);
|
||||
static void TestSurrogate(void);
|
||||
|
||||
void addUTF16Test(TestNode** root);
|
||||
|
@ -45,6 +47,7 @@ addUTF16Test(TestNode** root)
|
|||
addTest(root, &TestFwdBack, "utf16tst/TestFwdBack" );
|
||||
addTest(root, &TestSetChar, "utf16tst/TestSetChar" );
|
||||
addTest(root, &TestAppendChar, "utf16tst/TestAppendChar" );
|
||||
addTest(root, &TestAppend, "utf8tst/TestAppend" );
|
||||
addTest(root, &TestSurrogate, "utf16tst/TestSurrogate" );
|
||||
}
|
||||
|
||||
|
@ -57,17 +60,17 @@ static void TestCodeUnitValues()
|
|||
UChar c=codeunit[i];
|
||||
log_verbose("Testing code unit value of %x\n", c);
|
||||
if(i<4){
|
||||
if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c)){
|
||||
if(!UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || UTF16_IS_TRAIL(c) || !U16_IS_SINGLE(c) || U16_IS_LEAD(c) || U16_IS_TRAIL(c)){
|
||||
log_err("ERROR: %x is a single character\n", c);
|
||||
}
|
||||
}
|
||||
if(i >= 4 && i< 8){
|
||||
if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c)){
|
||||
if(!UTF16_IS_LEAD(c) || UTF16_IS_SINGLE(c) || UTF16_IS_TRAIL(c) || !U16_IS_LEAD(c) || U16_IS_SINGLE(c) || U16_IS_TRAIL(c)){
|
||||
log_err("ERROR: %x is a first surrogate\n", c);
|
||||
}
|
||||
}
|
||||
if(i >= 8 && i< 12){
|
||||
if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c)){
|
||||
if(!UTF16_IS_TRAIL(c) || UTF16_IS_SINGLE(c) || UTF16_IS_LEAD(c) || !U16_IS_TRAIL(c) || U16_IS_SINGLE(c) || U16_IS_LEAD(c)){
|
||||
log_err("ERROR: %x is a second surrogate\n", c);
|
||||
}
|
||||
}
|
||||
|
@ -93,7 +96,7 @@ static void TestCharLength()
|
|||
UBool multiple;
|
||||
for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
|
||||
UChar32 c=codepoint[i+1];
|
||||
if(UTF16_CHAR_LENGTH(c) != (uint16_t)codepoint[i]){
|
||||
if(UTF16_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U16_LENGTH(c) != (uint16_t)codepoint[i]){
|
||||
log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF16_CHAR_LENGTH(c));
|
||||
}else{
|
||||
log_verbose("The no: of code units for %lx is %d\n",c, UTF16_CHAR_LENGTH(c) );
|
||||
|
@ -150,11 +153,23 @@ static void TestGetChar()
|
|||
if(c != result[i]){
|
||||
log_err("ERROR: UTF16_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
}
|
||||
|
||||
U16_GET_UNSAFE(input, offset, c);
|
||||
if(c != result[i]){
|
||||
log_err("ERROR: U16_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
}
|
||||
}
|
||||
|
||||
UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE);
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
U16_GET(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c);
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE);
|
||||
if(c != result[i+2]){
|
||||
log_err("ERROR: UTF16_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
|
||||
|
@ -213,6 +228,16 @@ static void TestNextPrevChar(){
|
|||
log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_NEXT_UNSAFE(input, setOffset, c);
|
||||
if(setOffset != movedOffset[i]){
|
||||
log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i], setOffset);
|
||||
}
|
||||
if(c != result[i]){
|
||||
log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
|
@ -223,6 +248,16 @@ static void TestNextPrevChar(){
|
|||
log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_NEXT(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
|
@ -247,6 +282,16 @@ static void TestNextPrevChar(){
|
|||
log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_PREV_UNSAFE(input, setOffset, c);
|
||||
if(setOffset != movedOffset[i+3]){
|
||||
log_err("ERROR: U16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+3], setOffset);
|
||||
}
|
||||
if(c != result[i+3]){
|
||||
log_err("ERROR: U16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
|
@ -257,6 +302,16 @@ static void TestNextPrevChar(){
|
|||
log_err("ERROR: UTF16_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_PREV(input, 0, setOffset, c);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(c != result[i+4]){
|
||||
log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
|
||||
if(setOffset != movedOffset[i+5]){
|
||||
|
@ -285,7 +340,6 @@ static void TestFwdBack(){
|
|||
static uint16_t back_N_unsafe[]={12, 11, 8, 5, 3};
|
||||
static uint16_t back_N_safe[] ={12, 11, 8, 5, 3, 0, 0};
|
||||
|
||||
|
||||
uint16_t offunsafe=0, offsafe=0;
|
||||
uint16_t i=0;
|
||||
while(offunsafe < sizeof(input)/U_SIZEOF_UCHAR){
|
||||
|
@ -295,6 +349,17 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
offunsafe=0, offsafe=0;
|
||||
i=0;
|
||||
while(offunsafe < sizeof(input)/U_SIZEOF_UCHAR){
|
||||
U16_FWD_1_UNSAFE(input, offunsafe);
|
||||
if(offunsafe != fwd_unsafe[i]){
|
||||
log_err("ERROR: U16_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(offsafe < sizeof(input)/U_SIZEOF_UCHAR){
|
||||
UTF16_FWD_1_SAFE(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR);
|
||||
|
@ -303,6 +368,16 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(offsafe < sizeof(input)/U_SIZEOF_UCHAR){
|
||||
U16_FWD_1(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR);
|
||||
if(offsafe != fwd_safe[i]){
|
||||
log_err("ERROR: U16_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
i=0;
|
||||
|
@ -313,6 +388,18 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
i=0;
|
||||
while(offunsafe > 0){
|
||||
U16_BACK_1_UNSAFE(input, offunsafe);
|
||||
if(offunsafe != back_unsafe[i]){
|
||||
log_err("ERROR: U16_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(offsafe > 0){
|
||||
UTF16_BACK_1_SAFE(input,0, offsafe);
|
||||
|
@ -321,6 +408,16 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(offsafe > 0){
|
||||
U16_BACK_1(input,0, offsafe);
|
||||
if(offsafe != back_safe[i]){
|
||||
log_err("ERROR: U16_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
offunsafe=0;
|
||||
offsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ /*didn't want it to fail(we assume 0<i<length)*/
|
||||
|
@ -329,6 +426,16 @@ static void TestFwdBack(){
|
|||
log_err("ERROR: Forward_N_unsafe offset expected:%d, Got:%d\n", fwd_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offunsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){ /*didn't want it to fail(we assume 0<i<length)*/
|
||||
U16_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
|
||||
if(offunsafe != fwd_N_unsafe[i]){
|
||||
log_err("ERROR: U16_FWD_N_UNSAFE offset expected:%d, Got:%d\n", fwd_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
UTF16_FWD_N_SAFE(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR, Nvalue[i]);
|
||||
if(offsafe != fwd_N_safe[i]){
|
||||
|
@ -336,20 +443,47 @@ static void TestFwdBack(){
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
offsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
U16_FWD_N(input, offsafe, sizeof(input)/U_SIZEOF_UCHAR, Nvalue[i]);
|
||||
if(offsafe != fwd_N_safe[i]){
|
||||
log_err("ERROR: U16_FWD_N offset expected:%d, Got:%d\n", fwd_N_safe[i], offsafe);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
|
||||
UTF16_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
|
||||
if(offunsafe != back_N_unsafe[i]){
|
||||
log_err("ERROR: backward_N_unsafe offset expected:%d, Got:%d\n", back_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
|
||||
U16_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
|
||||
if(offunsafe != back_N_unsafe[i]){
|
||||
log_err("ERROR: U16_BACK_N_UNSAFE offset expected:%d, Got:%d\n", back_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
UTF16_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
|
||||
if(offsafe != back_N_safe[i]){
|
||||
log_err("ERROR: backward_N_safe offset expected:%d, Got:%d\n", back_N_safe[i], offsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offsafe=sizeof(input)/U_SIZEOF_UCHAR;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
U16_BACK_N(input, 0, offsafe, Nvalue[i]);
|
||||
if(offsafe != back_N_safe[i]){
|
||||
log_err("ERROR: U16_BACK_N offset expected:%d, Got:%d\n", back_N_safe[i], offsafe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void TestSetChar(){
|
||||
|
@ -367,23 +501,45 @@ static void TestSetChar(){
|
|||
if(setOffset != start_unsafe[i]){
|
||||
log_err("ERROR: UTF16_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_unsafe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_SET_CP_START_UNSAFE(input, setOffset);
|
||||
if(setOffset != start_unsafe[i]){
|
||||
log_err("ERROR: U16_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_unsafe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_SET_CHAR_START_SAFE(input, 0, setOffset);
|
||||
if(setOffset != start_safe[i]){
|
||||
log_err("ERROR: UTF16_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_safe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_SET_CP_START(input, 0, setOffset);
|
||||
if(setOffset != start_safe[i]){
|
||||
log_err("ERROR: U16_SET_CHAR_START failed for offset=%ld. Expected:%lx Got:%lx\n", offset, start_safe[i], setOffset);
|
||||
}
|
||||
|
||||
if (offset > 0) {
|
||||
setOffset=offset;
|
||||
UTF16_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
|
||||
if(setOffset != limit_unsafe[i]){
|
||||
log_err("ERROR: UTF16_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U16_SET_CP_LIMIT_UNSAFE(input, setOffset);
|
||||
if(setOffset != limit_unsafe[i]){
|
||||
log_err("ERROR: U16_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_unsafe[i], setOffset);
|
||||
}
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF16_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR);
|
||||
U16_SET_CP_LIMIT(input,0, setOffset, sizeof(input)/U_SIZEOF_UCHAR);
|
||||
if(setOffset != limit_safe[i]){
|
||||
log_err("ERROR: UTF16_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset);
|
||||
log_err("ERROR: U16_SET_CHAR_LIMIT failed for offset=%ld. Expected:%lx Got:%lx\n", offset, limit_safe[i], setOffset);
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
@ -487,6 +643,67 @@ static void TestAppendChar(){
|
|||
|
||||
}
|
||||
|
||||
static void TestAppend() {
|
||||
static const UChar32 codePoints[]={
|
||||
0x61, 0xdf, 0x901, 0x3040,
|
||||
0xac00, 0xd800, 0xdbff, 0xdcde,
|
||||
0xdffd, 0xe000, 0xffff, 0x10000,
|
||||
0x12345, 0xe0021, 0x10ffff, 0x110000,
|
||||
0x234567, 0x7fffffff, -1, -1000,
|
||||
0, 0x400
|
||||
};
|
||||
static const UChar expectUnsafe[]={
|
||||
0x61, 0xdf, 0x901, 0x3040,
|
||||
0xac00, 0xd800, 0xdbff, 0xdcde,
|
||||
0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00,
|
||||
0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */
|
||||
/* none from this line */
|
||||
0, 0x400
|
||||
}, expectSafe[]={
|
||||
0x61, 0xdf, 0x901, 0x3040,
|
||||
0xac00, 0xd800, 0xdbff, 0xdcde,
|
||||
0xdffd, 0xe000, 0xffff, 0xd800, 0xdc00,
|
||||
0xd848, 0xdf45, 0xdb40, 0xdc21, 0xdbff, 0xdfff, /* not 0x110000 */
|
||||
/* none from this line */
|
||||
0, 0x400
|
||||
};
|
||||
|
||||
UChar buffer[100];
|
||||
UChar32 c;
|
||||
int32_t i, length;
|
||||
UBool isError, expectIsError, wrongIsError;
|
||||
|
||||
length=0;
|
||||
for(i=0; i<LENGTHOF(codePoints); ++i) {
|
||||
c=codePoints[i];
|
||||
if(c<0 || 0x10ffff<c) {
|
||||
continue; /* skip non-code points for U16_APPEND_UNSAFE */
|
||||
}
|
||||
|
||||
U16_APPEND_UNSAFE(buffer, length, c);
|
||||
}
|
||||
if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length*U_SIZEOF_UCHAR)) {
|
||||
log_err("U16_APPEND_UNSAFE did not generate the expected output\n");
|
||||
}
|
||||
|
||||
length=0;
|
||||
wrongIsError=FALSE;
|
||||
for(i=0; i<LENGTHOF(codePoints); ++i) {
|
||||
c=codePoints[i];
|
||||
expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
|
||||
isError=FALSE;
|
||||
|
||||
U16_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
|
||||
wrongIsError|= isError!=expectIsError;
|
||||
}
|
||||
if(wrongIsError) {
|
||||
log_err("U16_APPEND did not set isError correctly\n");
|
||||
}
|
||||
if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length*U_SIZEOF_UCHAR)) {
|
||||
log_err("U16_APPEND did not generate the expected output\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void TestSurrogate(){
|
||||
static UChar32 s[] = {0x10000, 0x10ffff, 0x50000, 0x100000, 0x1abcd};
|
||||
int i = 0;
|
||||
|
@ -497,11 +714,11 @@ static void TestSurrogate(){
|
|||
UChar firstresult = (UChar)(((s[i] - 0x10000) / 0x400) + 0xD800);
|
||||
UChar secondresult = (UChar)(((s[i] - 0x10000) % 0x400) + 0xDC00);
|
||||
|
||||
if (first != UTF16_LEAD(s[i]) || first != firstresult) {
|
||||
if (first != UTF16_LEAD(s[i]) || first != U16_LEAD(s[i]) || first != firstresult) {
|
||||
log_err("Failure in first surrogate in 0x%x expected to be 0x%x\n",
|
||||
s[i], firstresult);
|
||||
}
|
||||
if (second != UTF16_TRAIL(s[i]) || second != secondresult) {
|
||||
if (second != UTF16_TRAIL(s[i]) || second != U16_TRAIL(s[i]) || second != secondresult) {
|
||||
log_err("Failure in second surrogate in 0x%x expected to be 0x%x\n",
|
||||
s[i], secondresult);
|
||||
}
|
||||
|
@ -515,6 +732,3 @@ static void printUChars(const UChar *uchars){
|
|||
printf("%x ", *(uchars+i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "cmemory.h"
|
||||
#include "cintltst.h"
|
||||
|
||||
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
static void printUChars(const uint8_t *uchars, int16_t len);
|
||||
|
||||
|
@ -28,6 +29,7 @@ static void TestNextPrevChar(void);
|
|||
static void TestFwdBack(void);
|
||||
static void TestSetChar(void);
|
||||
static void TestAppendChar(void);
|
||||
static void TestAppend(void);
|
||||
|
||||
void addUTF8Test(TestNode** root);
|
||||
|
||||
|
@ -41,6 +43,7 @@ addUTF8Test(TestNode** root)
|
|||
addTest(root, &TestFwdBack, "utf8tst/TestFwdBack" );
|
||||
addTest(root, &TestSetChar, "utf8tst/TestSetChar" );
|
||||
addTest(root, &TestAppendChar, "utf8tst/TestAppendChar" );
|
||||
addTest(root, &TestAppend, "utf8tst/TestAppend" );
|
||||
}
|
||||
|
||||
static void TestCodeUnitValues()
|
||||
|
@ -52,17 +55,17 @@ static void TestCodeUnitValues()
|
|||
uint8_t c=codeunit[i];
|
||||
log_verbose("Testing code unit value of %x\n", c);
|
||||
if(i<4){
|
||||
if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c)){
|
||||
if(!UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || UTF8_IS_TRAIL(c) || !U8_IS_SINGLE(c) || U8_IS_LEAD(c) || U8_IS_TRAIL(c)){
|
||||
log_err("ERROR: 0x%02x is a single byte but results in single: %c lead: %c trail: %c\n",
|
||||
c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
|
||||
}
|
||||
} else if(i< 8){
|
||||
if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c)){
|
||||
if(!UTF8_IS_LEAD(c) || UTF8_IS_SINGLE(c) || UTF8_IS_TRAIL(c) || !U8_IS_LEAD(c) || U8_IS_SINGLE(c) || U8_IS_TRAIL(c)){
|
||||
log_err("ERROR: 0x%02x is a lead byte but results in single: %c lead: %c trail: %c\n",
|
||||
c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
|
||||
}
|
||||
} else if(i< 12){
|
||||
if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c)){
|
||||
if(!UTF8_IS_TRAIL(c) || UTF8_IS_SINGLE(c) || UTF8_IS_LEAD(c) || !U8_IS_TRAIL(c) || U8_IS_SINGLE(c) || U8_IS_LEAD(c)){
|
||||
log_err("ERROR: 0x%02x is a trail byte but results in single: %c lead: %c trail: %c\n",
|
||||
c, UTF8_IS_SINGLE(c) ? 'y' : 'n', UTF8_IS_LEAD(c) ? 'y' : 'n', UTF8_IS_TRAIL(c) ? 'y' : 'n');
|
||||
}
|
||||
|
@ -93,7 +96,7 @@ static void TestCharLength()
|
|||
UBool multiple;
|
||||
for(i=0; i<sizeof(codepoint)/sizeof(codepoint[0]); i=(int16_t)(i+2)){
|
||||
UChar32 c=codepoint[i+1];
|
||||
if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i]){
|
||||
if(UTF8_CHAR_LENGTH(c) != (uint16_t)codepoint[i] || U8_LENGTH(c) != (uint16_t)codepoint[i]){
|
||||
log_err("The no: of code units for %lx:- Expected: %d Got: %d\n", c, codepoint[i], UTF8_CHAR_LENGTH(c));
|
||||
}else{
|
||||
log_verbose("The no: of code units for %lx is %d\n",c, UTF8_CHAR_LENGTH(c) );
|
||||
|
@ -152,11 +155,24 @@ static void TestGetChar()
|
|||
log_err("ERROR: UTF8_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
|
||||
}
|
||||
|
||||
U8_GET_UNSAFE(input, offset, c);
|
||||
if(c != result[i]){
|
||||
log_err("ERROR: U8_GET_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
U8_GET(input, 0, offset, sizeof(input), c);
|
||||
if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
|
||||
log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, FALSE);
|
||||
if(c != result[i+1]){
|
||||
log_err("ERROR: UTF8_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
UTF8_GET_CHAR_SAFE(input, 0, offset, sizeof(input), c, TRUE);
|
||||
if(c != result[i+2]){
|
||||
log_err("ERROR: UTF8_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
|
||||
|
@ -225,7 +241,18 @@ static void TestNextPrevChar(){
|
|||
if(c != result[i]){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_NEXT_UNSAFE(input, setOffset, c);
|
||||
if(setOffset != movedOffset[i]){
|
||||
log_err("ERROR: U8_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i], setOffset);
|
||||
}
|
||||
if(c != result[i]){
|
||||
log_err("ERROR: U8_NEXT_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
|
||||
}
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, FALSE);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
|
@ -235,6 +262,17 @@ static void TestNextPrevChar(){
|
|||
if(c != result[i+1]){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_NEXT(input, setOffset, sizeof(input), c);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
log_err("ERROR: U8_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+1], setOffset);
|
||||
}
|
||||
if(UTF_IS_ERROR(result[i+1]) ? c >= 0 : c != result[i+1]){
|
||||
log_err("ERROR: U8_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_NEXT_CHAR_SAFE(input, setOffset, sizeof(input), c, TRUE);
|
||||
if(setOffset != movedOffset[i+1]){
|
||||
|
@ -244,8 +282,10 @@ static void TestNextPrevChar(){
|
|||
if(c != result[i+2]){
|
||||
log_err("ERROR: UTF8_NEXT_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
|
||||
}
|
||||
|
||||
i=i+6;
|
||||
}
|
||||
|
||||
i=0;
|
||||
for(offset=sizeof(input); offset > 0; --offset){
|
||||
setOffset=offset;
|
||||
|
@ -257,6 +297,7 @@ static void TestNextPrevChar(){
|
|||
if(c != result[i+3]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
|
@ -266,6 +307,17 @@ static void TestNextPrevChar(){
|
|||
if(c != result[i+4]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_PREV(input, 0, setOffset, c);
|
||||
if(setOffset != movedOffset[i+4]){
|
||||
log_err("ERROR: U8_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
|
||||
offset, movedOffset[i+4], setOffset);
|
||||
}
|
||||
if(UTF_IS_ERROR(result[i+4]) ? c >= 0 : c != result[i+4]){
|
||||
log_err("ERROR: U8_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE);
|
||||
if(setOffset != movedOffset[i+5]){
|
||||
|
@ -275,6 +327,7 @@ static void TestNextPrevChar(){
|
|||
if(c != result[i+5]){
|
||||
log_err("ERROR: UTF8_PREV_CHAR_SAFE(strict) failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+5], c);
|
||||
}
|
||||
|
||||
i=i+6;
|
||||
}
|
||||
|
||||
|
@ -295,6 +348,7 @@ static void TestFwdBack(){
|
|||
|
||||
|
||||
uint32_t offunsafe=0, offsafe=0;
|
||||
|
||||
uint32_t i=0;
|
||||
while(offunsafe < sizeof(input)){
|
||||
UTF8_FWD_1_UNSAFE(input, offunsafe);
|
||||
|
@ -303,6 +357,16 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(offunsafe < sizeof(input)){
|
||||
U8_FWD_1_UNSAFE(input, offunsafe);
|
||||
if(offunsafe != fwd_unsafe[i]){
|
||||
log_err("ERROR: U8_FWD_1_UNSAFE offset expected:%d, Got:%d\n", fwd_unsafe[i], offunsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(offsafe < sizeof(input)){
|
||||
UTF8_FWD_1_SAFE(input, offsafe, sizeof(input));
|
||||
|
@ -311,6 +375,16 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
while(offsafe < sizeof(input)){
|
||||
U8_FWD_1(input, offsafe, sizeof(input));
|
||||
if(offsafe != fwd_safe[i]){
|
||||
log_err("ERROR: U8_FWD_1 offset expected:%d, Got:%d\n", fwd_safe[i], offsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input);
|
||||
i=0;
|
||||
while(offunsafe > 0){
|
||||
|
@ -320,6 +394,17 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input);
|
||||
i=0;
|
||||
while(offunsafe > 0){
|
||||
U8_BACK_1_UNSAFE(input, offunsafe);
|
||||
if(offunsafe != back_unsafe[i]){
|
||||
log_err("ERROR: U8_BACK_1_UNSAFE offset expected:%d, Got:%d\n", back_unsafe[i], offunsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
offsafe=sizeof(input);
|
||||
while(offsafe > 0){
|
||||
|
@ -329,14 +414,34 @@ static void TestFwdBack(){
|
|||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
i=0;
|
||||
offsafe=sizeof(input);
|
||||
while(offsafe > 0){
|
||||
U8_BACK_1(input, 0, offsafe);
|
||||
if(offsafe != back_safe[i]){
|
||||
log_err("ERROR: U8_BACK_1 offset expected:%d, Got:%d\n", back_unsafe[i], offsafe);
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
offunsafe=0;
|
||||
offsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
|
||||
UTF8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
|
||||
if(offunsafe != fwd_N_unsafe[i]){
|
||||
log_err("ERROR: Forward_N_unsafe offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offunsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
|
||||
U8_FWD_N_UNSAFE(input, offunsafe, Nvalue[i]);
|
||||
if(offunsafe != fwd_N_unsafe[i]){
|
||||
log_err("ERROR: U8_FWD_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, fwd_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
UTF8_FWD_N_SAFE(input, offsafe, sizeof(input), Nvalue[i]);
|
||||
if(offsafe != fwd_N_safe[i]){
|
||||
|
@ -344,20 +449,47 @@ static void TestFwdBack(){
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
offsafe=0;
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
U8_FWD_N(input, offsafe, sizeof(input), Nvalue[i]);
|
||||
if(offsafe != fwd_N_safe[i]){
|
||||
log_err("ERROR: U8_FWD_N offset=%d expected:%d, Got:%d\n", i, fwd_N_safe[i], offsafe);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input);
|
||||
offsafe=sizeof(input);
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
|
||||
UTF8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
|
||||
if(offunsafe != back_N_unsafe[i]){
|
||||
log_err("ERROR: backward_N_unsafe offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offunsafe=sizeof(input);
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0])-2; i++){
|
||||
U8_BACK_N_UNSAFE(input, offunsafe, Nvalue[i]);
|
||||
if(offunsafe != back_N_unsafe[i]){
|
||||
log_err("ERROR: U8_BACK_N_UNSAFE offset=%d expected:%d, Got:%d\n", i, back_N_unsafe[i], offunsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offsafe=sizeof(input);
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
UTF8_BACK_N_SAFE(input, 0, offsafe, Nvalue[i]);
|
||||
if(offsafe != back_N_safe[i]){
|
||||
log_err("ERROR: backward_N_safe offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
|
||||
}
|
||||
}
|
||||
|
||||
offsafe=sizeof(input);
|
||||
for(i=0; i<sizeof(Nvalue)/sizeof(Nvalue[0]); i++){
|
||||
U8_BACK_N(input, 0, offsafe, Nvalue[i]);
|
||||
if(offsafe != back_N_safe[i]){
|
||||
log_err("ERROR: U8_BACK_N offset=%d expected:%d, Got:%ld\n", i, back_N_safe[i], offsafe);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void TestSetChar(){
|
||||
|
@ -380,23 +512,51 @@ static void TestSetChar(){
|
|||
if(setOffset != start_unsafe[i]){
|
||||
log_err("ERROR: UTF8_SET_CHAR_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_SET_CP_START_UNSAFE(input, setOffset);
|
||||
if(setOffset != start_unsafe[i]){
|
||||
log_err("ERROR: U8_SET_CP_START_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_unsafe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_SET_CHAR_START_SAFE(input, 0, setOffset);
|
||||
if(setOffset != start_safe[i]){
|
||||
log_err("ERROR: UTF8_SET_CHAR_START_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_SET_CP_START(input, 0, setOffset);
|
||||
if(setOffset != start_safe[i]){
|
||||
log_err("ERROR: U8_SET_CP_START failed for offset=%ld. Expected:%ld Got:%ld\n", offset, start_safe[i], setOffset);
|
||||
}
|
||||
|
||||
if (offset != 0) { /* Can't have it go off the end of the array */
|
||||
setOffset=offset;
|
||||
UTF8_SET_CHAR_LIMIT_UNSAFE(input, setOffset);
|
||||
if(setOffset != limit_unsafe[i]){
|
||||
log_err("ERROR: UTF8_SET_CHAR_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_SET_CP_LIMIT_UNSAFE(input, setOffset);
|
||||
if(setOffset != limit_unsafe[i]){
|
||||
log_err("ERROR: U8_SET_CP_LIMIT_UNSAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_unsafe[i], setOffset);
|
||||
}
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
UTF8_SET_CHAR_LIMIT_SAFE(input,0, setOffset, sizeof(input));
|
||||
if(setOffset != limit_safe[i]){
|
||||
log_err("ERROR: UTF8_SET_CHAR_LIMIT_SAFE failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
|
||||
}
|
||||
|
||||
setOffset=offset;
|
||||
U8_SET_CP_LIMIT(input,0, setOffset, sizeof(input));
|
||||
if(setOffset != limit_safe[i]){
|
||||
log_err("ERROR: U8_SET_CP_LIMIT failed for offset=%ld. Expected:%ld Got:%ld\n", offset, limit_safe[i], setOffset);
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
@ -451,7 +611,7 @@ static void TestAppendChar(){
|
|||
8,
|
||||
9,
|
||||
|
||||
/*offse-moved-to(safe)*/
|
||||
/*offset-moved-to(safe)*/
|
||||
4, /*for append-pos: 0, CHAR 0x10401*/
|
||||
3,
|
||||
4,
|
||||
|
@ -570,6 +730,67 @@ static void TestAppendChar(){
|
|||
|
||||
}
|
||||
|
||||
static void TestAppend() {
|
||||
static const UChar32 codePoints[]={
|
||||
0x61, 0xdf, 0x901, 0x3040,
|
||||
0xac00, 0xd800, 0xdbff, 0xdcde,
|
||||
0xdffd, 0xe000, 0xffff, 0x10000,
|
||||
0x12345, 0xe0021, 0x10ffff, 0x110000,
|
||||
0x234567, 0x7fffffff, -1, -1000,
|
||||
0, 0x400
|
||||
};
|
||||
static const uint8_t expectUnsafe[]={
|
||||
0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
|
||||
0xea, 0xb0, 0x80, 0xed, 0xa0, 0x80, 0xed, 0xaf, 0xbf, 0xed, 0xb3, 0x9e,
|
||||
0xed, 0xbf, 0xbd, 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
|
||||
0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
|
||||
/* none from this line */
|
||||
0, 0xd0, 0x80
|
||||
}, expectSafe[]={
|
||||
0x61, 0xc3, 0x9f, 0xe0, 0xa4, 0x81, 0xe3, 0x81, 0x80,
|
||||
0xea, 0xb0, 0x80, /* no surrogates */
|
||||
/* no surrogates */ 0xee, 0x80, 0x80, 0xef, 0xbf, 0xbf, 0xf0, 0x90, 0x80, 0x80,
|
||||
0xf0, 0x92, 0x8d, 0x85, 0xf3, 0xa0, 0x80, 0xa1, 0xf4, 0x8f, 0xbf, 0xbf, /* not 0x110000 */
|
||||
/* none from this line */
|
||||
0, 0xd0, 0x80
|
||||
};
|
||||
|
||||
uint8_t buffer[100];
|
||||
UChar32 c;
|
||||
int32_t i, length;
|
||||
UBool isError, expectIsError, wrongIsError;
|
||||
|
||||
length=0;
|
||||
for(i=0; i<LENGTHOF(codePoints); ++i) {
|
||||
c=codePoints[i];
|
||||
if(c<0 || 0x10ffff<c) {
|
||||
continue; /* skip non-code points for U8_APPEND_UNSAFE */
|
||||
}
|
||||
|
||||
U8_APPEND_UNSAFE(buffer, length, c);
|
||||
}
|
||||
if(length!=LENGTHOF(expectUnsafe) || 0!=memcmp(buffer, expectUnsafe, length)) {
|
||||
log_err("U8_APPEND_UNSAFE did not generate the expected output\n");
|
||||
}
|
||||
|
||||
length=0;
|
||||
wrongIsError=FALSE;
|
||||
for(i=0; i<LENGTHOF(codePoints); ++i) {
|
||||
c=codePoints[i];
|
||||
expectIsError= c<0 || 0x10ffff<c || U_IS_SURROGATE(c);
|
||||
isError=FALSE;
|
||||
|
||||
U8_APPEND(buffer, length, LENGTHOF(buffer), c, isError);
|
||||
wrongIsError|= isError!=expectIsError;
|
||||
}
|
||||
if(wrongIsError) {
|
||||
log_err("U8_APPEND did not set isError correctly\n");
|
||||
}
|
||||
if(length!=LENGTHOF(expectSafe) || 0!=memcmp(buffer, expectSafe, length)) {
|
||||
log_err("U8_APPEND did not generate the expected output\n");
|
||||
}
|
||||
}
|
||||
|
||||
static void printUChars(const uint8_t *uchars, int16_t len){
|
||||
int16_t i=0;
|
||||
for(i=0; i<len; i++){
|
||||
|
|
Loading…
Add table
Reference in a new issue