mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 06:25:30 +00:00
ICU-6677 add u_strToUTF32WithSub() and u_strFromUTF32WithSub()
X-SVN-Rev: 25444
This commit is contained in:
parent
a679ccf60c
commit
c4e1d3e0be
3 changed files with 309 additions and 71 deletions
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1998-2008, International Business Machines
|
||||
* Copyright (C) 1998-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
|
@ -1183,7 +1183,10 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
|
|||
|
||||
#if defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION
|
||||
/**
|
||||
* Converts a sequence of UChars to wchar_t units.
|
||||
* Convert a UTF-16 string to a wchar_t string.
|
||||
* If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
|
||||
* this function simply calls the fast, dedicated function for that.
|
||||
* Otherwise, two conversions UTF-16 -> default charset -> wchar_t* are performed.
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
|
@ -1209,7 +1212,10 @@ u_strToWCS(wchar_t *dest,
|
|||
int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
/**
|
||||
* Converts a sequence of wchar_t units to UChars
|
||||
* Convert a wchar_t string to UTF-16.
|
||||
* If it is known at compile time that wchar_t strings are in UTF-16 or UTF-32, then
|
||||
* this function simply calls the fast, dedicated function for that.
|
||||
* Otherwise, two conversions wchar_t* -> default charset -> UTF-16 are performed.
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
|
@ -1237,7 +1243,8 @@ u_strFromWCS(UChar *dest,
|
|||
#endif /* defined(U_WCHAR_IS_UTF16) || defined(U_WCHAR_IS_UTF32) || !UCONFIG_NO_CONVERSION */
|
||||
|
||||
/**
|
||||
* Converts a sequence of UChars (UTF-16) to UTF-8 bytes
|
||||
* Convert a UTF-16 string to UTF-8.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
|
@ -1266,7 +1273,8 @@ u_strToUTF8(char *dest,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Converts a sequence of UTF-8 bytes to UChars (UTF-16).
|
||||
* Convert a UTF-8 string to UTF-16.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
|
@ -1295,7 +1303,9 @@ u_strFromUTF8(UChar *dest,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Converts a sequence of UChars (UTF-16) to UTF-8 bytes.
|
||||
* Convert a UTF-16 string to UTF-8.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* Same as u_strToUTF8() except for the additional subchar which is output for
|
||||
* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
|
||||
* With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF8().
|
||||
|
@ -1338,7 +1348,9 @@ u_strToUTF8WithSub(char *dest,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Converts a sequence of UTF-8 bytes to UChars (UTF-16).
|
||||
* Convert a UTF-8 string to UTF-16.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* Same as u_strFromUTF8() except for the additional subchar which is output for
|
||||
* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
|
||||
* With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF8().
|
||||
|
@ -1382,7 +1394,8 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Converts a sequence of UTF-8 bytes to UChars (UTF-16).
|
||||
* Convert a UTF-8 string to UTF-16.
|
||||
*
|
||||
* Same as u_strFromUTF8() except that this function is designed to be very fast,
|
||||
* which it achieves by being lenient about malformed UTF-8 sequences.
|
||||
* This function is intended for use in environments where UTF-8 text is
|
||||
|
@ -1401,6 +1414,9 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
* For further performance improvement, if srcLength is given (>=0),
|
||||
* then destCapacity must be >=srcLength.
|
||||
*
|
||||
* There is no inverse u_strToUTF8Lenient() function because there is practically
|
||||
* no performance gain from not checking that a UTF-16 string is well-formed.
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
|
||||
|
@ -1437,7 +1453,8 @@ u_strFromUTF8Lenient(UChar *dest,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Converts a sequence of UChars (UTF-16) to UTF32 units.
|
||||
* Convert a UTF-16 string to UTF-32.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
|
@ -1453,6 +1470,8 @@ u_strFromUTF8Lenient(UChar *dest,
|
|||
* @param pErrorCode Must be a valid pointer to an error code value,
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The pointer to destination buffer.
|
||||
* @see u_strToUTF32WithSub
|
||||
* @see u_strFromUTF32
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_STABLE UChar32* U_EXPORT2
|
||||
|
@ -1464,7 +1483,8 @@ u_strToUTF32(UChar32 *dest,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Converts a sequence of UTF32 units to UChars (UTF-16)
|
||||
* Convert a UTF-32 string to UTF-16.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
|
@ -1480,6 +1500,8 @@ u_strToUTF32(UChar32 *dest,
|
|||
* @param pErrorCode Must be a valid pointer to an error code value,
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The pointer to destination buffer.
|
||||
* @see u_strFromUTF32WithSub
|
||||
* @see u_strToUTF32
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
U_STABLE UChar* U_EXPORT2
|
||||
|
@ -1490,4 +1512,94 @@ u_strFromUTF32(UChar *dest,
|
|||
int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Convert a UTF-16 string to UTF-32.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* Same as u_strToUTF32() except for the additional subchar which is output for
|
||||
* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
|
||||
* With subchar==U_SENTINEL, this function behaves exactly like u_strToUTF32().
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
* @param destCapacity The size of the buffer (number of UChar32s). If it is 0, then
|
||||
* dest may be NULL and the function will only return the length of the
|
||||
* result without writing any of the result string (pre-flighting).
|
||||
* @param pDestLength A pointer to receive the number of units written to the destination. If
|
||||
* pDestLength!=NULL then *pDestLength is always set to the
|
||||
* number of output units corresponding to the transformation of
|
||||
* all the input units, even in case of a buffer overflow.
|
||||
* @param src The original source string
|
||||
* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
|
||||
* @param subchar The substitution character to use in place of an illegal input sequence,
|
||||
* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
|
||||
* A substitution character can be any valid Unicode code point (up to U+10FFFF)
|
||||
* except for surrogate code points (U+D800..U+DFFF).
|
||||
* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
|
||||
* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
|
||||
* Set to 0 if no substitutions occur or subchar<0.
|
||||
* pNumSubstitutions can be NULL.
|
||||
* @param pErrorCode Pointer to a standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return The pointer to destination buffer.
|
||||
* @see u_strToUTF32
|
||||
* @see u_strFromUTF32WithSub
|
||||
* @draft ICU 4.2
|
||||
*/
|
||||
U_DRAFT UChar32* U_EXPORT2
|
||||
u_strToUTF32WithSub(UChar32 *dest,
|
||||
int32_t destCapacity,
|
||||
int32_t *pDestLength,
|
||||
const UChar *src,
|
||||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Convert a UTF-32 string to UTF-16.
|
||||
* If the input string is not well-formed, then the U_INVALID_CHAR_FOUND error code is set.
|
||||
*
|
||||
* Same as u_strFromUTF32() except for the additional subchar which is output for
|
||||
* illegal input sequences, instead of stopping with the U_INVALID_CHAR_FOUND error code.
|
||||
* With subchar==U_SENTINEL, this function behaves exactly like u_strFromUTF32().
|
||||
*
|
||||
* @param dest A buffer for the result string. The result will be zero-terminated if
|
||||
* the buffer is large enough.
|
||||
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
|
||||
* dest may be NULL and the function will only return the length of the
|
||||
* result without writing any of the result string (pre-flighting).
|
||||
* @param pDestLength A pointer to receive the number of units written to the destination. If
|
||||
* pDestLength!=NULL then *pDestLength is always set to the
|
||||
* number of output units corresponding to the transformation of
|
||||
* all the input units, even in case of a buffer overflow.
|
||||
* @param src The original source string
|
||||
* @param srcLength The length of the original string. If -1, then src must be zero-terminated.
|
||||
* @param subchar The substitution character to use in place of an illegal input sequence,
|
||||
* or U_SENTINEL if the function is to return with U_INVALID_CHAR_FOUND instead.
|
||||
* A substitution character can be any valid Unicode code point (up to U+10FFFF)
|
||||
* except for surrogate code points (U+D800..U+DFFF).
|
||||
* The recommended value is U+FFFD "REPLACEMENT CHARACTER".
|
||||
* @param pNumSubstitutions Output parameter receiving the number of substitutions if subchar>=0.
|
||||
* Set to 0 if no substitutions occur or subchar<0.
|
||||
* pNumSubstitutions can be NULL.
|
||||
* @param pErrorCode Pointer to a standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return The pointer to destination buffer.
|
||||
* @see u_strFromUTF32
|
||||
* @see u_strToUTF32WithSub
|
||||
* @draft ICU 4.2
|
||||
*/
|
||||
U_DRAFT UChar* U_EXPORT2
|
||||
u_strFromUTF32WithSub(UChar *dest,
|
||||
int32_t destCapacity,
|
||||
int32_t *pDestLength,
|
||||
const UChar32 *src,
|
||||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -31,38 +31,50 @@
|
|||
#include "ustr_imp.h"
|
||||
|
||||
U_CAPI UChar* U_EXPORT2
|
||||
u_strFromUTF32(UChar *dest,
|
||||
int32_t destCapacity,
|
||||
u_strFromUTF32WithSub(UChar *dest,
|
||||
int32_t destCapacity,
|
||||
int32_t *pDestLength,
|
||||
const UChar32 *src,
|
||||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar32 *srcLimit;
|
||||
UChar32 ch;
|
||||
UChar *destLimit;
|
||||
UChar *pDest;
|
||||
int32_t reqLength;
|
||||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
return NULL;
|
||||
}
|
||||
if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
|
||||
if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
|
||||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
|
||||
) {
|
||||
*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(pNumSubstitutions != NULL) {
|
||||
*pNumSubstitutions = 0;
|
||||
}
|
||||
|
||||
pDest = dest;
|
||||
destLimit = dest + destCapacity;
|
||||
reqLength = 0;
|
||||
numSubstitutions = 0;
|
||||
|
||||
if(srcLength < 0) {
|
||||
/* simple loop for conversion of a NUL-terminated BMP string */
|
||||
while((ch=*src) != 0 &&
|
||||
((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) &&
|
||||
pDest < destLimit) {
|
||||
((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
|
||||
++src;
|
||||
*pDest++ = (UChar)ch;
|
||||
if(pDest < destLimit) {
|
||||
*pDest++ = (UChar)ch;
|
||||
} else {
|
||||
++reqLength;
|
||||
}
|
||||
}
|
||||
srcLimit = src;
|
||||
if(ch != 0) {
|
||||
|
@ -74,43 +86,42 @@ u_strFromUTF32(UChar *dest,
|
|||
}
|
||||
|
||||
/* convert with length */
|
||||
while(src < srcLimit && pDest < destLimit) {
|
||||
ch = *src++;
|
||||
if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
|
||||
*pDest++ = (UChar)ch;
|
||||
} else if(0x10000 <= ch && ch <= 0x10ffff) {
|
||||
*pDest++ = U16_LEAD(ch);
|
||||
if(pDest < destLimit) {
|
||||
*pDest++ = U16_TRAIL(ch);
|
||||
} else {
|
||||
reqLength = 1;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* surrogate code point, or not a Unicode code point at all */
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/* preflight the remaining string */
|
||||
while(src < srcLimit) {
|
||||
ch = *src++;
|
||||
if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
|
||||
++reqLength;
|
||||
} else if(0x10000 <= ch && ch <= 0x10ffff) {
|
||||
reqLength += 2;
|
||||
} else {
|
||||
/* surrogate code point, or not a Unicode code point at all */
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
do {
|
||||
/* usually "loops" once; twice only for writing subchar */
|
||||
if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
|
||||
if(pDest < destLimit) {
|
||||
*pDest++ = (UChar)ch;
|
||||
} else {
|
||||
++reqLength;
|
||||
}
|
||||
break;
|
||||
} else if(0x10000 <= ch && ch <= 0x10ffff) {
|
||||
if((pDest + 2) <= destLimit) {
|
||||
*pDest++ = U16_LEAD(ch);
|
||||
*pDest++ = U16_TRAIL(ch);
|
||||
} else {
|
||||
reqLength += 2;
|
||||
}
|
||||
break;
|
||||
} else if((ch = subchar) < 0) {
|
||||
/* surrogate code point, or not a Unicode code point at all */
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else {
|
||||
++numSubstitutions;
|
||||
}
|
||||
} while(TRUE);
|
||||
}
|
||||
|
||||
reqLength += (int32_t)(pDest - dest);
|
||||
if(pDestLength) {
|
||||
*pDestLength = reqLength;
|
||||
}
|
||||
if(pNumSubstitutions != NULL) {
|
||||
*pNumSubstitutions = numSubstitutions;
|
||||
}
|
||||
|
||||
/* Terminate the buffer */
|
||||
u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
|
||||
|
@ -118,13 +129,27 @@ u_strFromUTF32(UChar *dest,
|
|||
return dest;
|
||||
}
|
||||
|
||||
/**
 * Convert a UTF-32 string to UTF-16 without substitution.
 *
 * Thin wrapper: forwards dest, destCapacity, pDestLength, src, srcLength and
 * pErrorCode unchanged to u_strFromUTF32WithSub(), passing U_SENTINEL as
 * subchar and NULL as pNumSubstitutions.
 * Per the u_strFromUTF32WithSub() documentation, subchar==U_SENTINEL means an
 * illegal input sequence is reported with U_INVALID_CHAR_FOUND instead of
 * being replaced by a substitution character.
 *
 * @return Whatever u_strFromUTF32WithSub() returns for these arguments.
 */
U_CAPI UChar* U_EXPORT2
|
||||
u_strFromUTF32(UChar *dest,
|
||||
int32_t destCapacity,
|
||||
int32_t *pDestLength,
|
||||
const UChar32 *src,
|
||||
int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
return u_strFromUTF32WithSub(
|
||||
dest, destCapacity, pDestLength,
|
||||
src, srcLength,
|
||||
/* no substitution character, and no substitution counter requested */
U_SENTINEL, NULL,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
U_CAPI UChar32* U_EXPORT2
|
||||
u_strToUTF32(UChar32 *dest,
|
||||
u_strToUTF32WithSub(UChar32 *dest,
|
||||
int32_t destCapacity,
|
||||
int32_t *pDestLength,
|
||||
const UChar *src,
|
||||
const UChar *src,
|
||||
int32_t srcLength,
|
||||
UChar32 subchar, int32_t *pNumSubstitutions,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *srcLimit;
|
||||
UChar32 ch;
|
||||
|
@ -132,25 +157,37 @@ u_strToUTF32(UChar32 *dest,
|
|||
UChar32 *destLimit;
|
||||
UChar32 *pDest;
|
||||
int32_t reqLength;
|
||||
int32_t numSubstitutions;
|
||||
|
||||
/* args check */
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
return NULL;
|
||||
}
|
||||
if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) {
|
||||
if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) ||
|
||||
subchar > 0x10ffff || U_IS_SURROGATE(subchar)
|
||||
) {
|
||||
*pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(pNumSubstitutions != NULL) {
|
||||
*pNumSubstitutions = 0;
|
||||
}
|
||||
|
||||
pDest = dest;
|
||||
destLimit = dest + destCapacity;
|
||||
reqLength = 0;
|
||||
numSubstitutions = 0;
|
||||
|
||||
if(srcLength < 0) {
|
||||
/* simple loop for conversion of a NUL-terminated BMP string */
|
||||
while((ch=*src) != 0 && !U16_IS_SURROGATE(ch) && pDest < destLimit) {
|
||||
while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
|
||||
++src;
|
||||
*pDest++ = ch;
|
||||
if(pDest < destLimit) {
|
||||
*pDest++ = ch;
|
||||
} else {
|
||||
++reqLength;
|
||||
}
|
||||
}
|
||||
srcLimit = src;
|
||||
if(ch != 0) {
|
||||
|
@ -162,47 +199,55 @@ u_strToUTF32(UChar32 *dest,
|
|||
}
|
||||
|
||||
/* convert with length */
|
||||
while(src < srcLimit && pDest < destLimit) {
|
||||
ch = *src++;
|
||||
if(!U16_IS_SURROGATE(ch)) {
|
||||
/* write ch below */
|
||||
} else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
|
||||
++src;
|
||||
ch = U16_GET_SUPPLEMENTARY(ch, ch2);
|
||||
} else {
|
||||
/* unpaired surrogate */
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
}
|
||||
*pDest++ = ch;
|
||||
}
|
||||
|
||||
/* preflight the remaining string */
|
||||
while(src < srcLimit) {
|
||||
ch = *src++;
|
||||
if(!U16_IS_SURROGATE(ch)) {
|
||||
/* ++reqLength below */
|
||||
} else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(*src)) {
|
||||
/* write or count ch below */
|
||||
} else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
|
||||
++src;
|
||||
} else {
|
||||
ch = U16_GET_SUPPLEMENTARY(ch, ch2);
|
||||
} else if((ch = subchar) < 0) {
|
||||
/* unpaired surrogate */
|
||||
*pErrorCode = U_INVALID_CHAR_FOUND;
|
||||
return NULL;
|
||||
} else {
|
||||
++numSubstitutions;
|
||||
}
|
||||
if(pDest < destLimit) {
|
||||
*pDest++ = ch;
|
||||
} else {
|
||||
++reqLength;
|
||||
}
|
||||
++reqLength;
|
||||
}
|
||||
|
||||
reqLength += (int32_t)(pDest - dest);
|
||||
if(pDestLength) {
|
||||
*pDestLength = reqLength;
|
||||
}
|
||||
if(pNumSubstitutions != NULL) {
|
||||
*pNumSubstitutions = numSubstitutions;
|
||||
}
|
||||
|
||||
/* Terminate the buffer */
|
||||
u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
|
||||
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
 * Convert a UTF-16 string to UTF-32 without substitution.
 *
 * Thin wrapper: forwards dest, destCapacity, pDestLength, src, srcLength and
 * pErrorCode unchanged to u_strToUTF32WithSub(), passing U_SENTINEL as
 * subchar and NULL as pNumSubstitutions.
 * Per the u_strToUTF32WithSub() documentation, subchar==U_SENTINEL means an
 * illegal input sequence is reported with U_INVALID_CHAR_FOUND instead of
 * being replaced by a substitution character.
 *
 * @return Whatever u_strToUTF32WithSub() returns for these arguments.
 */
U_CAPI UChar32* U_EXPORT2
|
||||
u_strToUTF32(UChar32 *dest,
|
||||
int32_t destCapacity,
|
||||
int32_t *pDestLength,
|
||||
const UChar *src,
|
||||
int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
return u_strToUTF32WithSub(
|
||||
dest, destCapacity, pDestLength,
|
||||
src, srcLength,
|
||||
/* no substitution character, and no substitution counter requested */
U_SENTINEL, NULL,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
/* for utf8_nextCharSafeBodyTerminated() */
|
||||
static const UChar32
|
||||
utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
|
||||
|
@ -372,6 +417,9 @@ u_strFromUTF8WithSub(UChar *dest,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if(pNumSubstitutions!=NULL) {
|
||||
*pNumSubstitutions=0;
|
||||
}
|
||||
numSubstitutions=0;
|
||||
|
||||
/*
|
||||
|
@ -948,6 +996,9 @@ u_strToUTF8WithSub(char *dest,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if(pNumSubstitutions!=NULL) {
|
||||
*pNumSubstitutions=0;
|
||||
}
|
||||
numSubstitutions=0;
|
||||
|
||||
if(srcLength==-1) {
|
||||
|
|
|
@ -202,10 +202,13 @@ static void Test_strToUTF32_surrogates() {
|
|||
UErrorCode err = U_ZERO_ERROR;
|
||||
UChar32 u32Target[400];
|
||||
int32_t len16, u32DestLen;
|
||||
int32_t numSubstitutions;
|
||||
int i;
|
||||
|
||||
static const UChar surr16[] = { 0x41, 0xd900, 0x61, 0xdc00, 0x5a, 0xd900, 0xdc00, 0x7a, 0 };
|
||||
static const UChar32 expected[] = { 0x5a, 0x50000, 0x7a, 0 };
|
||||
static const UChar32 expected_FFFD[] = { 0x41, 0xfffd, 0x61, 0xfffd, 0x5a, 0x50000, 0x7a, 0 };
|
||||
static const UChar32 expected_12345[] = { 0x41, 0x12345, 0x61, 0x12345, 0x5a, 0x50000, 0x7a, 0 };
|
||||
len16 = LENGTHOF(surr16);
|
||||
for(i = 0; i < 4; ++i) {
|
||||
err = U_ZERO_ERROR;
|
||||
|
@ -272,6 +275,40 @@ static void Test_strToUTF32_surrogates() {
|
|||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
/* with substitution character */
|
||||
numSubstitutions = -1;
|
||||
err = U_ZERO_ERROR;
|
||||
u_strToUTF32WithSub(u32Target, 0, &u32DestLen, surr16, len16-1, 0xfffd, &numSubstitutions, &err);
|
||||
if(err != U_BUFFER_OVERFLOW_ERROR || u32DestLen != 7 || numSubstitutions != 2) {
|
||||
log_err("u_strToUTF32WithSub(preflight surr16) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = U_ZERO_ERROR;
|
||||
u_strToUTF32WithSub(u32Target, LENGTHOF(u32Target), &u32DestLen, surr16, len16-1, 0xfffd, &numSubstitutions, &err);
|
||||
if(err != U_ZERO_ERROR || u32DestLen != 7 || numSubstitutions != 2 || uprv_memcmp(u32Target, expected_FFFD, 8*4)) {
|
||||
log_err("u_strToUTF32WithSub(surr16) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = U_ZERO_ERROR;
|
||||
u_strToUTF32WithSub(NULL, 0, &u32DestLen, surr16, -1, 0x12345, &numSubstitutions, &err);
|
||||
if(err != U_BUFFER_OVERFLOW_ERROR || u32DestLen != 7 || numSubstitutions != 2) {
|
||||
log_err("u_strToUTF32WithSub(preflight surr16/NUL) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = U_ZERO_ERROR;
|
||||
u_strToUTF32WithSub(u32Target, LENGTHOF(u32Target), &u32DestLen, surr16, -1, 0x12345, &numSubstitutions, &err);
|
||||
if(err != U_ZERO_ERROR || u32DestLen != 7 || numSubstitutions != 2 || uprv_memcmp(u32Target, expected_12345, 8*4)) {
|
||||
log_err("u_strToUTF32WithSub(surr16/NUL) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static void Test_strFromUTF32(void){
|
||||
|
@ -345,10 +382,14 @@ static void Test_strFromUTF32_surrogates() {
|
|||
UErrorCode err = U_ZERO_ERROR;
|
||||
UChar uTarget[400];
|
||||
int32_t len32, uDestLen;
|
||||
int32_t numSubstitutions;
|
||||
int i;
|
||||
|
||||
static const UChar32 surr32[] = { 0x41, 0xd900, 0x61, 0xdc00, -1, 0x110000, 0x5a, 0x50000, 0x7a, 0 };
|
||||
static const UChar expected[] = { 0x5a, 0xd900, 0xdc00, 0x7a, 0 };
|
||||
static const UChar expected_FFFD[] = { 0x41, 0xfffd, 0x61, 0xfffd, 0xfffd, 0xfffd, 0x5a, 0xd900, 0xdc00, 0x7a, 0 };
|
||||
static const UChar expected_12345[] = { 0x41, 0xd808, 0xdf45, 0x61, 0xd808, 0xdf45, 0xd808, 0xdf45, 0xd808, 0xdf45,
|
||||
0x5a, 0xd900, 0xdc00, 0x7a, 0 };
|
||||
len32 = LENGTHOF(surr32);
|
||||
for(i = 0; i < 6; ++i) {
|
||||
err = U_ZERO_ERROR;
|
||||
|
@ -415,6 +456,40 @@ static void Test_strFromUTF32_surrogates() {
|
|||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
/* with substitution character */
|
||||
numSubstitutions = -1;
|
||||
err = U_ZERO_ERROR;
|
||||
u_strFromUTF32WithSub(uTarget, 0, &uDestLen, surr32, len32-1, 0xfffd, &numSubstitutions, &err);
|
||||
if(err != U_BUFFER_OVERFLOW_ERROR || uDestLen != 10 || numSubstitutions != 4) {
|
||||
log_err("u_strFromUTF32WithSub(preflight surr32) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = U_ZERO_ERROR;
|
||||
u_strFromUTF32WithSub(uTarget, LENGTHOF(uTarget), &uDestLen, surr32, len32-1, 0xfffd, &numSubstitutions, &err);
|
||||
if(err != U_ZERO_ERROR || uDestLen != 10 || numSubstitutions != 4 || u_memcmp(uTarget, expected_FFFD, 11)) {
|
||||
log_err("u_strFromUTF32WithSub(surr32) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = U_ZERO_ERROR;
|
||||
u_strFromUTF32WithSub(NULL, 0, &uDestLen, surr32, -1, 0x12345, &numSubstitutions, &err);
|
||||
if(err != U_BUFFER_OVERFLOW_ERROR || uDestLen != 14 || numSubstitutions != 4) {
|
||||
log_err("u_strFromUTF32WithSub(preflight surr32/NUL) sets %s != U_BUFFER_OVERFLOW_ERROR or an unexpected length\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
|
||||
err = U_ZERO_ERROR;
|
||||
u_strFromUTF32WithSub(uTarget, LENGTHOF(uTarget), &uDestLen, surr32, -1, 0x12345, &numSubstitutions, &err);
|
||||
if(err != U_ZERO_ERROR || uDestLen != 14 || numSubstitutions != 4 || u_memcmp(uTarget, expected_12345, 15)) {
|
||||
log_err("u_strFromUTF32WithSub(surr32/NUL) sets %s != U_ZERO_ERROR or does not produce the expected string\n",
|
||||
u_errorName(err));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static void Test_UChar_UTF8_API(void){
|
||||
|
|
Loading…
Add table
Reference in a new issue