mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-1967 tighten utf-8 code, must not encode surrogate code points (unpaired surrogates) per unicode 3.2
X-SVN-Rev: 8990
This commit is contained in:
parent
e3efed98e9
commit
2d5114e756
5 changed files with 224 additions and 93 deletions
|
@ -222,7 +222,9 @@ morebytes:
|
|||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=4 it is 0x10ffff<utf8_minChar32[])
|
||||
* - single surrogate code points are legal but irregular (also cause a callback)
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
*/
|
||||
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && !UTF_IS_SURROGATE(ch))
|
||||
{
|
||||
|
@ -254,12 +256,10 @@ morebytes:
|
|||
}
|
||||
else
|
||||
{
|
||||
UConverterCallbackReason reason =
|
||||
i == inBytes && i == 3 && UTF_IS_SURROGATE(ch) ? UCNV_IRREGULAR : UCNV_ILLEGAL;
|
||||
args->source = (const char *) mySource;
|
||||
args->target = myTarget;
|
||||
args->converter->invalidCharLength = (int8_t)i;
|
||||
if (T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err))
|
||||
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
break;
|
||||
|
@ -383,7 +383,9 @@ morebytes:
|
|||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=4 it is 0x10ffff<utf8_minChar32[])
|
||||
* - single surrogate code points are legal but irregular (also cause a callback)
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
*/
|
||||
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && !UTF_IS_SURROGATE(ch))
|
||||
{
|
||||
|
@ -417,8 +419,6 @@ morebytes:
|
|||
}
|
||||
else
|
||||
{
|
||||
UConverterCallbackReason reason =
|
||||
i == inBytes && i == 3 && UTF_IS_SURROGATE(ch) ? UCNV_IRREGULAR : UCNV_ILLEGAL;
|
||||
UBool useOffset;
|
||||
|
||||
args->source = (const char *) mySource;
|
||||
|
@ -426,7 +426,7 @@ morebytes:
|
|||
args->offsets = myOffsets;
|
||||
args->converter->invalidCharLength = (int8_t)i;
|
||||
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
|
||||
offsetNum, reason, err))
|
||||
offsetNum, UCNV_ILLEGAL, err))
|
||||
{
|
||||
/* Stop if the error wasn't handled */
|
||||
break;
|
||||
|
@ -481,6 +481,7 @@ donefornow:
|
|||
U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
|
||||
UErrorCode * err)
|
||||
{
|
||||
UConverter *cnv = args->converter;
|
||||
const UChar *mySource = args->source;
|
||||
unsigned char *myTarget = (unsigned char *) args->target;
|
||||
const UChar *sourceLimit = args->sourceLimit;
|
||||
|
@ -489,11 +490,11 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
|
|||
int16_t indexToWrite;
|
||||
char temp[4];
|
||||
|
||||
if (args->converter->fromUnicodeStatus && myTarget < targetLimit)
|
||||
if (cnv->fromUSurrogateLead && myTarget < targetLimit)
|
||||
{
|
||||
ch = args->converter->fromUnicodeStatus;
|
||||
args->converter->fromUnicodeStatus = 0;
|
||||
goto lowsurogate;
|
||||
ch = cnv->fromUSurrogateLead;
|
||||
cnv->fromUSurrogateLead = 0;
|
||||
goto lowsurrogate;
|
||||
}
|
||||
|
||||
while (mySource < sourceLimit && myTarget < targetLimit)
|
||||
|
@ -513,31 +514,86 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
|
|||
}
|
||||
else
|
||||
{
|
||||
args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
|
||||
args->converter->charErrorBufferLength = 1;
|
||||
cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
|
||||
cnv->charErrorBufferLength = 1;
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
/* Check for surogates */
|
||||
/* Check for surrogates */
|
||||
{
|
||||
if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END))
|
||||
{
|
||||
lowsurogate:
|
||||
if (mySource < sourceLimit)
|
||||
{
|
||||
ch2 = *mySource;
|
||||
if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END))
|
||||
{
|
||||
/* If there were two surrogates, combine them otherwise treat them normally */
|
||||
ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
|
||||
mySource++;
|
||||
if(UTF_IS_SURROGATE(ch) /* && not CESU-8 */) {
|
||||
if(UTF_IS_SURROGATE_FIRST(ch)) {
|
||||
lowsurrogate:
|
||||
if (mySource < sourceLimit) {
|
||||
/* test the following code unit */
|
||||
UChar trail=*mySource;
|
||||
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
||||
++mySource;
|
||||
ch=UTF16_GET_PAIR_VALUE(ch, trail);
|
||||
ch2 = 0;
|
||||
/* convert this supplementary code point */
|
||||
/* exit this condition tree */
|
||||
} else {
|
||||
/* this is an unmatched lead code unit (1st surrogate) */
|
||||
/* callback(illegal) */
|
||||
ch2 = ch;
|
||||
}
|
||||
} else {
|
||||
/* no more input */
|
||||
cnv->fromUSurrogateLead = (UChar)ch;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
ch2 = ch;
|
||||
}
|
||||
else if (!args->flush)
|
||||
{
|
||||
args->converter->fromUnicodeStatus = ch;
|
||||
break;
|
||||
|
||||
if(ch2 != 0) {
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
|
||||
/* update the arguments structure */
|
||||
args->source=mySource;
|
||||
args->target=(char *)myTarget;
|
||||
|
||||
/* write the code point as code units */
|
||||
cnv->invalidUCharBuffer[0] = (UChar)ch2;
|
||||
cnv->invalidUCharLength = 1;
|
||||
|
||||
/* call the callback function */
|
||||
cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
ch = cnv->fromUSurrogateLead;
|
||||
cnv->fromUSurrogateLead = 0;
|
||||
|
||||
myTarget=(uint8_t *)args->target;
|
||||
mySource=args->source;
|
||||
|
||||
/*
|
||||
* If the callback overflowed the target, then we need to
|
||||
* stop here with an overflow indication.
|
||||
*/
|
||||
if(*err==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(U_FAILURE(*err)) {
|
||||
/* break on error */
|
||||
break;
|
||||
} else if(cnv->charErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*err=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
/*
|
||||
* } else if(ch != 0) { ...
|
||||
* ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
|
||||
* does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
|
||||
* We would have to check myTarget<targetLimit and goto lowsurrogate?!
|
||||
*/
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -563,7 +619,7 @@ lowsurogate:
|
|||
}
|
||||
else
|
||||
{
|
||||
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
|
||||
cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
}
|
||||
|
@ -574,6 +630,11 @@ lowsurogate:
|
|||
{
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
|
||||
/* a Unicode code point remains incomplete (only a first surrogate) */
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
cnv->fromUSurrogateLead = 0;
|
||||
}
|
||||
|
||||
args->target = (char *) myTarget;
|
||||
args->source = mySource;
|
||||
|
@ -582,21 +643,22 @@ lowsurogate:
|
|||
U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
|
||||
UErrorCode * err)
|
||||
{
|
||||
UConverter *cnv = args->converter;
|
||||
const UChar *mySource = args->source;
|
||||
unsigned char *myTarget = (unsigned char *) args->target;
|
||||
int32_t *myOffsets = args->offsets;
|
||||
const UChar *sourceLimit = args->sourceLimit;
|
||||
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
|
||||
uint32_t ch, ch2;
|
||||
int32_t offsetNum = 0;
|
||||
int32_t offsetNum = 0, nextSourceIndex;
|
||||
int16_t indexToWrite;
|
||||
char temp[4];
|
||||
|
||||
if (args->converter->fromUnicodeStatus && myTarget < targetLimit)
|
||||
if (cnv->fromUSurrogateLead && myTarget < targetLimit)
|
||||
{
|
||||
ch = args->converter->fromUnicodeStatus;
|
||||
args->converter->fromUnicodeStatus = 0;
|
||||
goto lowsurogate;
|
||||
ch = cnv->fromUSurrogateLead;
|
||||
cnv->fromUSurrogateLead = 0;
|
||||
goto lowsurrogate;
|
||||
}
|
||||
|
||||
while (mySource < sourceLimit && myTarget < targetLimit)
|
||||
|
@ -619,31 +681,95 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeA
|
|||
}
|
||||
else
|
||||
{
|
||||
args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
|
||||
args->converter->charErrorBufferLength = 1;
|
||||
cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
|
||||
cnv->charErrorBufferLength = 1;
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
}
|
||||
else
|
||||
/* Check for surogates */
|
||||
/* Check for surrogates */
|
||||
{
|
||||
if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END))
|
||||
{
|
||||
lowsurogate:
|
||||
if (mySource < sourceLimit)
|
||||
{
|
||||
ch2 = *mySource;
|
||||
if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END))
|
||||
{
|
||||
/* If there were two surrogates, combine them otherwise treat them normally */
|
||||
ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
|
||||
mySource++;
|
||||
nextSourceIndex = offsetNum + 1;
|
||||
|
||||
if(UTF_IS_SURROGATE(ch) /* && not CESU-8 */) {
|
||||
if(UTF_IS_SURROGATE_FIRST(ch)) {
|
||||
lowsurrogate:
|
||||
if (mySource < sourceLimit) {
|
||||
/* test the following code unit */
|
||||
UChar trail=*mySource;
|
||||
if(UTF_IS_SECOND_SURROGATE(trail)) {
|
||||
++mySource;
|
||||
++nextSourceIndex;
|
||||
ch=UTF16_GET_PAIR_VALUE(ch, trail);
|
||||
ch2 = 0;
|
||||
/* convert this supplementary code point */
|
||||
/* exit this condition tree */
|
||||
} else {
|
||||
/* this is an unmatched lead code unit (1st surrogate) */
|
||||
/* callback(illegal) */
|
||||
ch2 = ch;
|
||||
}
|
||||
} else {
|
||||
/* no more input */
|
||||
cnv->fromUSurrogateLead = (UChar)ch;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* this is an unmatched trail code unit (2nd surrogate) */
|
||||
/* callback(illegal) */
|
||||
ch2 = ch;
|
||||
}
|
||||
else if (!args->flush)
|
||||
{
|
||||
args->converter->fromUnicodeStatus = ch;
|
||||
break;
|
||||
|
||||
if(ch2 != 0) {
|
||||
/* call the callback function with all the preparations and post-processing */
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
|
||||
/* update the arguments structure */
|
||||
args->source=mySource;
|
||||
args->target=(char *)myTarget;
|
||||
args->offsets=myOffsets;
|
||||
|
||||
/* write the code point as code units */
|
||||
cnv->invalidUCharBuffer[0] = (UChar)ch2;
|
||||
cnv->invalidUCharLength = 1;
|
||||
|
||||
/* call the callback function */
|
||||
cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
|
||||
|
||||
/* get the converter state from UConverter */
|
||||
ch = cnv->fromUSurrogateLead;
|
||||
cnv->fromUSurrogateLead = 0;
|
||||
|
||||
/* update target and deal with offsets if necessary */
|
||||
myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum);
|
||||
myTarget=(uint8_t *)args->target;
|
||||
|
||||
/* update the source pointer and index */
|
||||
offsetNum=nextSourceIndex+(args->source-mySource);
|
||||
mySource=args->source;
|
||||
|
||||
/*
|
||||
* If the callback overflowed the target, then we need to
|
||||
* stop here with an overflow indication.
|
||||
*/
|
||||
if(*err==U_BUFFER_OVERFLOW_ERROR) {
|
||||
break;
|
||||
} else if(U_FAILURE(*err)) {
|
||||
/* break on error */
|
||||
break;
|
||||
} else if(cnv->charErrorBufferLength>0) {
|
||||
/* target is full */
|
||||
*err=U_BUFFER_OVERFLOW_ERROR;
|
||||
break;
|
||||
/*
|
||||
* } else if(ch != 0) { ...
|
||||
* ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
|
||||
* does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
|
||||
* We would have to check myTarget<targetLimit and goto lowsurrogate?!
|
||||
*/
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -670,11 +796,11 @@ lowsurogate:
|
|||
}
|
||||
else
|
||||
{
|
||||
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
|
||||
cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
}
|
||||
offsetNum += (ch >= 0x10000) + 1;
|
||||
offsetNum = nextSourceIndex;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -682,6 +808,11 @@ lowsurogate:
|
|||
{
|
||||
*err = U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
|
||||
/* a Unicode code point remains incomplete (only a first surrogate) */
|
||||
*err = U_TRUNCATED_CHAR_FOUND;
|
||||
cnv->fromUSurrogateLead = 0;
|
||||
}
|
||||
|
||||
args->target = (char *) myTarget;
|
||||
args->source = mySource;
|
||||
|
@ -693,7 +824,6 @@ U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
|||
UChar buffer[2];
|
||||
char const *sourceInitial;
|
||||
UChar* myUCharPtr;
|
||||
UConverterCallbackReason reason;
|
||||
uint16_t extraBytesToWrite;
|
||||
uint8_t myByte;
|
||||
UChar32 ch;
|
||||
|
@ -777,7 +907,9 @@ U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
|
|||
* - encode a code point <= U+10ffff
|
||||
* - use the fewest possible number of bytes for their code points
|
||||
* - use at most 4 bytes (for i>=4 it is 0x10ffff<utf8_minChar32[])
|
||||
* - single surrogate code points are legal but irregular (also cause a callback)
|
||||
*
|
||||
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
|
||||
* There are no irregular sequences any more.
|
||||
*/
|
||||
if (isLegalSequence && (uint32_t)ch <= MAXIMUM_UTF && (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && !UTF_IS_SURROGATE(ch)) {
|
||||
return ch; /* return the code point */
|
||||
|
@ -789,20 +921,14 @@ CALL_ERROR_FUNCTION:
|
|||
uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite);
|
||||
|
||||
myUCharPtr = buffer;
|
||||
if (isLegalSequence && extraBytesToWrite == 3 && UTF_IS_SURROGATE(ch)) {
|
||||
reason = UCNV_IRREGULAR;
|
||||
*err = U_INVALID_CHAR_FOUND;
|
||||
} else {
|
||||
reason = UCNV_ILLEGAL;
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
}
|
||||
*err = U_ILLEGAL_CHAR_FOUND;
|
||||
args->target = myUCharPtr;
|
||||
args->targetLimit = buffer + 2;
|
||||
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
|
||||
args,
|
||||
sourceInitial,
|
||||
extraBytesToWrite,
|
||||
reason,
|
||||
UCNV_ILLEGAL,
|
||||
err);
|
||||
|
||||
if(U_SUCCESS(*err)) {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2001, International Business Machines
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -127,11 +127,13 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
|||
* that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
|
||||
*
|
||||
* Starting with Unicode 3.0.1, non-shortest forms are illegal.
|
||||
* Starting with Unicode 3.2, surrogate code points must not be
|
||||
* encoded in UTF-8, and there are no irregular sequences any more.
|
||||
*/
|
||||
|
||||
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
|
||||
/* illegal is also set if count>=4 */
|
||||
if(illegal || (c)<utf8_minLegal[count]) {
|
||||
if(illegal || (c)<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
|
||||
/* error handling */
|
||||
uint8_t errorCount=count;
|
||||
/* don't go beyond this sequence */
|
||||
|
@ -141,8 +143,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
|
|||
--count;
|
||||
}
|
||||
c=utf8_errorValue[errorCount-count];
|
||||
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) {
|
||||
/* irregular sequence */
|
||||
} else if((strict) && UTF_IS_UNICODE_NONCHAR(c)) {
|
||||
/* strict: forbid non-characters like U+fffe */
|
||||
c=utf8_errorValue[count];
|
||||
}
|
||||
} else /* too few bytes left */ {
|
||||
|
@ -167,7 +169,8 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
|
|||
return i;
|
||||
}
|
||||
} else if((uint32_t)(c)<=0xffff) {
|
||||
if((i)+2<(length)) {
|
||||
/* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
|
||||
if((i)+2<(length) && !UTF_IS_SURROGATE(c)) {
|
||||
(s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
|
||||
(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
|
||||
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
|
||||
|
@ -225,8 +228,8 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
|
|||
*pi=i;
|
||||
UTF8_MASK_LEAD_BYTE(b, count);
|
||||
c|=(UChar32)b<<shift;
|
||||
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (strict && !UTF_IS_UNICODE_CHAR(c))) {
|
||||
/* illegal or irregular sequence */
|
||||
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c) || (strict && UTF_IS_UNICODE_NONCHAR(c))) {
|
||||
/* illegal sequence or (strict and non-character) */
|
||||
if(count>=4) {
|
||||
count=3;
|
||||
}
|
||||
|
|
|
@ -1102,21 +1102,6 @@ static void TestStop(int32_t inputsize, int32_t outputsize)
|
|||
log_err("u-> iscii with stop did not match.\n");
|
||||
|
||||
|
||||
}
|
||||
log_verbose("Testing fromUnicode for UTF-8 with UCNV_FROM_U_CALLBACK_STOP \n");
|
||||
{
|
||||
static const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801, 0xffff, 0x0061,};
|
||||
static const uint8_t expectedUTF8[]= { 0xe2, 0x82, 0xac,
|
||||
0xf0, 0x90, 0x90, 0x81,
|
||||
0xed, 0xb0, 0x81, 0xed, 0xa0, 0x81,
|
||||
0xef, 0xbf, 0xbf, 0x61,
|
||||
|
||||
};
|
||||
static const int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 };
|
||||
if(!testConvertFromUnicode(testinput, sizeof(testinput)/sizeof(testinput[0]),
|
||||
expectedUTF8, sizeof(expectedUTF8), "utf8",
|
||||
UCNV_FROM_U_CALLBACK_STOP, offsets, NULL, 0 ))
|
||||
log_err("u-> utf8 with stop did not match.\n");
|
||||
}
|
||||
log_verbose("Testing fromUnicode for SCSU with UCNV_FROM_U_CALLBACK_STOP \n");
|
||||
{
|
||||
|
@ -1364,6 +1349,23 @@ static void TestSub(int32_t inputsize, int32_t outputsize)
|
|||
log_err("u-> SCSU with substitute did not match.\n");
|
||||
}
|
||||
|
||||
log_verbose("Testing fromUnicode for UTF-8 with UCNV_FROM_U_CALLBACK_SUBSTITUTE\n");
|
||||
{
|
||||
static const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801, 0xffff, 0x0061,};
|
||||
static const uint8_t expectedUTF8[]= { 0xe2, 0x82, 0xac,
|
||||
0xf0, 0x90, 0x90, 0x81,
|
||||
0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd,
|
||||
0xef, 0xbf, 0xbf, 0x61,
|
||||
|
||||
};
|
||||
static const int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 };
|
||||
if(!testConvertFromUnicode(testinput, sizeof(testinput)/sizeof(testinput[0]),
|
||||
expectedUTF8, sizeof(expectedUTF8), "utf8",
|
||||
UCNV_FROM_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0 )) {
|
||||
log_err("u-> utf8 with stop did not match.\n");
|
||||
}
|
||||
}
|
||||
|
||||
log_verbose("Testing fromUnicode for UTF-16 with UCNV_FROM_U_CALLBACK_SUBSTITUTE\n");
|
||||
{
|
||||
static const UChar in[]={ 0x0041, 0xfeff };
|
||||
|
|
|
@ -730,12 +730,12 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize )
|
|||
|
||||
log_verbose("Test surrogate behaviour for UTF8\n");
|
||||
{
|
||||
const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801};
|
||||
const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01 };
|
||||
const uint8_t expectedUTF8test2[]= { 0xe2, 0x82, 0xac,
|
||||
0xf0, 0x90, 0x90, 0x81,
|
||||
0xed, 0xb0, 0x81, 0xed, 0xa0, 0x81
|
||||
0xef, 0xbf, 0xbd
|
||||
};
|
||||
int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4 };
|
||||
int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3 };
|
||||
if(!testConvertFromU(testinput, sizeof(testinput)/sizeof(testinput[0]),
|
||||
expectedUTF8test2, sizeof(expectedUTF8test2), "UTF8", offsets,FALSE ))
|
||||
log_err("u-> UTF8 did not match.\n");
|
||||
|
|
|
@ -423,7 +423,7 @@ static void TestAppendChar(){
|
|||
0, 0x10401,
|
||||
2, 0x0028,
|
||||
3, 0x7f,
|
||||
3, 0xd801,
|
||||
3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
|
||||
1, 0x20402,
|
||||
9, 0x10401,
|
||||
5, 0xc0,
|
||||
|
@ -490,7 +490,7 @@ static void TestAppendChar(){
|
|||
{0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
|
||||
{0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
|
||||
{0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
|
||||
{0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
|
||||
{0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
|
||||
{0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
|
||||
{0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc0, 0x9f*/
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue