ICU-1967 tighten UTF-8 code: surrogate code points (unpaired surrogates) must not be encoded in UTF-8, per Unicode 3.2

X-SVN-Rev: 8990
This commit is contained in:
Markus Scherer 2002-07-02 00:51:16 +00:00
parent e3efed98e9
commit 2d5114e756
5 changed files with 224 additions and 93 deletions

View file

@ -222,7 +222,9 @@ morebytes:
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=4 it is 0x10ffff<utf8_minChar32[])
* - single surrogate code points are legal but irregular (also cause a callback)
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
*/
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && !UTF_IS_SURROGATE(ch))
{
@ -254,12 +256,10 @@ morebytes:
}
else
{
UConverterCallbackReason reason =
i == inBytes && i == 3 && UTF_IS_SURROGATE(ch) ? UCNV_IRREGULAR : UCNV_ILLEGAL;
args->source = (const char *) mySource;
args->target = myTarget;
args->converter->invalidCharLength = (int8_t)i;
if (T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err))
if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
break;
@ -383,7 +383,9 @@ morebytes:
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=4 it is 0x10ffff<utf8_minChar32[])
* - single surrogate code points are legal but irregular (also cause a callback)
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
*/
if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && !UTF_IS_SURROGATE(ch))
{
@ -417,8 +419,6 @@ morebytes:
}
else
{
UConverterCallbackReason reason =
i == inBytes && i == 3 && UTF_IS_SURROGATE(ch) ? UCNV_IRREGULAR : UCNV_ILLEGAL;
UBool useOffset;
args->source = (const char *) mySource;
@ -426,7 +426,7 @@ morebytes:
args->offsets = myOffsets;
args->converter->invalidCharLength = (int8_t)i;
if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args,
offsetNum, reason, err))
offsetNum, UCNV_ILLEGAL, err))
{
/* Stop if the error wasn't handled */
break;
@ -481,6 +481,7 @@ donefornow:
U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
UConverter *cnv = args->converter;
const UChar *mySource = args->source;
unsigned char *myTarget = (unsigned char *) args->target;
const UChar *sourceLimit = args->sourceLimit;
@ -489,11 +490,11 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
int16_t indexToWrite;
char temp[4];
if (args->converter->fromUnicodeStatus && myTarget < targetLimit)
if (cnv->fromUSurrogateLead && myTarget < targetLimit)
{
ch = args->converter->fromUnicodeStatus;
args->converter->fromUnicodeStatus = 0;
goto lowsurogate;
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
goto lowsurrogate;
}
while (mySource < sourceLimit && myTarget < targetLimit)
@ -513,31 +514,86 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
}
else
{
args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
args->converter->charErrorBufferLength = 1;
cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
cnv->charErrorBufferLength = 1;
*err = U_BUFFER_OVERFLOW_ERROR;
}
}
else
/* Check for surogates */
/* Check for surrogates */
{
if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END))
{
lowsurogate:
if (mySource < sourceLimit)
{
ch2 = *mySource;
if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END))
{
/* If there were two surrogates, combine them otherwise treat them normally */
ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
mySource++;
if(UTF_IS_SURROGATE(ch) /* && not CESU-8 */) {
if(UTF_IS_SURROGATE_FIRST(ch)) {
lowsurrogate:
if (mySource < sourceLimit) {
/* test the following code unit */
UChar trail=*mySource;
if(UTF_IS_SECOND_SURROGATE(trail)) {
++mySource;
ch=UTF16_GET_PAIR_VALUE(ch, trail);
ch2 = 0;
/* convert this supplementary code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
ch2 = ch;
}
} else {
/* no more input */
cnv->fromUSurrogateLead = (UChar)ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
ch2 = ch;
}
else if (!args->flush)
{
args->converter->fromUnicodeStatus = ch;
break;
if(ch2 != 0) {
/* call the callback function with all the preparations and post-processing */
*err = U_ILLEGAL_CHAR_FOUND;
/* update the arguments structure */
args->source=mySource;
args->target=(char *)myTarget;
/* write the code point as code units */
cnv->invalidUCharBuffer[0] = (UChar)ch2;
cnv->invalidUCharLength = 1;
/* call the callback function */
cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
/* get the converter state from UConverter */
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
myTarget=(uint8_t *)args->target;
mySource=args->source;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*err==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*err)) {
/* break on error */
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*err=U_BUFFER_OVERFLOW_ERROR;
break;
/*
* } else if(ch != 0) { ...
* ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
* does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
* We would have to check myTarget<targetLimit and goto lowsurrogate?!
*/
}
continue;
}
}
@ -563,7 +619,7 @@ lowsurogate:
}
else
{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
*err = U_BUFFER_OVERFLOW_ERROR;
}
}
@ -574,6 +630,11 @@ lowsurogate:
{
*err = U_BUFFER_OVERFLOW_ERROR;
}
if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
/* a Unicode code point remains incomplete (only a first surrogate) */
*err = U_TRUNCATED_CHAR_FOUND;
cnv->fromUSurrogateLead = 0;
}
args->target = (char *) myTarget;
args->source = mySource;
@ -582,21 +643,22 @@ lowsurogate:
U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
UErrorCode * err)
{
UConverter *cnv = args->converter;
const UChar *mySource = args->source;
unsigned char *myTarget = (unsigned char *) args->target;
int32_t *myOffsets = args->offsets;
const UChar *sourceLimit = args->sourceLimit;
const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
uint32_t ch, ch2;
int32_t offsetNum = 0;
int32_t offsetNum = 0, nextSourceIndex;
int16_t indexToWrite;
char temp[4];
if (args->converter->fromUnicodeStatus && myTarget < targetLimit)
if (cnv->fromUSurrogateLead && myTarget < targetLimit)
{
ch = args->converter->fromUnicodeStatus;
args->converter->fromUnicodeStatus = 0;
goto lowsurogate;
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
goto lowsurrogate;
}
while (mySource < sourceLimit && myTarget < targetLimit)
@ -619,31 +681,95 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeA
}
else
{
args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
args->converter->charErrorBufferLength = 1;
cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80);
cnv->charErrorBufferLength = 1;
*err = U_BUFFER_OVERFLOW_ERROR;
}
}
else
/* Check for surogates */
/* Check for surrogates */
{
if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END))
{
lowsurogate:
if (mySource < sourceLimit)
{
ch2 = *mySource;
if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END))
{
/* If there were two surrogates, combine them otherwise treat them normally */
ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
mySource++;
nextSourceIndex = offsetNum + 1;
if(UTF_IS_SURROGATE(ch) /* && not CESU-8 */) {
if(UTF_IS_SURROGATE_FIRST(ch)) {
lowsurrogate:
if (mySource < sourceLimit) {
/* test the following code unit */
UChar trail=*mySource;
if(UTF_IS_SECOND_SURROGATE(trail)) {
++mySource;
++nextSourceIndex;
ch=UTF16_GET_PAIR_VALUE(ch, trail);
ch2 = 0;
/* convert this supplementary code point */
/* exit this condition tree */
} else {
/* this is an unmatched lead code unit (1st surrogate) */
/* callback(illegal) */
ch2 = ch;
}
} else {
/* no more input */
cnv->fromUSurrogateLead = (UChar)ch;
break;
}
} else {
/* this is an unmatched trail code unit (2nd surrogate) */
/* callback(illegal) */
ch2 = ch;
}
else if (!args->flush)
{
args->converter->fromUnicodeStatus = ch;
break;
if(ch2 != 0) {
/* call the callback function with all the preparations and post-processing */
*err = U_ILLEGAL_CHAR_FOUND;
/* update the arguments structure */
args->source=mySource;
args->target=(char *)myTarget;
args->offsets=myOffsets;
/* write the code point as code units */
cnv->invalidUCharBuffer[0] = (UChar)ch2;
cnv->invalidUCharLength = 1;
/* call the callback function */
cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err);
/* get the converter state from UConverter */
ch = cnv->fromUSurrogateLead;
cnv->fromUSurrogateLead = 0;
/* update target and deal with offsets if necessary */
myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum);
myTarget=(uint8_t *)args->target;
/* update the source pointer and index */
offsetNum=nextSourceIndex+(args->source-mySource);
mySource=args->source;
/*
* If the callback overflowed the target, then we need to
* stop here with an overflow indication.
*/
if(*err==U_BUFFER_OVERFLOW_ERROR) {
break;
} else if(U_FAILURE(*err)) {
/* break on error */
break;
} else if(cnv->charErrorBufferLength>0) {
/* target is full */
*err=U_BUFFER_OVERFLOW_ERROR;
break;
/*
* } else if(ch != 0) { ...
* ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c)
* does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 .
* We would have to check myTarget<targetLimit and goto lowsurrogate?!
*/
}
continue;
}
}
@ -670,11 +796,11 @@ lowsurogate:
}
else
{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite];
*err = U_BUFFER_OVERFLOW_ERROR;
}
}
offsetNum += (ch >= 0x10000) + 1;
offsetNum = nextSourceIndex;
}
}
@ -682,6 +808,11 @@ lowsurogate:
{
*err = U_BUFFER_OVERFLOW_ERROR;
}
if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) {
/* a Unicode code point remains incomplete (only a first surrogate) */
*err = U_TRUNCATED_CHAR_FOUND;
cnv->fromUSurrogateLead = 0;
}
args->target = (char *) myTarget;
args->source = mySource;
@ -693,7 +824,6 @@ U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
UChar buffer[2];
char const *sourceInitial;
UChar* myUCharPtr;
UConverterCallbackReason reason;
uint16_t extraBytesToWrite;
uint8_t myByte;
UChar32 ch;
@ -777,7 +907,9 @@ U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
* - encode a code point <= U+10ffff
* - use the fewest possible number of bytes for their code points
* - use at most 4 bytes (for i>=4 it is 0x10ffff<utf8_minChar32[])
* - single surrogate code points are legal but irregular (also cause a callback)
*
* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
* There are no irregular sequences any more.
*/
if (isLegalSequence && (uint32_t)ch <= MAXIMUM_UTF && (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && !UTF_IS_SURROGATE(ch)) {
return ch; /* return the code point */
@ -789,20 +921,14 @@ CALL_ERROR_FUNCTION:
uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite);
myUCharPtr = buffer;
if (isLegalSequence && extraBytesToWrite == 3 && UTF_IS_SURROGATE(ch)) {
reason = UCNV_IRREGULAR;
*err = U_INVALID_CHAR_FOUND;
} else {
reason = UCNV_ILLEGAL;
*err = U_ILLEGAL_CHAR_FOUND;
}
*err = U_ILLEGAL_CHAR_FOUND;
args->target = myUCharPtr;
args->targetLimit = buffer + 2;
args->converter->fromCharErrorBehaviour(args->converter->toUContext,
args,
sourceInitial,
extraBytesToWrite,
reason,
UCNV_ILLEGAL,
err);
if(U_SUCCESS(*err)) {

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1999-2001, International Business Machines
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -127,11 +127,13 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
* that needs count bytes so that UTF8_GET_CHAR_SAFE() works right.
*
* Starting with Unicode 3.0.1, non-shortest forms are illegal.
* Starting with Unicode 3.2, surrogate code points must not be
* encoded in UTF-8, and there are no irregular sequences any more.
*/
/* correct sequence - all trail bytes have (b7..b6)==(10)? */
/* illegal is also set if count>=4 */
if(illegal || (c)<utf8_minLegal[count]) {
if(illegal || (c)<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
/* error handling */
uint8_t errorCount=count;
/* don't go beyond this sequence */
@ -141,8 +143,8 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c,
--count;
}
c=utf8_errorValue[errorCount-count];
} else if((strict) && !UTF_IS_UNICODE_CHAR(c)) {
/* irregular sequence */
} else if((strict) && UTF_IS_UNICODE_NONCHAR(c)) {
/* strict: forbid non-characters like U+fffe */
c=utf8_errorValue[count];
}
} else /* too few bytes left */ {
@ -167,7 +169,8 @@ utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c) {
return i;
}
} else if((uint32_t)(c)<=0xffff) {
if((i)+2<(length)) {
/* Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. */
if((i)+2<(length) && !UTF_IS_SURROGATE(c)) {
(s)[(i)++]=(uint8_t)(((c)>>12)|0xe0);
(s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80);
(s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80);
@ -225,8 +228,8 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U
*pi=i;
UTF8_MASK_LEAD_BYTE(b, count);
c|=(UChar32)b<<shift;
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || (strict && !UTF_IS_UNICODE_CHAR(c))) {
/* illegal or irregular sequence */
if(count>=4 || c>0x10ffff || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c) || (strict && UTF_IS_UNICODE_NONCHAR(c))) {
/* illegal sequence or (strict and non-character) */
if(count>=4) {
count=3;
}

View file

@ -1102,21 +1102,6 @@ static void TestStop(int32_t inputsize, int32_t outputsize)
log_err("u-> iscii with stop did not match.\n");
}
log_verbose("Testing fromUnicode for UTF-8 with UCNV_FROM_U_CALLBACK_STOP \n");
{
static const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801, 0xffff, 0x0061,};
static const uint8_t expectedUTF8[]= { 0xe2, 0x82, 0xac,
0xf0, 0x90, 0x90, 0x81,
0xed, 0xb0, 0x81, 0xed, 0xa0, 0x81,
0xef, 0xbf, 0xbf, 0x61,
};
static const int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 };
if(!testConvertFromUnicode(testinput, sizeof(testinput)/sizeof(testinput[0]),
expectedUTF8, sizeof(expectedUTF8), "utf8",
UCNV_FROM_U_CALLBACK_STOP, offsets, NULL, 0 ))
log_err("u-> utf8 with stop did not match.\n");
}
log_verbose("Testing fromUnicode for SCSU with UCNV_FROM_U_CALLBACK_STOP \n");
{
@ -1364,6 +1349,23 @@ static void TestSub(int32_t inputsize, int32_t outputsize)
log_err("u-> SCSU with substitute did not match.\n");
}
log_verbose("Testing fromUnicode for UTF-8 with UCNV_FROM_U_CALLBACK_SUBSTITUTE\n");
{
static const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801, 0xffff, 0x0061,};
static const uint8_t expectedUTF8[]= { 0xe2, 0x82, 0xac,
0xf0, 0x90, 0x90, 0x81,
0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd,
0xef, 0xbf, 0xbf, 0x61,
};
static const int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 };
if(!testConvertFromUnicode(testinput, sizeof(testinput)/sizeof(testinput[0]),
expectedUTF8, sizeof(expectedUTF8), "utf8",
UCNV_FROM_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0 )) {
log_err("u-> utf8 with stop did not match.\n");
}
}
log_verbose("Testing fromUnicode for UTF-16 with UCNV_FROM_U_CALLBACK_SUBSTITUTE\n");
{
static const UChar in[]={ 0x0041, 0xfeff };

View file

@ -730,12 +730,12 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize )
log_verbose("Test surrogate behaviour for UTF8\n");
{
const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801};
const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01 };
const uint8_t expectedUTF8test2[]= { 0xe2, 0x82, 0xac,
0xf0, 0x90, 0x90, 0x81,
0xed, 0xb0, 0x81, 0xed, 0xa0, 0x81
0xef, 0xbf, 0xbd
};
int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4 };
int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3 };
if(!testConvertFromU(testinput, sizeof(testinput)/sizeof(testinput[0]),
expectedUTF8test2, sizeof(expectedUTF8test2), "UTF8", offsets,FALSE ))
log_err("u-> UTF8 did not match.\n");

View file

@ -423,7 +423,7 @@ static void TestAppendChar(){
0, 0x10401,
2, 0x0028,
3, 0x7f,
3, 0xd801,
3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */
1, 0x20402,
9, 0x10401,
5, 0xc0,
@ -490,7 +490,7 @@ static void TestAppendChar(){
{0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
{0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
{0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
{0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00},
{0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00},
{0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00},
{0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc0, 0x9f*/