diff --git a/icu4c/source/common/ucnv_u8.c b/icu4c/source/common/ucnv_u8.c index cbb8ddd75a5..882f21f0a06 100644 --- a/icu4c/source/common/ucnv_u8.c +++ b/icu4c/source/common/ucnv_u8.c @@ -222,7 +222,9 @@ morebytes: * - encode a code point <= U+10ffff * - use the fewest possible number of bytes for their code points * - use at most 4 bytes (for i>=4 it is 0x10ffff= utf8_minChar32[i] && !UTF_IS_SURROGATE(ch)) { @@ -254,12 +256,10 @@ morebytes: } else { - UConverterCallbackReason reason = - i == inBytes && i == 3 && UTF_IS_SURROGATE(ch) ? UCNV_IRREGULAR : UCNV_ILLEGAL; args->source = (const char *) mySource; args->target = myTarget; args->converter->invalidCharLength = (int8_t)i; - if (T_UConverter_toUnicode_InvalidChar_Callback(args, reason, err)) + if (T_UConverter_toUnicode_InvalidChar_Callback(args, UCNV_ILLEGAL, err)) { /* Stop if the error wasn't handled */ break; @@ -383,7 +383,9 @@ morebytes: * - encode a code point <= U+10ffff * - use the fewest possible number of bytes for their code points * - use at most 4 bytes (for i>=4 it is 0x10ffff= utf8_minChar32[i] && !UTF_IS_SURROGATE(ch)) { @@ -417,8 +419,6 @@ morebytes: } else { - UConverterCallbackReason reason = - i == inBytes && i == 3 && UTF_IS_SURROGATE(ch) ? UCNV_IRREGULAR : UCNV_ILLEGAL; UBool useOffset; args->source = (const char *) mySource; @@ -426,7 +426,7 @@ morebytes: args->offsets = myOffsets; args->converter->invalidCharLength = (int8_t)i; if (T_UConverter_toUnicode_InvalidChar_OffsetCallback(args, - offsetNum, reason, err)) + offsetNum, UCNV_ILLEGAL, err)) { /* Stop if the error wasn't handled */ break; @@ -481,6 +481,7 @@ donefornow: U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, UErrorCode * err) { + UConverter *cnv = args->converter; const UChar *mySource = args->source; unsigned char *myTarget = (unsigned char *) args->target; const UChar *sourceLimit = args->sourceLimit; @@ -489,11 +490,11 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, int16_t indexToWrite; char temp[4]; - if (args->converter->fromUnicodeStatus && myTarget < targetLimit) + if (cnv->fromUSurrogateLead && myTarget < targetLimit) { - ch = args->converter->fromUnicodeStatus; - args->converter->fromUnicodeStatus = 0; - goto lowsurogate; + ch = cnv->fromUSurrogateLead; + cnv->fromUSurrogateLead = 0; + goto lowsurrogate; } while (mySource < sourceLimit && myTarget < targetLimit) @@ -513,31 +514,86 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, } else { - args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80); - args->converter->charErrorBufferLength = 1; + cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80); + cnv->charErrorBufferLength = 1; *err = U_BUFFER_OVERFLOW_ERROR; } } else - /* Check for surogates */ + /* Check for surrogates */ { - if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END)) - { -lowsurogate: - if (mySource < sourceLimit) - { - ch2 = *mySource; - if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END)) - { - /* If there were two surrogates, combine them otherwise treat them normally */ - ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; - mySource++; + if(UTF_IS_SURROGATE(ch) /* && not CESU-8 */) { + if(UTF_IS_SURROGATE_FIRST(ch)) { +lowsurrogate: + if (mySource < sourceLimit) { + /* test the following code unit */ + UChar trail=*mySource; + if(UTF_IS_SECOND_SURROGATE(trail)) { + ++mySource; + ch=UTF16_GET_PAIR_VALUE(ch, trail); + ch2 = 0; + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + ch2 = ch; + } + } else { + /* no more input */ + cnv->fromUSurrogateLead = (UChar)ch; + break; } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + ch2 = ch; } - else if (!args->flush) - { - args->converter->fromUnicodeStatus = ch; - break; + + if(ch2 != 0) { + /* call the callback function with all the preparations and post-processing */ + *err = U_ILLEGAL_CHAR_FOUND; + + /* update the arguments structure */ + args->source=mySource; + args->target=(char *)myTarget; + + /* write the code point as code units */ + cnv->invalidUCharBuffer[0] = (UChar)ch2; + cnv->invalidUCharLength = 1; + + /* call the callback function */ + cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err); + + /* get the converter state from UConverter */ + ch = cnv->fromUSurrogateLead; + cnv->fromUSurrogateLead = 0; + + myTarget=(uint8_t *)args->target; + mySource=args->source; + + /* + * If the callback overflowed the target, then we need to + * stop here with an overflow indication. + */ + if(*err==U_BUFFER_OVERFLOW_ERROR) { + break; + } else if(U_FAILURE(*err)) { + /* break on error */ + break; + } else if(cnv->charErrorBufferLength>0) { + /* target is full */ + *err=U_BUFFER_OVERFLOW_ERROR; + break; + /* + * } else if(ch != 0) { ... + * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c) + * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 . + * We would have to check myTargetconverter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; + cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite]; *err = U_BUFFER_OVERFLOW_ERROR; } } @@ -574,6 +630,11 @@ lowsurogate: { *err = U_BUFFER_OVERFLOW_ERROR; } + if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) { + /* a Unicode code point remains incomplete (only a first surrogate) */ + *err = U_TRUNCATED_CHAR_FOUND; + cnv->fromUSurrogateLead = 0; + } args->target = (char *) myTarget; args->source = mySource; @@ -582,21 +643,22 @@ lowsurogate: U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, UErrorCode * err) { + UConverter *cnv = args->converter; const UChar *mySource = args->source; unsigned char *myTarget = (unsigned char *) args->target; int32_t *myOffsets = args->offsets; const UChar *sourceLimit = args->sourceLimit; const unsigned char *targetLimit = (unsigned char *) args->targetLimit; uint32_t ch, ch2; - int32_t offsetNum = 0; + int32_t offsetNum = 0, nextSourceIndex; int16_t indexToWrite; char temp[4]; - if (args->converter->fromUnicodeStatus && myTarget < targetLimit) + if (cnv->fromUSurrogateLead && myTarget < targetLimit) { - ch = args->converter->fromUnicodeStatus; - args->converter->fromUnicodeStatus = 0; - goto lowsurogate; + ch = cnv->fromUSurrogateLead; + cnv->fromUSurrogateLead = 0; + goto lowsurrogate; } while (mySource < sourceLimit && myTarget < targetLimit) @@ -619,31 +681,95 @@ U_CFUNC void T_UConverter_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeA } else { - args->converter->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80); - args->converter->charErrorBufferLength = 1; + cnv->charErrorBuffer[0] = (char) ((ch & 0x3f) | 0x80); + cnv->charErrorBufferLength = 1; *err = U_BUFFER_OVERFLOW_ERROR; } } else - /* Check for surogates */ + /* Check for surrogates */ { - if ((ch >= SURROGATE_HIGH_START) && (ch <= SURROGATE_HIGH_END)) - { -lowsurogate: - if (mySource < sourceLimit) - { - ch2 = *mySource; - if ((ch2 >= SURROGATE_LOW_START) && (ch2 <= SURROGATE_LOW_END)) - { - /* If there were two surrogates, combine them otherwise treat them normally */ - ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; - mySource++; + nextSourceIndex = offsetNum + 1; + + if(UTF_IS_SURROGATE(ch) /* && not CESU-8 */) { + if(UTF_IS_SURROGATE_FIRST(ch)) { +lowsurrogate: + if (mySource < sourceLimit) { + /* test the following code unit */ + UChar trail=*mySource; + if(UTF_IS_SECOND_SURROGATE(trail)) { + ++mySource; + ++nextSourceIndex; + ch=UTF16_GET_PAIR_VALUE(ch, trail); + ch2 = 0; + /* convert this supplementary code point */ + /* exit this condition tree */ + } else { + /* this is an unmatched lead code unit (1st surrogate) */ + /* callback(illegal) */ + ch2 = ch; + } + } else { + /* no more input */ + cnv->fromUSurrogateLead = (UChar)ch; + break; } + } else { + /* this is an unmatched trail code unit (2nd surrogate) */ + /* callback(illegal) */ + ch2 = ch; } - else if (!args->flush) - { - args->converter->fromUnicodeStatus = ch; - break; + + if(ch2 != 0) { + /* call the callback function with all the preparations and post-processing */ + *err = U_ILLEGAL_CHAR_FOUND; + + /* update the arguments structure */ + args->source=mySource; + args->target=(char *)myTarget; + args->offsets=myOffsets; + + /* write the code point as code units */ + cnv->invalidUCharBuffer[0] = (UChar)ch2; + cnv->invalidUCharLength = 1; + + /* call the callback function */ + cnv->fromUCharErrorBehaviour(cnv->fromUContext, args, cnv->invalidUCharBuffer, 1, ch2, UCNV_ILLEGAL, err); + + /* get the converter state from UConverter */ + ch = cnv->fromUSurrogateLead; + cnv->fromUSurrogateLead = 0; + + /* update target and deal with offsets if necessary */ + myOffsets=ucnv_updateCallbackOffsets(myOffsets, ((uint8_t *)args->target)-myTarget, offsetNum); + myTarget=(uint8_t *)args->target; + + /* update the source pointer and index */ + offsetNum=nextSourceIndex+(args->source-mySource); + mySource=args->source; + + /* + * If the callback overflowed the target, then we need to + * stop here with an overflow indication. + */ + if(*err==U_BUFFER_OVERFLOW_ERROR) { + break; + } else if(U_FAILURE(*err)) { + /* break on error */ + break; + } else if(cnv->charErrorBufferLength>0) { + /* target is full */ + *err=U_BUFFER_OVERFLOW_ERROR; + break; + /* + * } else if(ch != 0) { ... + * ### TODO 2002jul01 markus: It looks like this code (from ucnvmbcs.c) + * does not handle the case where the callback leaves ch=fromUSurrogateLead!=0 . + * We would have to check myTargetconverter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; + cnv->charErrorBuffer[cnv->charErrorBufferLength++] = temp[indexToWrite]; *err = U_BUFFER_OVERFLOW_ERROR; } } - offsetNum += (ch >= 0x10000) + 1; + offsetNum = nextSourceIndex; } } @@ -682,6 +808,11 @@ lowsurogate: { *err = U_BUFFER_OVERFLOW_ERROR; } + if(args->flush && mySource >= sourceLimit && cnv->fromUSurrogateLead != 0 && U_SUCCESS(*err)) { + /* a Unicode code point remains incomplete (only a first surrogate) */ + *err = U_TRUNCATED_CHAR_FOUND; + cnv->fromUSurrogateLead = 0; + } args->target = (char *) myTarget; args->source = mySource; @@ -693,7 +824,6 @@ U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args, UChar buffer[2]; char const *sourceInitial; UChar* myUCharPtr; - UConverterCallbackReason reason; uint16_t extraBytesToWrite; uint8_t myByte; UChar32 ch; @@ -777,7 +907,9 @@ U_CFUNC UChar32 T_UConverter_getNextUChar_UTF8(UConverterToUnicodeArgs *args, * - encode a code point <= U+10ffff * - use the fewest possible number of bytes for their code points * - use at most 4 bytes (for i>=4 it is 0x10ffff= utf8_minChar32[extraBytesToWrite] && !UTF_IS_SURROGATE(ch)) { return ch; /* return the code point */ @@ -789,20 +921,14 @@ CALL_ERROR_FUNCTION: uprv_memcpy(args->converter->invalidCharBuffer, sourceInitial, extraBytesToWrite); myUCharPtr = buffer; - if (isLegalSequence && extraBytesToWrite == 3 && UTF_IS_SURROGATE(ch)) { - reason = UCNV_IRREGULAR; - *err = U_INVALID_CHAR_FOUND; - } else { - reason = UCNV_ILLEGAL; - *err = U_ILLEGAL_CHAR_FOUND; - } + *err = U_ILLEGAL_CHAR_FOUND; args->target = myUCharPtr; args->targetLimit = buffer + 2; args->converter->fromCharErrorBehaviour(args->converter->toUContext, args, sourceInitial, extraBytesToWrite, - reason, + UCNV_ILLEGAL, err); if(U_SUCCESS(*err)) { diff --git a/icu4c/source/common/utf_impl.c b/icu4c/source/common/utf_impl.c index c01544af6bb..2f6caa5e3d4 100644 --- a/icu4c/source/common/utf_impl.c +++ b/icu4c/source/common/utf_impl.c @@ -1,7 +1,7 @@ /* ****************************************************************************** * -* Copyright (C) 1999-2001, International Business Machines +* Copyright (C) 1999-2002, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** @@ -127,11 +127,13 @@ utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, * that needs count bytes so that UTF8_GET_CHAR_SAFE() works right. * * Starting with Unicode 3.0.1, non-shortest forms are illegal. + * Starting with Unicode 3.2, surrogate code points must not be + * encoded in UTF-8, and there are no irregular sequences any more. */ /* correct sequence - all trail bytes have (b7..b6)==(10)? */ /* illegal is also set if count>=4 */ - if(illegal || (c)>12)|0xe0); (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); @@ -225,8 +228,8 @@ utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, U *pi=i; UTF8_MASK_LEAD_BYTE(b, count); c|=(UChar32)b<=4 || c>0x10ffff || c=4 || c>0x10ffff || c=4) { count=3; } diff --git a/icu4c/source/test/cintltst/nccbtst.c b/icu4c/source/test/cintltst/nccbtst.c index f808379175a..5b7fa2f6e2b 100644 --- a/icu4c/source/test/cintltst/nccbtst.c +++ b/icu4c/source/test/cintltst/nccbtst.c @@ -1102,21 +1102,6 @@ static void TestStop(int32_t inputsize, int32_t outputsize) log_err("u-> iscii with stop did not match.\n"); - } - log_verbose("Testing fromUnicode for UTF-8 with UCNV_FROM_U_CALLBACK_STOP \n"); - { - static const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801, 0xffff, 0x0061,}; - static const uint8_t expectedUTF8[]= { 0xe2, 0x82, 0xac, - 0xf0, 0x90, 0x90, 0x81, - 0xed, 0xb0, 0x81, 0xed, 0xa0, 0x81, - 0xef, 0xbf, 0xbf, 0x61, - - }; - static const int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 }; - if(!testConvertFromUnicode(testinput, sizeof(testinput)/sizeof(testinput[0]), - expectedUTF8, sizeof(expectedUTF8), "utf8", - UCNV_FROM_U_CALLBACK_STOP, offsets, NULL, 0 )) - log_err("u-> utf8 with stop did not match.\n"); } log_verbose("Testing fromUnicode for SCSU with UCNV_FROM_U_CALLBACK_STOP \n"); { @@ -1364,6 +1349,23 @@ static void TestSub(int32_t inputsize, int32_t outputsize) log_err("u-> SCSU with substitute did not match.\n"); } + log_verbose("Testing fromUnicode for UTF-8 with UCNV_FROM_U_CALLBACK_SUBSTITUTE\n"); + { + static const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801, 0xffff, 0x0061,}; + static const uint8_t expectedUTF8[]= { 0xe2, 0x82, 0xac, + 0xf0, 0x90, 0x90, 0x81, + 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, + 0xef, 0xbf, 0xbf, 0x61, + + }; + static const int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6 }; + if(!testConvertFromUnicode(testinput, sizeof(testinput)/sizeof(testinput[0]), + expectedUTF8, sizeof(expectedUTF8), "utf8", + UCNV_FROM_U_CALLBACK_SUBSTITUTE, offsets, NULL, 0 )) { + log_err("u-> utf8 with stop did not match.\n"); + } + } + log_verbose("Testing fromUnicode for UTF-16 with UCNV_FROM_U_CALLBACK_SUBSTITUTE\n"); { static const UChar in[]={ 0x0041, 0xfeff }; diff --git a/icu4c/source/test/cintltst/nucnvtst.c b/icu4c/source/test/cintltst/nucnvtst.c index 339e9764d0d..bcea6f53969 100644 --- a/icu4c/source/test/cintltst/nucnvtst.c +++ b/icu4c/source/test/cintltst/nucnvtst.c @@ -730,12 +730,12 @@ static void TestNewConvertWithBufferSizes(int32_t outsize, int32_t insize ) log_verbose("Test surrogate behaviour for UTF8\n"); { - const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01, 0xd801}; + const UChar testinput[]={ 0x20ac, 0xd801, 0xdc01, 0xdc01 }; const uint8_t expectedUTF8test2[]= { 0xe2, 0x82, 0xac, 0xf0, 0x90, 0x90, 0x81, - 0xed, 0xb0, 0x81, 0xed, 0xa0, 0x81 + 0xef, 0xbf, 0xbd }; - int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3, 4, 4, 4 }; + int32_t offsets[]={ 0, 0, 0, 1, 1, 1, 1, 3, 3, 3 }; if(!testConvertFromU(testinput, sizeof(testinput)/sizeof(testinput[0]), expectedUTF8test2, sizeof(expectedUTF8test2), "UTF8", offsets,FALSE )) log_err("u-> UTF8 did not match.\n"); diff --git a/icu4c/source/test/cintltst/utf8tst.c b/icu4c/source/test/cintltst/utf8tst.c index 2a3fb6a7c8d..b897802696f 100644 --- a/icu4c/source/test/cintltst/utf8tst.c +++ b/icu4c/source/test/cintltst/utf8tst.c @@ -423,7 +423,7 @@ static void TestAppendChar(){ 0, 0x10401, 2, 0x0028, 3, 0x7f, - 3, 0xd801, + 3, 0xd801, /* illegal for UTF-8 starting with Unicode 3.2 */ 1, 0x20402, 9, 0x10401, 5, 0xc0, @@ -490,7 +490,7 @@ static void TestAppendChar(){ {0xF0, 0x90, 0x90, 0x81, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, {0x61, 0x62, 0x28, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, {0x61, 0x62, 0x63, 0x7f, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, - {0x61, 0x62, 0x63, 0xed, 0xa0, 0x81, 0x67, 0x68, 0x69, 0x6a, 0x00}, + {0x61, 0x62, 0x63, 0xef, 0xbf, 0xbf, 0x67, 0x68, 0x69, 0x6a, 0x00}, {0x61, 0xF0, 0xa0, 0x90, 0x82, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x00}, {0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xc2, 0x9f}, /*gets UTF8_ERROR_VALUE_2 which takes 2 bytes 0xc0, 0x9f*/